UDP
UDP和IP的区别是多了端口的概念。
看一下端口绑定的过程:
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); unsigned short snum; int chk_addr_ret; int err; // 如果套接字有自己定义的bind函数则调用,只有raw类型的才有自定义的bind函数。 if (sk->sk_prot->bind) { err = sk->sk_prot->bind(sk, uaddr, addr_len); goto out; } err = -EINVAL; if (addr_len < sizeof(struct sockaddr_in)) goto out; chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); err = -EADDRNOTAVAIL; if (!sysctl_ip_nonlocal_bind && !inet->freebind && addr->sin_addr.s_addr != INADDR_ANY && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; snum = ntohs(addr->sin_port); err = -EACCES; if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) goto out; lock_sock(sk); err = -EINVAL; if (sk->sk_state != TCP_CLOSE || inet->num) goto out_release_sock; inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->saddr = 0; /* Use device */ //调用具体协议的get_port方法,获取一个端口。 if (sk->sk_prot->get_port(sk, snum)) { inet->saddr = inet->rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; } if (inet->rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->sport = htons(inet->num); inet->daddr = 0; inet->dport = 0; sk_dst_reset(sk); err = 0; out_release_sock: release_sock(sk); out: return err; }
调用具体协议的get_port方法,获取一个端口:sk->sk_prot->get_port(sk, snum)
sk_prot是inetsw_array中的udp_prot,所以get_port是udp_v4_get_port。
注意:从用户态往内核态的协议选择走inetsw_array
static int udp_v4_get_port(struct sock *sk, unsigned short snum) { struct hlist_node *node; struct sock *sk2; struct inet_sock *inet = inet_sk(sk); write_lock_bh(&udp_hash_lock); if (snum == 0) { int best_size_so_far, best, result, i; // 端口号的范围在sysctl_local_port_range // 每次调用都从udp_port_rover开始查找可用的端口 if (udp_port_rover > sysctl_local_port_range[1] || udp_port_rover < sysctl_local_port_range[0]) udp_port_rover = sysctl_local_port_range[0]; best_size_so_far = 32767; best = result = udp_port_rover; // 遍历udp_hash表 // 在128个桶中寻找桶最小的,然后每隔128次试探一次。 for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { struct hlist_head *list; int size; list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; if (hlist_empty(list)) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); goto gotit; } size = 0; // 计算当前桶的大小,是否是最小的桶,如果是更新best_size_so_far sk_for_each(sk2, node, list) if (++size >= best_size_so_far) goto next; best_size_so_far = size; best = result; next:; } result = best; // 最小的桶每隔128试探一次。 for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); if (!udp_lport_inuse(result)) break; } if (i >= (1 << 16) / UDP_HTABLE_SIZE) goto fail; gotit: udp_port_rover = snum = result; } else { sk_for_each(sk2, node, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { struct inet_sock *inet2 = inet_sk(sk2); if (inet2->num == snum && sk2 != sk && !ipv6_only_sock(sk2) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!inet2->rcv_saddr || !inet->rcv_saddr || inet2->rcv_saddr == inet->rcv_saddr) && (!sk2->sk_reuse || !sk->sk_reuse)) goto fail; } } inet->num = snum; if (sk_unhashed(sk)) { struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, h); sock_prot_inc_use(sk->sk_prot); } write_unlock_bh(&udp_hash_lock); return 0; fail: write_unlock_bh(&udp_hash_lock); return 1; }
下面看一下UDP在内核中收数据过程。软中断最终调用到ip_local_deliver_finish,根据IP报文头部的协议字段skb->nh.iph->protocol在inet_protos找到对应的协议处理函数icmp, udp, tcp。
对于UDP来说
static struct net_protocol udp_protocol = {
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
};
int udp_rcv(struct sk_buff *skb) { struct sock *sk; struct udphdr *uh; unsigned short ulen; struct rtable *rt = (struct rtable*)skb->dst; // 从skb获取源地址目的地址 u32 saddr = skb->nh.iph->saddr; u32 daddr = skb->nh.iph->daddr; int len = skb->len; uh = skb->h.uh; ulen = ntohs(uh->len); if (ulen > len || ulen < sizeof(*uh)) goto short_packet; if (pskb_trim(skb, ulen)) goto short_packet; // 检查校验和 if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) goto csum_error; if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); // 根据skb中的源地址和端口,目的地址和端口,反向查找到这个数据应该发送到哪个sock上 sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); if (sk != NULL) { // 把skb添加到sk->sk_receive_queue尾部 // 并唤醒等待在sleep上的进程 int ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); if (ret > 0) return -ret; return 0; } if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; /* No socket. Drop packet silently, if checksum is wrong */ if (udp_checksum_complete(skb)) goto csum_error; UDP_INC_STATS_BH(UDP_MIB_NOPORTS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); // 忽略错误处理 ... ... ... }
下面看一下UDP如何把数据返回给用户。
static int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; int copied, err; if (addr_len) *addr_len=sizeof(*sin); if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len); try_again: // 接收一个skb,如果是blocking的socket,而且此时没有数据包会进入wait_for_packet skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; // 下面开始向用户态拷贝数据 copied = skb->len - sizeof(struct udphdr); if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } if (skb->ip_summed==CHECKSUM_UNNECESSARY) { err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__udp_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); if (err == -EINVAL) goto csum_copy_err; } if (err) goto out_free; sock_recv_timestamp(msg, sk, skb); if (sin) { sin->sin_family = AF_INET; sin->sin_port = skb->h.uh->source; sin->sin_addr.s_addr = skb->nh.iph->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); } if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); err = copied; if (flags & MSG_TRUNC) err = skb->len - sizeof(struct udphdr); // 省略错误处理 ... ... ... }