Linux网络解读(3) - 数据包的发送之网络层

send调用

所有和socket相关的调用都是通过sys_socketcall转发

/*
 * Multiplexer for the socket family of calls: switches on `call` and
 * forwards to the matching sys_* handler, storing its result in err.
 *
 * NOTE(review): abridged excerpt. The declarations of a, a0, a1 and err,
 * the code that fills them from `args`, the remaining cases and the
 * return statement are not shown here.
 */
asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
        switch(call) 
        {
                /* socket(): three arguments */
                case SYS_SOCKET:
                        err = sys_socket(a0,a1,a[2]);
                        break;
                /* send(): a1 is passed on as a user-space pointer */
                case SYS_SEND:
                        err = sys_send(a0, (void __user *)a1, a[2], a[3]);
                        break;
                /* sendto(): like send() plus a user-space sockaddr (a[4]) and a[5] */
                case SYS_SENDTO:
                        err = sys_sendto(a0,(void __user *)a1, a[2], a[3],
                                         (struct sockaddr __user *)a[4], a[5]);
                        break;
                /* further cases elided in this excerpt */
                ...
                ...
        }
}

这些发送类调用最终都会调用sock_sendmsg

/*
 * Synchronous send on a socket.
 *
 * Sets up an on-stack kiocb whose private pointer refers to an on-stack
 * sock_iocb, then hands the request to __sock_sendmsg(). If that reports
 * -EIOCBQUEUED, the final result is obtained from wait_on_sync_kiocb().
 *
 * Returns whatever __sock_sendmsg() (or wait_on_sync_kiocb()) returns.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
        struct kiocb sync_iocb;
        struct sock_iocb sock_state;
        int rc;

        init_sync_kiocb(&sync_iocb, NULL);
        sync_iocb.private = &sock_state;

        rc = __sock_sendmsg(&sync_iocb, sock, msg, size);
        if (rc != -EIOCBQUEUED)
                return rc;

        /* Request was queued rather than completed: wait for its result. */
        return wait_on_sync_kiocb(&sync_iocb);
}
/*
 * Record the request parameters in the sock_iocb attached to iocb, run the
 * security hook, and (if it allows the send) dispatch to the socket's
 * protocol-family sendmsg operation.
 *
 * Returns the non-zero result of security_socket_sendmsg() if there is one,
 * otherwise the result of sock->ops->sendmsg().
 */
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
                                 struct msghdr *msg, size_t size)
{
        struct sock_iocb *siocb = kiocb_to_siocb(iocb);
        int rc;

        siocb->sock = sock;
        siocb->scm  = NULL;
        siocb->msg  = msg;
        siocb->size = size;

        rc = security_socket_sendmsg(sock, msg, size);
        return rc ? rc : sock->ops->sendmsg(iocb, sock, msg, size);
}

会调用具体的socket->ops->sendmsg方法

而创建socket的时候,在inet_create方法中根据AF_INET和SOCK_RAW参数找到的ops是inet_sockraw_ops

/*
 * Socket-layer operation table for PF_INET raw sockets.
 * socket->ops points here for such sockets, so sock->ops->sendmsg in
 * __sock_sendmsg() resolves to inet_sendmsg.
 * The sock_no_* entries are presumably "operation not supported" stubs
 * (socketpair, accept, listen, mmap) -- confirm against their definitions.
 */
static struct proto_ops inet_sockraw_ops = {
        .family =        PF_INET,
        .owner =        THIS_MODULE,
        .release =        inet_release,
        .bind =                inet_bind,
        .connect =        inet_dgram_connect,
        .socketpair =        sock_no_socketpair,
        .accept =        sock_no_accept,
        .getname =        inet_getname,
        .poll =                datagram_poll,
        .ioctl =        inet_ioctl,
        .listen =        sock_no_listen,
        .shutdown =        inet_shutdown,
        .setsockopt =        sock_common_setsockopt,
        .getsockopt =        sock_common_getsockopt,
        /* send path entry used by __sock_sendmsg() */
        .sendmsg =        inet_sendmsg,
        .recvmsg =        sock_common_recvmsg,
        .mmap =                sock_no_mmap,
        .sendpage =        inet_sendpage,
};

所以,此处socket->ops->sendmsg是

.sendmsg = inet_sendmsg,

中的inet_sendmsg方法。

从sys_socketcall()到socket->ops->sendmsg是socket层负责完成的事情,接下来具体的协议来决定如何发送数据。

接着看socket->ops->sendmsg的细节。

/*
 * PF_INET sendmsg: make sure the socket has a local port number, then pass
 * the request to the transport protocol's own sendmsg (sk->sk_prot).
 *
 * Returns -EAGAIN when the socket has no port (inet_sk(sk)->num == 0) and
 * inet_autobind() reports failure; otherwise the protocol sendmsg result.
 */
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size)
{
    struct sock *sk = sock->sk;

    /* Only attempt the automatic bind when no port has been assigned yet. */
    if (!inet_sk(sk)->num) {
        if (inet_autobind(sk))
            return -EAGAIN;
    }

    return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}

可以看出socket层的发送函数inet_sendmsg负责完成端口的绑定之后,然后就调用具体协议的发送函数sk->sk_prot->sendmsg了,这里的sk_prot是raw_prot。

对于raw套接字,数据包在transport层仅仅需要做以上这些处理,然后就进入IP层。IP层主要的工作是决定这个数据包该发向何处。

IP层的路由系统

在发送每个报文时,都必须要查询发送接口。这个过程分为3个步骤:

1) 查询路由cache;

2) 查询FIB表;

3) 将最终结果填入路由cache.

路由cache

目的地址的cache表项和路由cache的表项是等价的。
通用的目的地址cache系统如下:
/*
 * One bucket of the route cache hash table: the head of a singly linked
 * chain of rtable entries (entries are linked through u.rt_next).
 */
struct rt_hash_bucket {
    struct rtable*chain;
};

这是开链的hash表。

rtable的结构体如下

/*
 * One entry of the IPv4 route cache.
 * The first member is a union, so the start of an rtable is accessed either
 * as the generic destination cache entry (u.dst) or as the link to the next
 * rtable in the same hash chain (u.rt_next).
 */
struct rtable
{
        union
        {
                struct dst_entry        dst; // destination cache entry
                struct rtable                *rt_next; // next route cache entry in the hash chain
        } u;
        struct in_device        *idev;
        unsigned                rt_flags;
        unsigned                rt_type;
        __u32                        rt_dst; // copied into iph->daddr by ip_push_pending_frames()
        __u32                        rt_src; // copied into iph->saddr by ip_push_pending_frames()
        int                        rt_iif;
        __u32                        rt_gateway;
        // key information compared against the request during a route lookup
        struct flowi                fl;
        /* Miscellaneous cached information */
        __u32                        rt_spec_dst; /* RFC1122 specific destination */
        struct inet_peer        *peer; /* long-living peer info */
};
注意:rtable的第一个元素u是一个联合体(union),因此rtable的起始部分既可以看作目的地址cache表项(dst_entry),也可以看作指向下一个路由表项的指针(rt_next),如图所示。

Linux网络解读(3) - 数据包的发送之网络层

注意:在hash表中匹配路由时,key的计算信息都在flowi结构体中。

/*
 * Flow key: identifies one traffic flow and carries the values compared
 * during a route cache lookup.
 * NOTE(review): the excerpt is truncated -- the members after nl_u (for
 * example the `proto` member initialised in raw_sendmsg) and the closing
 * brace of the struct are not shown.
 */
struct flowi {
        int        oif;        /* output interface index */
        int        iif;        /* input interface index */
        union {
                /* IPv4 key */
                struct {
                        __u32                        daddr;
                        __u32                        saddr;
                        __u32                        fwmark;        /* firewall mark */
                        __u8                        tos;        /* type of service */
                        __u8                        scope;        /* distance to the destination, used to classify routes */
                } ip4_u;
                /* IPv6 key */
                struct {
                        struct in6_addr                daddr;
                        struct in6_addr                saddr;
                        __u32                        flowlabel;
                } ip6_u;
                /* 16-bit-address key (presumably DECnet, judging by the dn_ prefix) */
                struct {
                        __u16                        daddr;
                        __u16                        saddr;
                        __u32                        fwmark;
                        __u8                        scope;
                } dn_u;
        } nl_u;
这个结构体用来区分不同的业务流,flowi中的i意为identifier(标识符)。
oif和iif字段:确定output、input接口。iif是输入接口的索引值,它是从net_device结构里的ifindex获取的,这里的net_device是接收到报文的设备。
fwmark:防火墙mark,流量shaping。
tos:type of service。
scope:是到目的地址的距离,用来归类路由。
可以看出:路由的本质是对网络中不同业务流的标识,而flowi是内核中表示业务流的结构。

再来看dst_entry,这个是目的地址的cache表项。dst_entry的成员dst_ops,指向管理dst_entry函数,供arp协议调用。

对于IPRoute Cache来说,

/*
 * Per-protocol operation table referenced from a dst_entry.
 * It bundles identification (family, protocol), a garbage-collection
 * threshold and callback, callbacks invoked on cache entries, and
 * bookkeeping for the entries themselves (size, count, slab cache).
 * NOTE(review): the callback descriptions below are inferred from the member
 * names and signatures only -- confirm against the implementations.
 */
struct dst_ops
{
        unsigned short                family;
        unsigned short                protocol;
        unsigned                gc_thresh;
        int                        (*gc)(void);
        struct dst_entry *        (*check)(struct dst_entry *, __u32 cookie);
        /* the article names ipv4_dst_destroy as the IPv4 route cache value */
        void                        (*destroy)(struct dst_entry *);
        void                        (*ifdown)(struct dst_entry *,
                                          struct net_device *dev, int how);
        struct dst_entry *        (*negative_advice)(struct dst_entry *);
        void                        (*link_failure)(struct sk_buff *);
        void                        (*update_pmtu)(struct dst_entry *dst, u32 mtu);
        int                        (*get_mss)(struct dst_entry *dst, u32 mtu);
        int                        entry_size;
        atomic_t                entries;
        kmem_cache_t                 *kmem_cachep;
};
family:AF_INET
protocol:0x0800(即ETH_P_IP)
destroy:ipv4_dst_destroy

在下面的raw_sendmsg中,会根据要发送的报文,通过ip_route_output_flow函数查找目的地址的cache表项;ip_route_output_flow内部会调用__ip_route_output_key:

/*
 * Look up an output route for the flow described by flp.
 *
 * The route cache (rt_hash_table) is searched first; on a hit the entry is
 * stored in *rp and 0 is returned. On a miss the result of
 * ip_route_output_slow() is returned.
 *
 * @rp:  out parameter receiving the matching rtable
 * @flp: flow key of the packet to be sent
 */
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
        unsigned hash;
        struct rtable *rth;
        // compute the hash value from the flow key (dst, src mixed with oif, tos)
        hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
        rcu_read_lock_bh();
        // walk the chain of the selected bucket in the global rt_hash_table
        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
                rth = rcu_dereference(rth->u.rt_next)) {
                // match on dst, src, oif and the masked tos bits; only entries
                // with iif == 0 are considered
                if (rth->fl.fl4_dst == flp->fl4_dst &&
                    rth->fl.fl4_src == flp->fl4_src &&
                    rth->fl.iif == 0 &&
                    rth->fl.oif == flp->oif &&
                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
                            (IPTOS_RT_MASK | RTO_ONLINK))) {
                            
                        // cache hit: refresh the usage data, take a reference
                        // and return this entry
                        rth->u.dst.lastuse = jiffies;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.__use++;
                        RT_CACHE_STAT_INC(out_hit);
                        rcu_read_unlock_bh();
                        *rp = rth;
                        return 0;
                }
                RT_CACHE_STAT_INC(out_hlist_search);
        }
        rcu_read_unlock_bh();
        // cache miss: fall back to the full route resolution
        return ip_route_output_slow(rp, flp);
}

发送细节:raw_sendmsg

/*
 * sendmsg implementation for raw sockets: builds a flow key, resolves the
 * route, then either sends a caller-built packet (inet->hdrincl set) or
 * queues the payload and pushes it out through the IP layer.
 *
 * NOTE(review): abridged excerpt. The declarations of ipc, rt, err, daddr,
 * saddr and tos, all error handling and the return statement are not shown.
 * As written here, the result of ip_append_data() is overwritten by the
 * result of ip_push_pending_frames().
 */
static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                       size_t len)
{
        // parameter and address validation omitted in this excerpt
        struct inet_sock *inet = inet_sk(sk);
        {
                struct flowi fl = { .oif = ipc.oif,
                                    .nl_u = { .ip4_u =
                                              { .daddr = daddr, // destination address
                                                .saddr = saddr, // source address
                                                .tos = tos } },
                                    // For a socket created with socket(AF_INET, SOCK_RAW, IPPROTO_ICMP),
                                    // inet->hdrincl is 0 (per the article, set during socket initialisation),
                                    // so proto is taken from sk->sk_protocol, i.e. IPPROTO_ICMP, whose value is 1.
                                    .proto = inet->hdrincl ? IPPROTO_RAW :
                                                                 sk->sk_protocol,
                                  };
                                  
                // hdrincl not set (the ICMP case): per the article, this copies
                // the ICMP type and code from the user message into the flow key
                if (!inet->hdrincl)
                        raw_probe_proto_opt(&fl, msg);
                        
                // the routing code resolves the route from the contents of fl
                // and stores the result in rt
                err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
        }
        // hdrincl set: the kernel does little extra processing and sends
        // the caller's data directly
        if (inet->hdrincl)
                err = raw_send_hdrinc(sk, msg->msg_iov, len, 
                                        rt, msg->msg_flags);
        
         else {
                // default the destination to the one chosen by the route
                if (!ipc.addr)
                        ipc.addr = rt->rt_dst;
                lock_sock(sk);
                // queue the payload; per the article, small ICMP writes are
                // coalesced at this point
                err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
                                        &ipc, rt, msg->msg_flags);
                                        
                // start transmitting from the IP layer
                err = ip_push_pending_frames(sk);
                release_sock(sk);
        }
}

IP层的发送

/*
 * Combine everything queued on sk->sk_write_queue into one packet, fill in
 * the IPv4 header and hand the packet to the NF_IP_LOCAL_OUT netfilter hook
 * with dst_output as the continuation.
 *
 * NOTE(review): abridged excerpt. `opt` stays NULL, `df` stays 0 and `ttl`
 * is never assigned in the code shown; the `out` label targeted by the goto
 * and the return statement are also missing.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        int df = 0;
        __u8 ttl;
        int err = 0;
        /* nothing queued: nothing to send */
        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);
        // move the data pointer to the IP (network) header
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        /*
         * Chain every remaining queued buffer onto the first one's frag_list
         * and fold its length and truesize into the first buffer; the socket
         * ownership of each chained buffer is dropped.
         */
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }
        // fill in the IP header
        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        /* IP options extend the header length (ihl counts 32-bit words) */
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
        if (!df) {
                __ip_select_ident(iph, &rt->u.dst, 0);
        } else {
                iph->id = htons(inet->id++);
        }
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);
        skb->priority = sk->sk_priority;
        // attach the routing result to the sk_buff's dst member
        skb->dst = dst_clone(&rt->u.dst);
        // netfilter: NF_IP_LOCAL_OUT, one of the five hook points, is
        // invoked here; dst_output is the function that then sends the
        // sk_buff
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
                      skb->dst->dev, dst_output);
}

dst_output会调用skb->dst->output,而这个函数指针在路由系统的mkroute_output中被设置为ip_output():

/*
 * Output entry point for a routed IPv4 packet.
 *
 * Counts the request, then sends the packet through ip_finish_output()
 * directly when its length does not exceed dst_pmtu(skb->dst) or when
 * tso_size is set on the buffer; otherwise it goes through ip_fragment(),
 * with ip_finish_output as the per-fragment output function.
 */
int ip_output(struct sk_buff *skb)
{
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        /* No fragmentation needed: fits the path MTU, or tso_size is set. */
        if (skb->len <= dst_pmtu(skb->dst) || skb_shinfo(skb)->tso_size)
                return ip_finish_output(skb);

        return ip_fragment(skb, ip_finish_output);
}

ip_output会判断skb->len>dst_pmtu(skb->dst)是否需要分片。然后,调用ip_finish_output发送。

/*
 * Bind the packet to the route's output device, mark it as an IP frame and
 * run the NF_IP_POST_ROUTING netfilter hook (one of the five hook points),
 * with ip_finish_output2 as the continuation.
 */
int ip_finish_output(struct sk_buff *skb)
{
        struct net_device *out_dev = skb->dst->dev;

        skb->protocol = htons(ETH_P_IP);
        skb->dev = out_dev;

        return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, out_dev,
                       ip_finish_output2);
}

进入ip_finish_output2:

这个函数的任务是构造2层报文的MAC地址。并发送这个2层报文。

/*
 * Final IP output step: make sure there is room for the link-layer header,
 * then transmit either through the cached hardware header (dst->hh) or
 * through the neighbour's output function.
 *
 * Returns the result of the chosen output function, -ENOMEM when the
 * headroom reallocation fails, or -EINVAL when the dst has neither a header
 * cache nor a neighbour. The skb is freed on both error paths.
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);
        /* not enough headroom for the link-layer header: reallocate */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;
                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                /* keep the new buffer accounted to the same socket */
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }
        // Per the article: dst->neighbour->output points to neigh_resolve_output(),
        // assigned in arp_constructor when the neighbour is initialised, and
        // hh->hh_output points to dev_queue_xmit().
        if (hh) {
                int hh_alen;
                /* copy the cached header in front of the payload under hh_lock */
                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                  memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);
        /* neither a header cache nor a neighbour: log (rate-limited) and drop */
        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

Linux网络解读(3) - 数据包的发送之网络层

从dst中取出hh(hard header cache),其中缓存了mac的信息(邻居的mac,自己的mac,协议号)。

在设置完dev->hard_header后,调用hh->hh_output()或dst->neighbour->output()发送2层报文。

如果没有找到hh,就说明这个目的IP对应的mac地址 不在本机的邻居系统里,需要发送arp查询报文。

下面看看如何发送arp查询报文。

先看dst->neighbour->output()在arp_constructor被设置成了neigh_resolve_output:

/*
 * Neighbour output function used when the link-layer address may still need
 * to be resolved.
 *
 * NOTE(review): abridged excerpt. The `discard` and `out_kfree_skb` labels
 * and the return statement are not shown.
 */
int neigh_resolve_output(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct neighbour *neigh;
        int rc = 0;
        /* a dst with an attached neighbour is required */
        if (!dst || !(neigh = dst->neighbour))
                goto discard;
        /* move skb->data to the network header */
        __skb_pull(skb, skb->nh.raw - skb->data);
        /* a zero result means the frame can be built and sent right now */
        if (!neigh_event_send(neigh, skb)) {
                int err;
                struct net_device *dev = neigh->dev;
                if (dev->hard_header_cache && !dst->hh) {
                        /*
                         * The device supports header caching and this dst has
                         * no cached header yet: take the write lock, re-check,
                         * initialise the cache, then build the header.
                         */
                        write_lock_bh(&neigh->lock);
                        if (!dst->hh)
                                neigh_hh_init(neigh, dst, dst->ops->protocol);
                        err = dev->hard_header(skb, dev, ntohs(skb->protocol),
                                               neigh->ha, NULL, skb->len);
                        write_unlock_bh(&neigh->lock);
                } else {
                        /* only neigh->ha is read here, so the read lock is used */
                        read_lock_bh(&neigh->lock);
                        err = dev->hard_header(skb, dev, ntohs(skb->protocol),
                                               neigh->ha, NULL, skb->len);
                        read_unlock_bh(&neigh->lock);
                }
                /* header built: transmit; otherwise take the free-skb path */
                if (err >= 0)
                        rc = neigh->ops->queue_xmit(skb);
                else
                        goto out_kfree_skb;
        }
}

这个函数中触发arp查询的逻辑在neigh_event_send()函数中,它最终调用__neigh_event_send():

/*
 * Advance the neighbour state machine for a packet that wants to be sent.
 *
 * Returns 0 when the caller may transmit skb itself, and 1 when skb has been
 * taken over here (queued on neigh->arp_queue, or freed because resolution
 * is not possible).
 *
 * @neigh: neighbour entry for the next hop
 * @skb:   packet waiting for the address (may be NULL)
 */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
        int rc;
        unsigned long now;
        write_lock_bh(&neigh->lock);
        rc = 0;
        /* these states need no action here: return 0 */
        if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
                goto out_unlock_bh;
        now = jiffies;
        
        if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
                if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
                        /*
                         * Probing is configured: enter NUD_INCOMPLETE and arm
                         * the timer for the next jiffy. The reference taken by
                         * neigh_hold() is presumably held for the pending timer.
                         */
                        atomic_set(&neigh->probes, neigh->parms->ucast_probes);
                        neigh->nud_state     = NUD_INCOMPLETE;
                        neigh_hold(neigh);
                        neigh->timer.expires = now + 1;
                        add_timer(&neigh->timer);
                } else {
                        /* no probes configured: mark the entry failed and drop the packet */
                        neigh->nud_state = NUD_FAILED;
                        write_unlock_bh(&neigh->lock);
                        if (skb)
                                kfree_skb(skb);
                        return 1;
                }
        } else if (neigh->nud_state & NUD_STALE) {
                /* stale entry: move to NUD_DELAY and arm the delay timer; rc stays 0 */
                NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
                neigh_hold(neigh);
                neigh->nud_state = NUD_DELAY;
                neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
                add_timer(&neigh->timer);
        }
        if (neigh->nud_state == NUD_INCOMPLETE) {
                if (skb) {
                        /* queue is full: unlink and free the buffer at its head first */
                        if (skb_queue_len(&neigh->arp_queue) >=
                            neigh->parms->queue_len) {
                                struct sk_buff *buff;
                                buff = neigh->arp_queue.next;
                                __skb_unlink(buff, &neigh->arp_queue);
                                kfree_skb(buff);
                        }
                        /* park the packet until the address is resolved */
                        __skb_queue_tail(&neigh->arp_queue, skb);
                }
                rc = 1;
        }
out_unlock_bh:
        write_unlock_bh(&neigh->lock);
        return rc;
}

发送arp报文并不是我们想的调用相关的发送函数,

而是设置neigh->nud_state为NUD_INCOMPLETE状态,

同时添加一个timer,把skb挂到neigh->arp_queue队列中。

在timer的回调里从skb取出必要信息,构造arp报文。

典型的 非阻塞操作+回调函数。

neighbour->timer是在neigh_alloc中分配的,指向neigh_timer_handler(),只看和NUD_INCOMPLETE相关的代码

/*
 * Timer callback of a neighbour entry; arg carries the struct neighbour
 * pointer.
 *
 * NOTE(review): abridged excerpt showing only the NUD_INCOMPLETE-related
 * code. `state` and `notify` are not used further in the code shown, and
 * the path that skips the last `if` does not release neigh->lock here.
 */
static void neigh_timer_handler(unsigned long arg)
{
        unsigned long now, next;
        struct neighbour *neigh = (struct neighbour *)arg;
        unsigned state;
        int notify = 0;
        write_lock(&neigh->lock);
        state = neigh->nud_state;
        now = jiffies;
        next = now + HZ;
        /* still in a timer-driven state: re-arm, no sooner than HZ/2 from now */
        if (neigh->nud_state & NUD_IN_TIMER) {
                neigh_hold(neigh);
                if (time_before(next, jiffies + HZ/2))
                        next = jiffies + HZ/2;
                neigh->timer.expires = next;
                add_timer(&neigh->timer);
        }
        if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
                /*
                 * Take an extra reference on the first queued packet (if any)
                 * so it stays valid after the lock is dropped, then let the
                 * protocol send its solicitation (the ARP request for IPv4,
                 * per the article) and count the probe.
                 */
                struct sk_buff *skb = skb_peek(&neigh->arp_queue);
                if (skb)
                        skb_get(skb);
                write_unlock(&neigh->lock);
                neigh->ops->solicit(neigh, skb);
                atomic_inc(&neigh->probes);
                /* drop the extra reference taken above */
                if (skb)
                        kfree_skb(skb);
        }
}

最终在timer的回调中调用neigh->ops->solicit,发送arp,然后就返回了。

到此,一个报文穿过了transport层,IP层,到了链路层,如果有缓存hh,则直接拷贝缓存的mac地址;如果没有则发送arp查询报文。

在arp的协助下构造好了2层报文后,就会调用设备层的发送函数发送这个报文,咱们下节再续。

Linux网络解读(3) - 数据包的发送之网络层

上一篇:ceph源码 - crush


下一篇:TLA+ Specifying System (1)