A Timing Analysis of the TCP/IP Protocol Stack in the Linux Kernel

1 Overview

This post walks through the send and receive paths of the TCP/IP protocol stack; the kernel version analyzed is Linux 2.6.26.

The article works from the two ends, sender and receiver, and traces how data is sent and received.

2 The send path

2.1 Application layer

It all starts with a send call in the client program:

if (send(sockfd, "Hello!\n", 7, 0) == -1)
   perror("send");
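
For context, the send call above sits inside a TCP client of roughly the following shape. This is only a minimal sketch, not the original client.c; the server address, port and error handling are illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	/* Hypothetical server address and port, for illustration only. */
	struct sockaddr_in srv = { .sin_family = AF_INET, .sin_port = htons(8888) };
	inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);

	int sockfd = socket(AF_INET, SOCK_STREAM, 0);
	if (sockfd == -1) {
		perror("socket");
		return EXIT_FAILURE;
	}

	if (connect(sockfd, (struct sockaddr *)&srv, sizeof(srv)) == -1) {
		perror("connect");
		return EXIT_FAILURE;
	}

	if (send(sockfd, "Hello!\n", 7, 0) == -1)	/* the call traced below */
		perror("send");

	close(sockfd);
	return 0;
}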

send is declared in socket.h:

extern ssize_t send (int __fd, __const void *__buf, size_t __n, int __flags);

In glibc, __send is defined in send.c (the excerpt below is the Mach/Hurd variant, but the aliasing mechanism is the same):

ssize_t
__send (fd, buf, n, flags)
     int fd;
     const void *buf;
     size_t n;
     int flags;
{
  error_t err;
  size_t wrote;

  err = HURD_DPORT_USE (fd, __socket_send (port, MACH_PORT_NULL,
					   flags, buf, n,
					   NULL, MACH_MSG_TYPE_COPY_SEND, 0,
					   NULL, 0, &wrote));

  return err ? __hurd_dfail (fd, err) : wrote;
}
libc_hidden_def (__send)
weak_alias (__send, send)

The line weak_alias (__send, send) defines send as a weak alias of __send, which allows the real definition to be supplied elsewhere.
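
glibc implements weak_alias with GCC's alias attribute. The stand-alone snippet below (hypothetical names, not glibc code) illustrates the same mechanism: my_send is a weak alias of __my_send, just as send is an alias of __send.

#include <stdio.h>
#include <sys/types.h>

/* The "real" implementation, analogous to __send in glibc. */
ssize_t __my_send(int fd, const void *buf, size_t n, int flags)
{
	(void)fd; (void)flags;
	return (ssize_t)fwrite(buf, 1, n, stdout);	/* stand-in for the real system call */
}

/* Analogous to weak_alias (__send, send): a strong definition elsewhere
 * could override the alias without touching __my_send. */
extern __typeof(__my_send) my_send __attribute__((weak, alias("__my_send")));

int main(void)
{
	return my_send(1, "Hello!\n", 7, 0) == 7 ? 0 : 1;
}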

The real definition of send() for Linux/i386 is generated from send.S:

#define	socket	send
#define	__socket __libc_send
#define	NARGS	4
#define NEED_CANCELLATION
#include <socket.S>
weak_alias (__libc_send, __send)
libc_hidden_def (__send)

which in turn includes socket.S:

#define P(a, b) P2(a, b)
#define P2(a, b) a##b

#ifndef __socket
# ifndef NO_WEAK_ALIAS
#  define __socket P(__,socket)
# else
#  define __socket socket
# endif
#endif

.globl __socket
	cfi_startproc
ENTRY (__socket)

	/* Save registers.  */
	movl %ebx, %edx
	cfi_register (3, 2)
	movl $SYS_ify(socketcall), %eax	/* System call number in %eax.  */
	/* Use ## so `socket' is a separate token that might be #define'd.  */
	movl $P(SOCKOP_,socket), %ebx	/* Subcode is first arg to syscall.  */
	lea 4(%esp), %ecx		/* Address of args is 2nd arg.  */
        /* Do the system call trap.  */
	ENTER_KERNEL
	/* Restore registers.  */
	movl %edx, %ebx
	cfi_restore (3)
	ret

On i386, the ENTER_KERNEL and SYS_ify macros are defined as follows:

# define ENTER_KERNEL int $0x80
#define SYS_ify(syscall_name)	__NR_##syscall_name

Expanding the macros gives:

ENTRY (__libc_send)

	movl %ebx, %edx
	cfi_register (3, 2)
	movl $__NR_socketcall, %eax	/* System call number in %eax.  */
	/* Use ## so `socket' is a separate token that might be #define'd.  */
	movl $SOCKOP_send, %ebx	/* Subcode is first arg to syscall.  */
	lea 4(%esp), %ecx		/* Address of args is 2nd arg.  */
        /* Do the system call trap.  */
	int $0x80
	/* Restore registers.  */
	movl %edx, %ebx
	cfi_restore (3)
	ret

The system call number is defined as 102 in unistd_32.h:

#define __NR_socketcall		102

SOCKOP_send is passed as the first argument in the ebx register, while the second argument, the address of the argument array on the stack, goes into ecx.

In glibc's socketcall.h it is defined as:

#define SOCKOP_send		9

so SOCKOP_send is 9.
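
To see the multiplexing from user space, the legacy socketcall interface can be invoked directly. A sketch, assuming a 32-bit x86 kernel that exposes __NR_socketcall (on other architectures the #ifdef branch is skipped and send() is a separate system call):

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/syscall.h>

int main(void)
{
	int sv[2];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
		perror("socketpair");
		return 1;
	}

#ifdef SYS_socketcall	/* only defined where the multiplexer exists, e.g. i386 */
	/* Same argument layout the glibc stub builds on the stack:
	 * fd, buf, len, flags; subcode 9 is SOCKOP_send / SYS_SEND. */
	long args[4] = { sv[0], (long)"Hello!\n", 7, 0 };
	long n = syscall(SYS_socketcall, 9, args);
	printf("socketcall(SEND) returned %ld\n", n);
#else
	puts("no socketcall multiplexer on this architecture");
#endif
	return 0;
}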

The int $0x80 trap instruction then enters the kernel at the system call entry point, system_call.

system_call is implemented in entry_32.S:

ENTRY(system_call)
syscall_call:
	call *sys_call_table(,%eax,4)
	movl %eax,PT_EAX(%esp)		# store the return value
syscall_exit:
	LOCKDEP_SYS_EXIT
	DISABLE_INTERRUPTS(CLBR_ANY)

Since eax was loaded with 102, call *sys_call_table(,%eax,4) invokes entry 102 of the system call table, which is defined in syscall_table.S:

.long sys_fstatfs	/* 100 */
.long sys_ioperm
.long sys_socketcall
.long sys_syslog
.long sys_setitimer
.long sys_getitimer	/* 105 */

Entry 102 is sys_socketcall.

sys_socketcall lives in net/socket.c:

asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
	switch (call) {
	case SYS_SEND:
		err = sys_send(a0, (void __user *)a1, a[2], a[3]);
		break;
	case SYS_SENDTO:
		err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
				 (struct sockaddr __user *)a[4], a[5]);
		break;
    }

The call parameter, the first argument of the system call, carries the SOCKOP_send value 9, and SYS_SEND is defined in net.h as:

#define SYS_SEND	9

so the switch statement takes the SYS_SEND branch.

sys_send() is executed next:

asmlinkage long sys_send(int fd, void __user *buff, size_t len, unsigned flags)
{
	return sys_sendto(fd, buff, len, flags, NULL, 0);
}

sys_send() simply forwards to sys_sendto.

Stepping into sys_sendto:

asmlinkage long sys_sendto(int fd, void __user *buff, size_t len,
			   unsigned flags, struct sockaddr __user *addr,
			   int addr_len)
{
	......
	err = sock_sendmsg(sock, &msg, len);

out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
}

which ultimately calls sock_sendmsg to send the message.

Stepping into sock_sendmsg:

int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
	......
	ret = __sock_sendmsg(&iocb, sock, msg, size);
	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&iocb);
	return ret;
}

sock_sendmsg delegates to __sock_sendmsg.

Stepping into __sock_sendmsg:

static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size)
{
	struct sock_iocb *si = kiocb_to_siocb(iocb);
	int err;

	si->sock = sock;
	si->scm = NULL;
	si->msg = msg;
	si->size = size;

	err = security_socket_sendmsg(sock, msg, size);
	if (err)
		return err;

	return sock->ops->sendmsg(iocb, sock, msg, size);
}

__sock_sendmsg dispatches through the ops pointer of the socket structure.

The socket structure is shown below:

struct socket {
	socket_state		state;
	unsigned long		flags;
	const struct proto_ops	*ops;
	struct fasync_struct	*fasync_list;
	struct file		*file;
	struct sock		*sk;
	wait_queue_head_t	wait;
	short			type;
};

It contains a pointer to the operations table proto_ops.

The proto_ops structure looks like this:

struct proto_ops {
	int		family;
	struct module	*owner;
	......
	int		(*sendmsg)   (struct kiocb *iocb, struct socket *sock,
				      struct msghdr *m, size_t total_len);
};

proto_ops is essentially a table of function pointers; the pointers are installed during kernel initialization, when inet_init runs via the initcall mechanism.
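
Concretely, inet_init registers the entries of inetsw_array (via inet_register_protosw); the SOCK_STREAM entry ties TCP sockets to tcp_prot and inet_stream_ops. The excerpt below is trimmed and reproduced from memory, so treat the exact field list as approximate:

static struct inet_protosw inetsw_array[] =
{
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		......
	},
	......
};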

When a TCP (SOCK_STREAM) socket is created, the operations table assigned to it is inet_stream_ops:

const struct proto_ops inet_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = inet_accept,
	.getname	   = inet_getname,
	.poll		   = tcp_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = inet_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = tcp_sendmsg,
	.recvmsg	   = sock_common_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = tcp_sendpage,
	.splice_read	   = tcp_splice_read,
};

Here sendmsg is set to tcp_sendmsg,

so calling sock->ops->sendmsg means calling tcp_sendmsg.

2.2 Transport layer

tcp_sendmsg looks like this (abridged):

int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t size)
{
	......
			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;
	......
}

tcp_sendmsg copies the user data into sk_buffs on the socket's send queue and then pushes them out, either through __tcp_push_pending_frames or, for a single segment, tcp_push_one.

Stepping into __tcp_push_pending_frames:

void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	struct sk_buff *skb = tcp_send_head(sk);

	if (skb) {
		if (tcp_write_xmit(sk, cur_mss, nonagle))
			tcp_check_probe_timer(sk);
	}
}

When there is an skb at the head of the send queue, tcp_write_xmit is called to transmit the pending data:

static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
{
	......

		if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
			break;

        ......
    }

This function ends up calling tcp_transmit_skb for each segment it sends.

tcp_transmit_skb is shown below (only the key call is kept):

static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
    
    ......
err = icsk->icsk_af_ops->queue_xmit(skb, 0);

	......
}

tcp_transmit_skb builds the TCP header for the segment, filling in sequence numbers, the flag bits (SYN, ACK, PSH and so on) and the checksum.

The actual transmission is handed off through icsk->icsk_af_ops->queue_xmit, and for IPv4 TCP this function pointer points to ip_queue_xmit.
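
For IPv4 TCP sockets, icsk->icsk_af_ops points to ipv4_specific, defined in net/ipv4/tcp_ipv4.c. A trimmed, from-memory sketch of the initializer (field list abridged):

struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	......
};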

2.3 Network layer

From tcp_transmit_skb in tcp_output.c, control passes to the network layer: ip_queue_xmit in ip_output.c.

ip_queue_xmit looks like this (abridged):

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
    ......
    return ip_local_out(skb);
    ......
        
}

ip_queue_xmit looks up a route and builds the IP header, then calls ip_local_out to send the datagram.

ip_local_out:

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}

ip_local_out calls dst_output,

which is defined as:

static inline int dst_output(struct sk_buff *skb)
{
	return skb->dst->output(skb);
}

dst_output invokes skb->dst->output; for a locally generated unicast packet this pointer points to ip_output.

ip_output:

int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

The NF_HOOK_COND macro is defined as:

#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond)		       \
({int __ret;								       \
if ((__ret=nf_hook_thresh(pf, hook, (skb), indev, outdev, okfn, INT_MIN, cond)) == 1)\
	__ret = (okfn)(skb);						       \
__ret;})

which means that the call

NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));

ends up executing ip_finish_output once the NF_INET_POST_ROUTING netfilter hooks accept the packet.

ip_finish_output:

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb->dst->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

Depending on whether the packet exceeds the MTU, this function either calls ip_fragment or goes straight to ip_finish_output2.

ip_fragment splits the datagram into fragments and sends each of them, still ending up in ip_finish_output2, so we can look directly at ip_finish_output2.

ip_finish_output2:

static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST)
		IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	else if (rt->rt_type == RTN_BROADCAST)
		IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

This function checks the skb headroom and, if it is too small, reallocates the buffer so that there is enough room for the link-layer header. It then transmits either through the cached hardware-header path (neigh_hh_output) or through the dst->neighbour->output pointer of the neighbour subsystem, which in the common case points to neigh_resolve_output.

The neighbour subsystem maintains the mapping from IP addresses to MAC addresses (ARP in the IPv4 case).

Turning to neigh_resolve_output:

int neigh_resolve_output(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct neighbour *neigh;
	int rc = 0;

	if (!dst || !(neigh = dst->neighbour))
		goto discard;

	__skb_pull(skb, skb_network_offset(skb));

	if (!neigh_event_send(neigh, skb)) {
		int err;
		struct net_device *dev = neigh->dev;
		if (dev->header_ops->cache && !dst->hh) {
			write_lock_bh(&neigh->lock);
			if (!dst->hh)
				neigh_hh_init(neigh, dst, dst->ops->protocol);
			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
					      neigh->ha, NULL, skb->len);
			write_unlock_bh(&neigh->lock);
		} else {
			read_lock_bh(&neigh->lock);
			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
					      neigh->ha, NULL, skb->len);
			read_unlock_bh(&neigh->lock);
		}
		if (err >= 0)
			rc = neigh->ops->queue_xmit(skb);
		else
			goto out_kfree_skb;
	}
out:
	return rc;
discard:
	NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
		      dst, dst ? dst->neighbour : NULL);
out_kfree_skb:
	rc = -EINVAL;
	kfree_skb(skb);
	goto out;
}

At rc = neigh->ops->queue_xmit(skb); the packet is handed off through the neighbour ops table, and queue_xmit actually points to dev_queue_xmit.
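
The neighbour operations for ARP are set up in net/ipv4/arp.c; in the generic case the table looks roughly like this (trimmed, from memory), with queue_xmit wired to dev_queue_xmit:

static const struct neigh_ops arp_generic_ops = {
	.family =		AF_INET,
	.solicit =		arp_solicit,
	.error_report =		arp_error_report,
	.output =		neigh_resolve_output,
	.connected_output =	neigh_connected_output,
	.hh_output =		dev_queue_xmit,
	.queue_xmit =		dev_queue_xmit,
};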

2.4 Link layer

The neighbour subsystem hands the skb to dev_queue_xmit:

int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct Qdisc *q;
	int rc = -ENOMEM;
	if (netif_needs_gso(dev, skb))
		goto gso;

	if (skb_shinfo(skb)->frag_list &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	spin_lock_prefetch(&dev->queue_lock);

	rcu_read_lock_bh();

	q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
	if (q->enqueue) {
		/* Grab device queue */
		spin_lock(&dev->queue_lock);
		q = dev->qdisc;
		if (q->enqueue) {
			/* reset queue_mapping to zero */
			skb_set_queue_mapping(skb, 0);
			rc = q->enqueue(skb, q);
			qdisc_run(dev);
			spin_unlock(&dev->queue_lock);

			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
			goto out;
		}
		spin_unlock(&dev->queue_lock);
	}
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (dev->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, cpu);

			if (!netif_queue_stopped(dev) &&
			    !netif_subqueue_stopped(dev, skb)) {
				rc = 0;
				if (!dev_hard_start_xmit(skb, dev)) {
					HARD_TX_UNLOCK(dev);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}

If the device has a queueing discipline, the skb is enqueued and qdisc_run drains the queue; devices without a queue (such as loopback) call dev_hard_start_xmit directly. Either way the frame eventually reaches dev_hard_start_xmit.

dev_hard_start_xmit:

int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		}

		return dev->hard_start_xmit(skb, dev);
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;
		int rc;

		skb->next = nskb->next;
		nskb->next = NULL;
		rc = dev->hard_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		if (unlikely((netif_queue_stopped(dev) ||
			     netif_subqueue_stopped(dev, skb)) &&
			     skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}

This function calls dev->hard_start_xmit, the driver's transmit routine.

Each NIC driver installs its own hard_start_xmit implementation; for the RTL8169 driver, dev->hard_start_xmit points to rtl8169_start_xmit:

static int rtl8169_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct rtl8169_private *tp = netdev_priv(dev);
	unsigned int frags, entry = tp->cur_tx % NUM_TX_DESC;
	struct TxDesc *txd = tp->TxDescArray + entry;
	void __iomem *ioaddr = tp->mmio_addr;
	dma_addr_t mapping;
	u32 status, len;
	u32 opts1;
	int ret = NETDEV_TX_OK;

	if (unlikely(TX_BUFFS_AVAIL(tp) < skb_shinfo(skb)->nr_frags)) {
		if (netif_msg_drv(tp)) {
			printk(KERN_ERR
			       "%s: BUG! Tx Ring full when queue awake!\n",
			       dev->name);
		}
		goto err_stop;
	}

	if (unlikely(le32_to_cpu(txd->opts1) & DescOwn))
		goto err_stop;

	opts1 = DescOwn | rtl8169_tso_csum(skb, dev);

	frags = rtl8169_xmit_frags(tp, skb, opts1);
	if (frags) {
		len = skb_headlen(skb);
		opts1 |= FirstFrag;
	} else {
		len = skb->len;

		if (unlikely(len < ETH_ZLEN)) {
			if (skb_padto(skb, ETH_ZLEN))
				goto err_update_stats;
			len = ETH_ZLEN;
		}

		opts1 |= FirstFrag | LastFrag;
		tp->tx_skb[entry].skb = skb;
	}

	mapping = pci_map_single(tp->pci_dev, skb->data, len, PCI_DMA_TODEVICE);

	tp->tx_skb[entry].len = len;
	txd->addr = cpu_to_le64(mapping);
	txd->opts2 = cpu_to_le32(rtl8169_tx_vlan_tag(tp, skb));

	wmb();

	/* anti gcc 2.95.3 bugware (sic) */
	status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
	txd->opts1 = cpu_to_le32(status);

	dev->trans_start = jiffies;

	tp->cur_tx += frags + 1;

	smp_wmb();

	RTL_W8(TxPoll, NPQ);	/* set polling bit */

	if (TX_BUFFS_AVAIL(tp) < MAX_SKB_FRAGS) {
		netif_stop_queue(dev);
		smp_rmb();
		if (TX_BUFFS_AVAIL(tp) >= MAX_SKB_FRAGS)
			netif_wake_queue(dev);
	}

out:
	return ret;

err_stop:
	netif_stop_queue(dev);
	ret = NETDEV_TX_BUSY;
err_update_stats:
	dev->stats.tx_dropped++;
	goto out;
}

The driver maps the payload for DMA, fills in a TX descriptor, and then kicks the hardware (by writing the TxPoll register) so that the NIC puts the frame on the wire.

This completes the in-kernel send path.

2.5 Debugging the send path

With a breakpoint set at dev->hard_start_xmit (the debugging session uses a Linux 5.4.34 kernel), the call stack looks like this:

(Figure: call stack captured at the dev->hard_start_xmit breakpoint)

The send path enters the kernel through entry_SYSCALL_64, goes through sys_sendto, then into the transport layer (tcp_sendmsg), the network layer (ip_output), and the link layer (dev_queue_xmit), and finally reaches the NIC driver's transmit function, after which the physical layer carries the data.

3 The recv path

Receiving again traverses all five layers. The physical layer is not analyzed here; we start at the link layer, i.e. at the moment the data reaches the NIC.

3.1 Link layer

Whenever the NIC receives data it raises an interrupt and the driver's interrupt handler runs; for the cs8900 NIC this handler is net_interrupt:

static irqreturn_t net_interrupt(int irq, void *dev_id)
{
	......
	while ((status = ioread16(lp->virt_addr + ISQ_PORT))) {
		cs89_dbg(4, debug, "%s: event=%04x\n", dev->name, status);
		handled = 1;
		switch (status & ISQ_EVENT_MASK) {
		case ISQ_RECEIVER_EVENT:
			/* Got a packet(s). */
			net_rx(dev);
			break;
		case ISQ_TRANSMITTER_EVENT:
			dev->stats.tx_packets++;
			netif_wake_queue(dev);	/* Inform upper layers. */
      ......  
        }
        

The while loop reads the interrupt status register, and for receive events calls net_rx to handle the incoming packet.

net_rx:

static void
net_rx(struct net_device *dev)
{
	struct net_local *lp = netdev_priv(dev);
	struct sk_buff *skb;
	int status, length;

	status = ioread16(lp->virt_addr + RX_FRAME_PORT);
	length = ioread16(lp->virt_addr + RX_FRAME_PORT);

	if ((status & RX_OK) == 0) {
		count_rx_errors(status, dev);
		return;
	}

	/* Malloc up new buffer. */
	skb = netdev_alloc_skb(dev, length + 2);
	if (skb == NULL) {
		dev->stats.rx_dropped++;
		return;
	}
	skb_reserve(skb, 2);	/* longword align L3 header */

	readwords(lp, RX_FRAME_PORT, skb_put(skb, length), length >> 1);
	if (length & 1)
		skb->data[length-1] = ioread16(lp->virt_addr + RX_FRAME_PORT);

	cs89_dbg(3, debug, "%s: received %d byte packet of type %x\n",
		 dev->name, length,
		 (skb->data[ETH_ALEN + ETH_ALEN] << 8) |
		 skb->data[ETH_ALEN + ETH_ALEN + 1]);

	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
	dev->stats.rx_packets++;
	dev->stats.rx_bytes += length;
}

net_rx allocates an skb, copies the frame out of the device buffer into it, and then hands the skb to netif_rx for further processing.

netif_rx:

int netif_rx(struct sk_buff *skb)
{
	......
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (queue->input_pkt_queue.qlen) {
enqueue:
			dev_hold(skb->dev);
			__skb_queue_tail(&queue->input_pkt_queue, skb);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		napi_schedule(&queue->backlog);
		goto enqueue;
	}

	......
}

If the per-CPU input queue already holds packets, the skb is simply appended with __skb_queue_tail; if the queue is empty, napi_schedule is called first and the skb is then enqueued.

The napi_struct scheduled here drives the bottom half (softirq) processing of the interrupt.

Stepping into napi_schedule:

static inline void napi_schedule(struct napi_struct *n)
{
	if (napi_schedule_prep(n))
		__napi_schedule(n);
}

which in turn calls __napi_schedule:

void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	local_irq_save(flags);
	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	local_irq_restore(flags);
}

This function disables local interrupts, appends the napi_struct to the per-CPU poll list, raises the NET_RX_SOFTIRQ softirq, and then restores interrupts.

Softirq processing enters through do_softirq, which eventually runs the network receive softirq handler net_rx_action:

static void net_rx_action(struct softirq_action *h)
{
	......
		if (test_bit(NAPI_STATE_SCHED, &n->state))
			work = n->poll(n, weight);
	......
}

net_rx_action calls n->poll for each scheduled napi_struct; for the legacy (non-NAPI) backlog queue used by netif_rx, this poll function is set to process_backlog in net_dev_init.

So net_rx_action ends up calling process_backlog to process the queued packets.
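
That wiring happens in net_dev_init (net/core/dev.c), which initializes the per-CPU backlog napi_struct. A trimmed, from-memory sketch:

	for_each_possible_cpu(i) {
		struct softnet_data *queue;

		queue = &per_cpu(softnet_data, i);
		skb_queue_head_init(&queue->input_pkt_queue);
		......
		queue->backlog.poll = process_backlog;
		queue->backlog.weight = weight_p;
	}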

process_backlog looks like this:

static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *queue = &__get_cpu_var(softnet_data);
	unsigned long start_time = jiffies;

	napi->weight = weight_p;
	do {
		struct sk_buff *skb;
		struct net_device *dev;

		local_irq_disable();
		skb = __skb_dequeue(&queue->input_pkt_queue);
		if (!skb) {
			__napi_complete(napi);
			local_irq_enable();
			break;
		}

		local_irq_enable();

		dev = skb->dev;

		netif_receive_skb(skb);

		dev_put(dev);
	} while (++work < quota && jiffies == start_time);

	return work;
}

The loop dequeues skbs from the per-CPU input queue and passes each one up the stack through netif_receive_skb.

Turning to netif_receive_skb:

int netif_receive_skb(struct sk_buff *skb)
{
	......
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (!ptype->dev || ptype->dev == skb->dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	......
}

For each matching packet_type, deliver_skb hands the skb to the registered protocol handler.

deliver_skb is:

static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

It invokes the pt_prev->func pointer, which for IPv4 packets points to ip_rcv.
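
The IPv4 handler is registered by inet_init through dev_add_pack(&ip_packet_type); the packet_type structure in net/ipv4/af_inet.c looks roughly like this (trimmed, from memory):

static struct packet_type ip_packet_type = {
	.type = __constant_htons(ETH_P_IP),
	.func = ip_rcv,
	......
};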

3.2 Network layer

The first network-layer function on the receive path is ip_rcv:

int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct iphdr *iph;
	u32 len;

	/* When the interface is in promisc. mode, drop all the crap
	 * that it receives, do not try to analyse it.
	 */
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto drop;

	IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);

	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
		goto out;
	}

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;

	iph = ip_hdr(skb);

	/*
	 *	RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 *	Is the datagram acceptable?
	 *
	 *	1.	Length at least the size of an ip header
	 *	2.	Version of 4
	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 *	4.	Doesn't have a bogus length
	 */

	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;

	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;

	iph = ip_hdr(skb);

	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto inhdr_error;

	len = ntohs(iph->tot_len);
	if (skb->len < len) {
		IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))
		goto inhdr_error;

	/* Our transport medium may have padded the buffer out. Now we know it
	 * is IP we can trim to the true length of the frame.
	 * Note this now means skb->len holds ntohs(iph->tot_len).
	 */
	if (pskb_trim_rcsum(skb, len)) {
		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Remove any debris in the socket control block */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

	return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
		       ip_rcv_finish);

inhdr_error:
	IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
	kfree_skb(skb);
out:
	return NET_RX_DROP;
}

ip_rcv first checks the packet type: packets addressed to another host (PACKET_OTHERHOST) are dropped rather than analyzed.

skb_share_check checks whether the skb is shared and, if so, clones it so that it can be modified safely.

pskb_may_pull makes sure the full IP header is present in the linear part of the skb; the header is then validated (version, header length, checksum, total length).

When netfilter is compiled out, the NF_HOOK macro reduces to calling the okfn directly:

#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)

Therefore

return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);

calls ip_rcv_finish(struct sk_buff *skb) (with netfilter enabled, it is invoked after the NF_INET_PRE_ROUTING hooks accept the packet).

Turning to ip_rcv_finish:

static int ip_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct rtable *rt;

	/*
	 *	Initialise the virtual path cache for the packet. It describes
	 *	how the packet travels inside Linux networking.
	 */
	if (skb->dst == NULL) {
		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
					 skb->dev);
		if (unlikely(err)) {
			if (err == -EHOSTUNREACH)
				IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
			else if (err == -ENETUNREACH)
				IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
			goto drop;
		}
	}

#ifdef CONFIG_NET_CLS_ROUTE
	if (unlikely(skb->dst->tclassid)) {
		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
		u32 idx = skb->dst->tclassid;
		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes+=skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes+=skb->len;
	}
#endif

	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;

	rt = skb->rtable;
	if (rt->rt_type == RTN_MULTICAST)
		IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
	else if (rt->rt_type == RTN_BROADCAST)
		IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);

	return dst_input(skb);

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

This function first performs a routing lookup via ip_route_input (if the skb does not yet have a dst entry) and then calls dst_input:

static inline int dst_input(struct sk_buff *skb)
{
	int err;

	for (;;) {
		err = skb->dst->input(skb);

		if (likely(err == 0))
			return err;
		/* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
		if (unlikely(err != NET_XMIT_BYPASS))
			return err;
	}
}

dst_input invokes the route's input handler; for packets destined to the local host, this handler is set to ip_local_deliver in ip_route_input_slow.
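
For reference, the assignment sits in the local-delivery branch of ip_route_input_slow (net/ipv4/route.c); a heavily trimmed, from-memory sketch:

local_input:
	......
	rth->u.dst.input = ip_local_deliver;
	......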

Next, ip_local_deliver:

int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 *	Reassemble IP fragments.
	 */

	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
		if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
			return 0;
	}

	return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}

It first reassembles IP fragments if necessary, then goes through the NF_HOOK macro (the NF_INET_LOCAL_IN hook) to ip_local_deliver_finish:

static int ip_local_deliver_finish(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);

	__skb_pull(skb, ip_hdrlen(skb));

	/* Point into the IP datagram, just past the header. */
	skb_reset_transport_header(skb);

	rcu_read_lock();
	{
		int protocol = ip_hdr(skb)->protocol;
		int hash, raw;
		struct net_protocol *ipprot;

	resubmit:
		raw = raw_local_deliver(skb, protocol);

		hash = protocol & (MAX_INET_PROTOS - 1);
		ipprot = rcu_dereference(inet_protos[hash]);
		if (ipprot != NULL && (net == &init_net || ipprot->netns_ok)) {
			int ret;

			if (!ipprot->no_policy) {
				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					kfree_skb(skb);
					goto out;
				}
				nf_reset(skb);
			}
			ret = ipprot->handler(skb);
			if (ret < 0) {
				protocol = -ret;
				goto resubmit;
			}
			IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
		} else {
			if (!raw) {
				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
					icmp_send(skb, ICMP_DEST_UNREACH,
						  ICMP_PROT_UNREACH, 0);
				}
			} else
				IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
			kfree_skb(skb);
		}
	}
 out:
	rcu_read_unlock();

	return 0;
}

It pulls the IP header off the skb so that the transport header points at the TCP header, then calls ipprot->handler, the transport-layer receive handler, which for TCP points to tcp_v4_rcv.
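
The handler for protocol number IPPROTO_TCP is registered by inet_init via inet_add_protocol(&tcp_protocol, IPPROTO_TCP); the structure in net/ipv4/af_inet.c is roughly (trimmed, from memory):

static struct net_protocol tcp_protocol = {
	.handler =	tcp_v4_rcv,
	.err_handler =	tcp_v4_err,
	.no_policy =	1,
	......
};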

3.3 Transport layer

From ip_local_deliver_finish in the network layer, the first transport-layer receive function is tcp_v4_rcv.

tcp_v4_rcv is too long to quote in full; the relevant part tries to queue the packet on the prequeue via tcp_prequeue and otherwise processes it immediately with tcp_v4_do_rcv:

if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);

tcp_prequeue looks like this:

static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!sysctl_tcp_low_latency && tp->ucopy.task) {
		__skb_queue_tail(&tp->ucopy.prequeue, skb);
		tp->ucopy.memory += skb->truesize;
		if (tp->ucopy.memory > sk->sk_rcvbuf) {
			struct sk_buff *skb1;

			BUG_ON(sock_owned_by_user(sk));

			while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
				sk->sk_backlog_rcv(sk, skb1);
				NET_INC_STATS_BH(LINUX_MIB_TCPPREQUEUEDROPPED);
			}

			tp->ucopy.memory = 0;
		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
			wake_up_interruptible(sk->sk_sleep);
			if (!inet_csk_ack_scheduled(sk))
				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						          (3 * TCP_RTO_MIN) / 4,
							  TCP_RTO_MAX);
		}
		return 1;
	}
	return 0;
}

When the prequeue grows beyond the socket's receive buffer, the function drains it by calling sk->sk_backlog_rcv on each queued skb; for TCP this pointer points to tcp_v4_do_rcv.

Turning to tcp_v4_do_rcv:

int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	.......

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	.......
}

For an established connection it calls tcp_rcv_established to process the segment.

Turning to tcp_rcv_established:

int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
			struct tcphdr *th, unsigned len)
{
	......
			if (eaten)
				__kfree_skb(skb);
			else
				sk->sk_data_ready(sk, 0);
			return 0;
	......
}

This function copies the in-order data to the receiving process (or queues it), and once that is done calls sk->sk_data_ready to wake up the process sleeping on the socket; this callback points to sock_def_readable.
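
sk_data_ready is assigned when the sock is initialized, in sock_init_data (net/core/sock.c); a trimmed, from-memory sketch:

void sock_init_data(struct socket *sock, struct sock *sk)
{
	......
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	......
}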

3.4 Application layer

The server process has now been woken up by tcp_rcv_established in the transport layer.

At the application layer the analysis starts from the recv call in server.c.
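
As with the client, the recv call sits inside a server of roughly this shape; a minimal sketch with an illustrative port, not the original server.c:

#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	char buf[128];
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(8888),	/* illustrative */
				    .sin_addr.s_addr = htonl(INADDR_ANY) };

	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	if (lfd == -1 ||
	    bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) == -1 ||
	    listen(lfd, 1) == -1) {
		perror("listen socket");
		return 1;
	}

	int cfd = accept(lfd, NULL, NULL);		/* blocks until the client connects */
	ssize_t n = recv(cfd, buf, sizeof(buf) - 1, 0);	/* the call traced below */
	if (n == -1)
		perror("recv");
	else {
		buf[n] = '\0';
		printf("received: %s", buf);
	}

	close(cfd);
	close(lfd);
	return 0;
}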

Like send, recv first goes through the system call layer: the kernel side (sys_recv, which forwards to sys_recvfrom) ends up in sock_recvmsg and then __sock_recvmsg.

Turning to __sock_recvmsg:

static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size, int flags)
{
	int err;
	struct sock_iocb *si = kiocb_to_siocb(iocb);

	si->sock = sock;
	si->scm = NULL;
	si->msg = msg;
	si->size = size;
	si->flags = flags;

	err = security_socket_recvmsg(sock, msg, size, flags);
	if (err)
		return err;

	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
}

It calls sock->ops->recvmsg, which (per inet_stream_ops shown earlier) points to sock_common_recvmsg:

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}

This in turn calls sk->sk_prot->recvmsg, which points to tcp_recvmsg and completes the reception of the data.
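
Both TCP pointers met in this section come from the tcp_prot table in net/ipv4/tcp_ipv4.c: recvmsg is tcp_recvmsg, and backlog_rcv (copied into sk->sk_backlog_rcv during socket initialization) is tcp_v4_do_rcv. A trimmed, from-memory sketch:

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	......
};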

tcp_recvmsg is too long to quote; essentially it loops over the socket's receive queue and copies the data of each segment into the user buffer.

This completes the path from the server's recv call down to the data waiting in the receive queue.

4 Timing analysis

Putting the code above together yields the call sequence shown below:

(Figure: overall send/receive call sequence)

The overall flow consists of three stages:

  1. Sending

    The send path runs from the system call all the way to handing the data over to the NIC for transmission.

  2. Receiving

    Reception starts with a hardware interrupt, whose handler runs until the data is placed on the transport layer's receive queue.

  3. Fetching the data

    The receiver calls recv, which goes through the system call layer and pulls the data from the receive queue.
