1 Overview
This post describes the send and receive paths of the TCP/IP protocol stack in the Linux kernel, version 2.6.26. It looks at the sending side and the receiving side separately and traces how data flows through the code as it is sent and received.
2 The send path
2.1 Application layer
We start with the send() call in the client program:
if (send(sockfd, "Hello!\n", 7, 0) == -1)
perror("send");
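For context, a minimal client around that call could look like the sketch below (hedged: the server address 127.0.0.1 and port 8888 are assumptions, not taken from the original client.c). The socket has already been created and connected by the time send() is reached.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
int main(void)
{
    struct sockaddr_in srv;
    int sockfd = socket(AF_INET, SOCK_STREAM, 0);   /* TCP socket */
    if (sockfd == -1) {
        perror("socket");
        exit(1);
    }
    memset(&srv, 0, sizeof(srv));
    srv.sin_family = AF_INET;
    srv.sin_port = htons(8888);                     /* assumed server port */
    inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr); /* assumed server address */
    if (connect(sockfd, (struct sockaddr *)&srv, sizeof(srv)) == -1) {
        perror("connect");
        exit(1);
    }
    if (send(sockfd, "Hello!\n", 7, 0) == -1)       /* the call we trace below */
        perror("send");
    close(sockfd);
    return 0;
}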
send is declared in socket.h:
extern ssize_t send (int __fd, __const void *__buf, size_t __n, int __flags);
A C definition of __send can be found in glibc's send.c (this particular implementation is the Hurd port, but the aliasing at the end is the same across ports):
ssize_t
__send (fd, buf, n, flags)
int fd;
const void *buf;
size_t n;
int flags;
{
error_t err;
size_t wrote;
err = HURD_DPORT_USE (fd, __socket_send (port, MACH_PORT_NULL,
flags, buf, n,
NULL, MACH_MSG_TYPE_COPY_SEND, 0,
NULL, 0, &wrote));
return err ? __hurd_dfail (fd, err) : wrote;
}
libc_hidden_def (__send)
weak_alias (__send, send)
weak_alias (__send, send) defines send as a weak alias of __send, which allows the real implementation to be provided elsewhere (for instance by a platform-specific version).
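As a small standalone illustration of what weak_alias() boils down to (assuming GCC on an ELF platform; the names __my_send and my_send are made up for the example):
#include <stdio.h>
#include <sys/types.h>
/* The strong symbol carries the actual code. */
ssize_t __my_send(int fd, const void *buf, size_t n, int flags)
{
    (void)fd; (void)buf; (void)flags;
    printf("__my_send called, n=%zu\n", n);
    return (ssize_t)n;
}
/* Roughly what weak_alias(__my_send, my_send) expands to: a weak symbol
 * that resolves to __my_send unless some other object file overrides it. */
extern ssize_t my_send(int fd, const void *buf, size_t n, int flags)
    __attribute__((weak, alias("__my_send")));
int main(void)
{
    return my_send(3, "Hello!\n", 7, 0) == 7 ? 0 : 1;
}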
On Linux/i386 the real definition of send() comes from send.S:
#define socket send
#define __socket __libc_send
#define NARGS 4
#define NEED_CANCELLATION
#include <socket.S>
weak_alias (__libc_send, __send)
libc_hidden_def (__send)
which includes the file socket.S:
#define P(a, b) P2(a, b)
#define P2(a, b) a##b
#ifndef __socket
# ifndef NO_WEAK_ALIAS
# define __socket P(__,socket)
# else
# define __socket socket
# endif
#endif
.globl __socket
cfi_startproc
ENTRY (__socket)
/* Save registers. */
movl %ebx, %edx
cfi_register (3, 2)
movl $SYS_ify(socketcall), %eax /* System call number in %eax. */
/* Use ## so `socket' is a separate token that might be #define'd. */
movl $P(SOCKOP_,socket), %ebx /* Subcode is first arg to syscall. */
lea 4(%esp), %ecx /* Address of args is 2nd arg. */
/* Do the system call trap. */
ENTER_KERNEL
/* Restore registers. */
movl %edx, %ebx
cfi_restore (3)
ret
On i386 the ENTER_KERNEL and SYS_ify macros are defined as follows:
# define ENTER_KERNEL int $0x80
#define SYS_ify(syscall_name) __NR_##syscall_name
Expanding the macros gives:
ENTRY (__libc_send)
movl %ebx, %edx
cfi_register (3, 2)
movl $__NR_socketcall, %eax /* System call number in %eax. */
/* Use ## so `socket' is a separate token that might be #define'd. */
movl $SOCKOP_send, %ebx /* Subcode is first arg to syscall. */
lea 4(%esp), %ecx /* Address of args is 2nd arg. */
/* Do the system call trap. */
int $0x80
/* Restore registers. */
movl %edx, %ebx
cfi_restore (3)
ret
The system call number is defined in unistd_32.h as 102:
#define __NR_socketcall 102
SOCKOP_send is placed in the ebx register as the first argument of the system call, and the address of the argument array is placed in ecx as the second argument.
In glibc's socketcall.h it is defined as:
#define SOCKOP_send 9
so SOCKOP_send is 9.
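To make the numbers concrete, the snippet below is a hedged sketch of what the glibc stub amounts to, written in C instead of assembly. It applies only to 32-bit x86 kernels, where all socket operations are funnelled through the single socketcall system call; SYS_SEND from linux/net.h has the same value 9 as glibc's SOCKOP_send. The helper raw_send() is an invented name.
#include <unistd.h>
#include <sys/syscall.h>    /* SYS_socketcall (defined on i386 only) */
#include <linux/net.h>      /* SYS_SEND == 9 */
/* Equivalent of send(fd, buf, len, flags) on i386: the sub-code is the first
 * syscall argument, the real arguments are passed as an array whose address
 * becomes the second argument (the value loaded into %ecx above). */
static long raw_send(int fd, const void *buf, unsigned long len, int flags)
{
    unsigned long args[4] = {
        (unsigned long)fd, (unsigned long)buf, len, (unsigned long)flags
    };
    return syscall(SYS_socketcall, SYS_SEND, args);
}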
The software interrupt instruction int $0x80 is then executed, which traps into the kernel's system call entry point, system_call.
system_call is implemented in entry_32.S:
ENTRY(system_call)
syscall_call:
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
Since eax holds 102, call *sys_call_table(,%eax,4) invokes entry 102 of the system call table, which is defined in syscall_table.S:
.long sys_fstatfs /* 100 */
.long sys_ioperm
.long sys_socketcall
.long sys_syslog
.long sys_setitimer
.long sys_getitimer /* 105 */
Entry 102 is sys_socketcall.
sys_socketcall lives in net/socket.c:
asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
switch (call) {
case SYS_SEND:
err = sys_send(a0, (void __user *)a1, a[2], a[3]);
break;
case SYS_SENDTO:
err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
(struct sockaddr __user *)a[4], a[5]);
break;
}
The first argument, call, carries the value of SOCKOP_send, i.e. 9, and SYS_SEND is defined in net.h as
#define SYS_SEND 9
so the switch takes the SYS_SEND branch and sys_send() is executed next:
asmlinkage long sys_send(int fd, void __user *buff, size_t len, unsigned flags)
{
return sys_sendto(fd, buff, len, flags, NULL, 0);
}
sys_send() simply forwards to sys_sendto(). Stepping into sys_sendto():
asmlinkage long sys_sendto(int fd, void __user *buff, size_t len,
unsigned flags, struct sockaddr __user *addr,
int addr_len)
{
......
err = sock_sendmsg(sock, &msg, len);
out_put:
fput_light(sock->file, fput_needed);
out:
return err;
}
which eventually calls sock_sendmsg() to send the message. Inside sock_sendmsg():
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
......
ret = __sock_sendmsg(&iocb, sock, msg, size);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&iocb);
return ret;
}
sock_sendmsg() in turn calls __sock_sendmsg(). Inside __sock_sendmsg():
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size)
{
struct sock_iocb *si = kiocb_to_siocb(iocb);
int err;
si->sock = sock;
si->scm = NULL;
si->msg = msg;
si->size = size;
err = security_socket_sendmsg(sock, msg, size);
if (err)
return err;
return sock->ops->sendmsg(iocb, sock, msg, size);
}
__sock_sendmsg() finishes by calling through the ops pointer of struct socket, which is defined as follows:
struct socket {
socket_state state;
unsigned long flags;
const struct proto_ops *ops;
struct fasync_struct *fasync_list;
struct file *file;
struct sock *sk;
wait_queue_head_t wait;
short type;
};
It holds a pointer to the operations table struct proto_ops:
struct proto_ops {
int family;
struct module *owner;
......
int (*sendmsg) (struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len);
};
proto_ops is essentially a table of function pointers. The table is installed during kernel initialization, when inet_init() runs via the initcall mechanism; for a TCP (SOCK_STREAM) socket the table chosen at socket creation time is inet_stream_ops:
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = tcp_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage,
.splice_read = tcp_splice_read,
};
The sendmsg member is set to tcp_sendmsg, so calling sock->ops->sendmsg means calling tcp_sendmsg().
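The mechanism is a plain function-pointer dispatch. The userspace miniature below (all names invented for illustration) mirrors the idea: the ops table is chosen once when the socket is created, and every later call goes through it without the caller knowing which protocol implementation sits behind it.
#include <stdio.h>
#include <stddef.h>
struct msg { const char *buf; size_t len; };
/* Stand-in for struct proto_ops: a table of function pointers. */
struct demo_proto_ops {
    const char *name;
    int (*sendmsg)(struct msg *m);
};
static int demo_tcp_sendmsg(struct msg *m)
{
    printf("tcp: sending %zu bytes\n", m->len);
    return (int)m->len;
}
/* Stand-in for inet_stream_ops, installed once at init time. */
static const struct demo_proto_ops demo_inet_stream_ops = {
    .name    = "inet_stream",
    .sendmsg = demo_tcp_sendmsg,
};
/* Stand-in for struct socket, which only keeps a pointer to its ops table. */
struct demo_socket { const struct demo_proto_ops *ops; };
int main(void)
{
    struct demo_socket sock = { .ops = &demo_inet_stream_ops }; /* socket() time */
    struct msg m = { "Hello!\n", 7 };
    return sock.ops->sendmsg(&m) == 7 ? 0 : 1;                  /* send() time */
}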
2.2 Transport layer
tcp_sendmsg() looks like this (abridged):
int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size)
{
.......
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
tcp_push_one(sk, mss_now);
continue;
......
}
In this function the user data is first copied into skbs on the socket's (sk) send queue; the queued segments are then pushed out via __tcp_push_pending_frames() (or tcp_push_one() for a single segment). Inside __tcp_push_pending_frames():
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle)
{
struct sk_buff *skb = tcp_send_head(sk);
if (skb) {
if (tcp_write_xmit(sk, cur_mss, nonagle))
tcp_check_probe_timer(sk);
}
}
When there is an skb at the head of the send queue, tcp_write_xmit() is called to transmit it:
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
{
......
if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
break;
......
}
which ultimately calls tcp_transmit_skb() for each segment. Only the key part of tcp_transmit_skb() is shown:
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
......
err = icsk->icsk_af_ops->queue_xmit(skb, 0);
......
}
tcp_transmit_skb() builds the TCP header (filling in the flag bits such as SYN) and then calls icsk->icsk_af_ops->queue_xmit to perform the actual transmission; for IPv4 this function pointer points to ip_queue_xmit().
2.3 Network layer
From tcp_transmit_skb() in tcp_output.c we enter the network layer at ip_queue_xmit() in ip_output.c. ip_queue_xmit() looks like this:
int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
......
return ip_local_out(skb);
......
}
ip_queue_xmit() first initializes the IP datagram (route lookup, IP header) and finally calls ip_local_out() to send it. ip_local_out():
int ip_local_out(struct sk_buff *skb)
{
int err;
err = __ip_local_out(skb);
if (likely(err == 1))
err = dst_output(skb);
return err;
}
ip_local_out() calls dst_output():
static inline int dst_output(struct sk_buff *skb)
{
return skb->dst->output(skb);
}
dst_output() calls through skb->dst->output, a function pointer which for outgoing packets points to ip_output():
int ip_output(struct sk_buff *skb)
{
struct net_device *dev = skb->dst->dev;
IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
The NF_HOOK_COND macro is defined as:
#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) \
({int __ret; \
if ((__ret=nf_hook_thresh(pf, hook, (skb), indev, outdev, okfn, INT_MIN, cond)) == 1)\
__ret = (okfn)(skb); \
__ret;})
This means that the NF_HOOK_COND call above runs the NF_INET_POST_ROUTING netfilter hooks (unless the skb is marked IPSKB_REROUTED) and then, if the packet is accepted, executes ip_finish_output().
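For completeness, this is roughly how a module would attach its own function at that same NF_INET_POST_ROUTING point on a 2.6-era kernel (a hedged sketch; the names and the priority are arbitrary). The verdict returned by such hooks decides whether okfn, here ip_finish_output, ever runs.
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>
static unsigned int demo_hook(unsigned int hooknum, struct sk_buff *skb,
                              const struct net_device *in,
                              const struct net_device *out,
                              int (*okfn)(struct sk_buff *))
{
    /* Let every packet through; returning NF_DROP here would discard it
     * before ip_finish_output is ever called. */
    return NF_ACCEPT;
}
static struct nf_hook_ops demo_ops = {
    .hook     = demo_hook,
    .pf       = PF_INET,
    .hooknum  = NF_INET_POST_ROUTING,
    .priority = NF_IP_PRI_FIRST,
};
static int __init demo_init(void) { return nf_register_hook(&demo_ops); }
static void __exit demo_exit(void) { nf_unregister_hook(&demo_ops); }
module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");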
ip_finish_output():
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb->dst->xfrm != NULL) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
#endif
if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
return ip_fragment(skb, ip_finish_output2);
else
return ip_finish_output2(skb);
}
This function chooses between ip_fragment() and ip_finish_output2() based on the packet length versus the MTU. ip_fragment() splits the datagram into fragments before sending, but each fragment still goes through ip_finish_output2(), so we go straight to ip_finish_output2().
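As a rough illustration of the arithmetic ip_fragment() has to do, here is a userspace back-of-the-envelope sketch (the MTU and payload sizes are assumptions; the real code also copies IP options into the fragments and stores offsets in 8-byte units in the frag_off field):
#include <stdio.h>
int main(void)
{
    unsigned int mtu     = 1500;   /* assumed link MTU */
    unsigned int ip_hdr  = 20;     /* IPv4 header without options */
    unsigned int payload = 4000;   /* bytes handed down by the transport layer */
    /* Data per fragment must be a multiple of 8 (except in the last fragment). */
    unsigned int per_frag = (mtu - ip_hdr) & ~7u;
    unsigned int nfrags   = (payload + per_frag - 1) / per_frag;
    unsigned int offset   = 0, i;
    for (i = 0; i < nfrags; i++) {
        unsigned int len = payload - offset < per_frag ? payload - offset : per_frag;
        printf("frag %u: byte offset %u, %u data bytes, MF=%d\n",
               i, offset, len, i + 1 < nfrags);
        offset += len;
    }
    return 0;
}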
ip_finish_output2() is:
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
if (rt->rt_type == RTN_MULTICAST)
IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
else if (rt->rt_type == RTN_BROADCAST)
IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
if (skb2 == NULL) {
kfree_skb(skb);
return -ENOMEM;
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
kfree_skb(skb);
skb = skb2;
}
if (dst->hh)
return neigh_hh_output(dst->hh, skb);
else if (dst->neighbour)
return dst->neighbour->output(skb);
if (net_ratelimit())
printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
kfree_skb(skb);
return -EINVAL;
}
This function checks that the skb has enough headroom for the link-layer header, reallocating the buffer if it does not. It then calls the dst->neighbour->output function pointer, the output routine of the neighbour subsystem, which in practice points to neigh_resolve_output(). The neighbour subsystem maintains the mapping from IP addresses to MAC addresses (for IPv4 this is ARP).
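The result of that resolution can be inspected from userspace. The sketch below asks the kernel's ARP cache for an entry via the SIOCGARP ioctl (the neighbour address 192.168.0.1 and the interface name eth0 are placeholders):
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if_arp.h>       /* struct arpreq, ATF_COM */
#include <linux/sockios.h>    /* SIOCGARP */
int main(int argc, char **argv)
{
    const char *ip  = argc > 1 ? argv[1] : "192.168.0.1"; /* placeholder neighbour */
    const char *dev = argc > 2 ? argv[2] : "eth0";        /* placeholder interface */
    struct arpreq req;
    struct sockaddr_in *sin = (struct sockaddr_in *)&req.arp_pa;
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd == -1) {
        perror("socket");
        return 1;
    }
    memset(&req, 0, sizeof(req));
    sin->sin_family = AF_INET;
    inet_pton(AF_INET, ip, &sin->sin_addr);
    strncpy(req.arp_dev, dev, sizeof(req.arp_dev) - 1);
    if (ioctl(fd, SIOCGARP, &req) == 0 && (req.arp_flags & ATF_COM)) {
        const unsigned char *ha = (const unsigned char *)req.arp_ha.sa_data;
        printf("%s is at %02x:%02x:%02x:%02x:%02x:%02x on %s\n",
               ip, ha[0], ha[1], ha[2], ha[3], ha[4], ha[5], dev);
    } else {
        perror("SIOCGARP");
    }
    close(fd);
    return 0;
}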
Turning to neigh_resolve_output():
int neigh_resolve_output(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
struct neighbour *neigh;
int rc = 0;
if (!dst || !(neigh = dst->neighbour))
goto discard;
__skb_pull(skb, skb_network_offset(skb));
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
if (dev->header_ops->cache && !dst->hh) {
write_lock_bh(&neigh->lock);
if (!dst->hh)
neigh_hh_init(neigh, dst, dst->ops->protocol);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
write_unlock_bh(&neigh->lock);
} else {
read_lock_bh(&neigh->lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
read_unlock_bh(&neigh->lock);
}
if (err >= 0)
rc = neigh->ops->queue_xmit(skb);
else
goto out_kfree_skb;
}
out:
return rc;
discard:
NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
dst, dst ? dst->neighbour : NULL);
out_kfree_skb:
rc = -EINVAL;
kfree_skb(skb);
goto out;
}
At rc = neigh->ops->queue_xmit(skb); the function calls through the neighbour ops table; the function actually invoked is dev_queue_xmit().
2.4 Data link layer
The neighbour code hands the skb to the device-layer entry point dev_queue_xmit():
int dev_queue_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct Qdisc *q;
int rc = -ENOMEM;
if (netif_needs_gso(dev, skb))
goto gso;
if (skb_shinfo(skb)->frag_list &&
!(dev->features & NETIF_F_FRAGLIST) &&
__skb_linearize(skb))
goto out_kfree_skb;
if (skb_shinfo(skb)->nr_frags &&
(!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
__skb_linearize(skb))
goto out_kfree_skb;
if (skb->ip_summed == CHECKSUM_PARTIAL) {
skb_set_transport_header(skb, skb->csum_start -
skb_headroom(skb));
if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
goto out_kfree_skb;
}
gso:
spin_lock_prefetch(&dev->queue_lock);
rcu_read_lock_bh();
q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
if (q->enqueue) {
/* Grab device queue */
spin_lock(&dev->queue_lock);
q = dev->qdisc;
if (q->enqueue) {
/* reset queue_mapping to zero */
skb_set_queue_mapping(skb, 0);
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock(&dev->queue_lock);
rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
goto out;
}
spin_unlock(&dev->queue_lock);
}
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
if (dev->xmit_lock_owner != cpu) {
HARD_TX_LOCK(dev, cpu);
if (!netif_queue_stopped(dev) &&
!netif_subqueue_stopped(dev, skb)) {
rc = 0;
if (!dev_hard_start_xmit(skb, dev)) {
HARD_TX_UNLOCK(dev);
goto out;
}
}
HARD_TX_UNLOCK(dev);
if (net_ratelimit())
printk(KERN_CRIT "Virtual device %s asks to "
"queue packet!\n", dev->name);
} else {
if (net_ratelimit())
printk(KERN_CRIT "Dead loop on virtual device "
"%s, fix it urgently!\n", dev->name);
}
}
rc = -ENETDOWN;
rcu_read_unlock_bh();
out_kfree_skb:
kfree_skb(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
From here the packet reaches dev_hard_start_xmit(), either directly (for devices without a queue) or through the qdisc via qdisc_run(). dev_hard_start_xmit() is:
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
if (likely(!skb->next)) {
if (!list_empty(&ptype_all))
dev_queue_xmit_nit(skb, dev);
if (netif_needs_gso(dev, skb)) {
if (unlikely(dev_gso_segment(skb)))
goto out_kfree_skb;
if (skb->next)
goto gso;
}
return dev->hard_start_xmit(skb, dev);
}
gso:
do {
struct sk_buff *nskb = skb->next;
int rc;
skb->next = nskb->next;
nskb->next = NULL;
rc = dev->hard_start_xmit(nskb, dev);
if (unlikely(rc)) {
nskb->next = skb->next;
skb->next = nskb;
return rc;
}
if (unlikely((netif_queue_stopped(dev) ||
netif_subqueue_stopped(dev, skb)) &&
skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
kfree_skb(skb);
return 0;
}
This function calls dev->hard_start_xmit, the transmit routine installed by the NIC driver. Each driver provides its own implementation; for the RTL8169 driver, dev->hard_start_xmit points to rtl8169_start_xmit():
static int rtl8169_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct rtl8169_private *tp = netdev_priv(dev);
unsigned int frags, entry = tp->cur_tx % NUM_TX_DESC;
struct TxDesc *txd = tp->TxDescArray + entry;
void __iomem *ioaddr = tp->mmio_addr;
dma_addr_t mapping;
u32 status, len;
u32 opts1;
int ret = NETDEV_TX_OK;
if (unlikely(TX_BUFFS_AVAIL(tp) < skb_shinfo(skb)->nr_frags)) {
if (netif_msg_drv(tp)) {
printk(KERN_ERR
"%s: BUG! Tx Ring full when queue awake!\n",
dev->name);
}
goto err_stop;
}
if (unlikely(le32_to_cpu(txd->opts1) & DescOwn))
goto err_stop;
opts1 = DescOwn | rtl8169_tso_csum(skb, dev);
frags = rtl8169_xmit_frags(tp, skb, opts1);
if (frags) {
len = skb_headlen(skb);
opts1 |= FirstFrag;
} else {
len = skb->len;
if (unlikely(len < ETH_ZLEN)) {
if (skb_padto(skb, ETH_ZLEN))
goto err_update_stats;
len = ETH_ZLEN;
}
opts1 |= FirstFrag | LastFrag;
tp->tx_skb[entry].skb = skb;
}
mapping = pci_map_single(tp->pci_dev, skb->data, len, PCI_DMA_TODEVICE);
tp->tx_skb[entry].len = len;
txd->addr = cpu_to_le64(mapping);
txd->opts2 = cpu_to_le32(rtl8169_tx_vlan_tag(tp, skb));
wmb();
/* anti gcc 2.95.3 bugware (sic) */
status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
txd->opts1 = cpu_to_le32(status);
dev->trans_start = jiffies;
tp->cur_tx += frags + 1;
smp_wmb();
RTL_W8(TxPoll, NPQ); /* set polling bit */
if (TX_BUFFS_AVAIL(tp) < MAX_SKB_FRAGS) {
netif_stop_queue(dev);
smp_rmb();
if (TX_BUFFS_AVAIL(tp) >= MAX_SKB_FRAGS)
netif_wake_queue(dev);
}
out:
return ret;
err_stop:
netif_stop_queue(dev);
ret = NETDEV_TX_BUSY;
err_update_stats:
dev->stats.tx_dropped++;
goto out;
}
This function maps the packet for DMA, writes it into a transmit descriptor, and then pokes the hardware (the TxPoll register write) to send it. This concludes the transmit path inside the kernel.
2.5 Debugging the send path
With a breakpoint set at the driver transmit hook (dev->hard_start_xmit), this time on a Linux 5.4.34 kernel, the call stack looks as follows:
The send path enters the kernel through the system call entry entry_SYSCALL_64, goes through sys_sendto, then the transport layer's tcp_sendmsg, the network layer's ip_output, and the link layer's dev_queue_xmit, before reaching the NIC driver's transmit function; the frame is finally transmitted by the physical layer.
3 The recv path
Reception again traverses the five layers. The physical layer is not analysed; we start at the data link layer, i.e. at the point where the frame arrives at the NIC.
3.1 Data link layer
Whenever the NIC receives data it raises an interrupt and the driver's interrupt handler runs; for the cs8900, for example, the handler is net_interrupt():
static irqreturn_t net_interrupt(int irq, void *dev_id)
{
......
while ((status = ioread16(lp->virt_addr + ISQ_PORT))) {
cs89_dbg(4, debug, "%s: event=%04x\n", dev->name, status);
handled = 1;
switch (status & ISQ_EVENT_MASK) {
case ISQ_RECEIVER_EVENT:
/* Got a packet(s). */
net_rx(dev);
break;
case ISQ_TRANSMITTER_EVENT:
dev->stats.tx_packets++;
netif_wake_queue(dev); /* Inform upper layers. */
......
}
The while loop reads the interrupt status register and, for receive events, calls net_rx() to process the incoming data. net_rx() is:
static void
net_rx(struct net_device *dev)
{
struct net_local *lp = netdev_priv(dev);
struct sk_buff *skb;
int status, length;
status = ioread16(lp->virt_addr + RX_FRAME_PORT);
length = ioread16(lp->virt_addr + RX_FRAME_PORT);
if ((status & RX_OK) == 0) {
count_rx_errors(status, dev);
return;
}
/* Malloc up new buffer. */
skb = netdev_alloc_skb(dev, length + 2);
if (skb == NULL) {
dev->stats.rx_dropped++;
return;
}
skb_reserve(skb, 2); /* longword align L3 header */
readwords(lp, RX_FRAME_PORT, skb_put(skb, length), length >> 1);
if (length & 1)
skb->data[length-1] = ioread16(lp->virt_addr + RX_FRAME_PORT);
cs89_dbg(3, debug, "%s: received %d byte packet of type %x\n",
dev->name, length,
(skb->data[ETH_ALEN + ETH_ALEN] << 8) |
skb->data[ETH_ALEN + ETH_ALEN + 1]);
skb->protocol = eth_type_trans(skb, dev);
netif_rx(skb);
dev->stats.rx_packets++;
dev->stats.rx_bytes += length;
}
It allocates an skb, copies the frame out of the device buffer into it, and then hands it to netif_rx() for further processing. netif_rx():
int netif_rx(struct sk_buff *skb)
{
......
if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
if (queue->input_pkt_queue.qlen) {
enqueue:
dev_hold(skb->dev);
__skb_queue_tail(&queue->input_pkt_queue, skb);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
napi_schedule(&queue->backlog);
goto enqueue;
}
......
}
The skb is appended to the per-CPU input queue with __skb_queue_tail(); if the queue was empty, napi_schedule() is called first so that the backlog NAPI structure gets scheduled. That structure is what the softirq, the bottom half of the interrupt, later polls.
Inside napi_schedule():
static inline void napi_schedule(struct napi_struct *n)
{
if (napi_schedule_prep(n))
__napi_schedule(n);
}
which calls __napi_schedule(). Inside __napi_schedule():
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
local_irq_restore(flags);
}
This disables local interrupts, adds the NAPI structure to the tail of the per-CPU poll list, raises the NET_RX_SOFTIRQ software interrupt, and then restores interrupts.
The softirq entry point is do_softirq(), which eventually runs the network receive softirq handler net_rx_action():
static void net_rx_action(struct softirq_action *h)
{
......
if (test_bit(NAPI_STATE_SCHED, &n->state))
work = n->poll(n, weight);
......
}
net_rx_action() calls n->poll() on each scheduled NAPI structure to process its packets. For the backlog napi_struct, the poll function is set to process_backlog() in net_dev_init(), so net_rx_action() ends up calling process_backlog() to handle the queued packets. process_backlog() is:
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
struct softnet_data *queue = &__get_cpu_var(softnet_data);
unsigned long start_time = jiffies;
napi->weight = weight_p;
do {
struct sk_buff *skb;
struct net_device *dev;
local_irq_disable();
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
__napi_complete(napi);
local_irq_enable();
break;
}
local_irq_enable();
dev = skb->dev;
netif_receive_skb(skb);
dev_put(dev);
} while (++work < quota && jiffies == start_time);
return work;
}
The loop keeps dequeuing skbs from the input queue and passes each one up the stack with netif_receive_skb(). Turning to netif_receive_skb():
int netif_receive_skb(struct sk_buff *skb)
{
......
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(!ptype->dev || ptype->dev == skb->dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
......
}
For each matching protocol handler it calls deliver_skb() to deliver the packet. deliver_skb():
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
atomic_inc(&skb->users);
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
deliver_skb() calls through the pt_prev->func function pointer; for IPv4 packets this points to ip_rcv().
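ip_rcv() ends up in this dispatch because inet_init() registers a struct packet_type for ETH_P_IP with dev_add_pack(). The hedged module sketch below (2.6-era API, invented names) registers an extra handler the same way; once loaded it would be offered every IPv4 frame that netif_receive_skb() delivers, alongside ip_rcv():
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>
static int demo_ipv4_rcv(struct sk_buff *skb, struct net_device *dev,
                         struct packet_type *pt, struct net_device *orig_dev)
{
    printk(KERN_DEBUG "demo: %u-byte IPv4 frame on %s\n", skb->len, dev->name);
    kfree_skb(skb);      /* release the reference deliver_skb() took for us */
    return 0;
}
static struct packet_type demo_pt = {
    .type = __constant_htons(ETH_P_IP),
    .func = demo_ipv4_rcv,
};
static int __init demo_init(void)
{
    dev_add_pack(&demo_pt);
    return 0;
}
static void __exit demo_exit(void)
{
    dev_remove_pack(&demo_pt);
}
module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");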
3.2 Network layer
The first function called on the network layer's receive side is ip_rcv():
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct iphdr *iph;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
goto out;
}
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
/*
* RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto inhdr_error;
len = ntohs(iph->tot_len);
if (skb->len < len) {
IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
goto drop;
}
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
inhdr_error:
IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}
ip_rcv() first checks the packet type: a frame addressed to another host (PACKET_OTHERHOST, which can show up when the interface is in promiscuous mode) jumps to drop and is discarded. skb_share_check() makes sure we own a private copy if the skb is shared. pskb_may_pull() guarantees that the whole IP header is available in the linear part of the skb, after which the header is validated (version, header length, checksum, total length).
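The checksum test mentioned in the comments (ip_fast_csum()) is the standard RFC 1071 one's-complement sum over the header. A plain C sketch of the same computation, run on a made-up sample header:
#include <stdint.h>
#include <stdio.h>
/* One's-complement sum over an IPv4 header of ihl 32-bit words. Computed over
 * a header whose checksum field is already filled in, the result is 0, which
 * is exactly the property ip_rcv() checks. */
static uint16_t ip_checksum(const uint8_t *hdr, unsigned int ihl)
{
    uint32_t sum = 0;
    unsigned int i;
    for (i = 0; i < ihl * 4; i += 2)
        sum += (uint32_t)hdr[i] << 8 | hdr[i + 1];   /* 16-bit words */
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);          /* fold the carries */
    return (uint16_t)~sum;
}
int main(void)
{
    /* Sample 20-byte header with the checksum bytes (offsets 10-11) zeroed. */
    uint8_t hdr[20] = { 0x45, 0x00, 0x00, 0x54, 0x1c, 0x46, 0x40, 0x00,
                        0x40, 0x06, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
                        0xc0, 0xa8, 0x00, 0xc7 };
    uint16_t csum = ip_checksum(hdr, 5);
    hdr[10] = csum >> 8;                      /* fill the field in ...        */
    hdr[11] = csum & 0xff;
    printf("%u\n", ip_checksum(hdr, 5));      /* ... and it now verifies to 0 */
    return 0;
}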
When netfilter is not configured, the NF_HOOK macro reduces to
#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
so the call
return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);
invokes ip_rcv_finish(skb) (with netfilter enabled it does so only after the NF_INET_PRE_ROUTING hooks accept the packet). Turning to ip_rcv_finish():
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb->dst == NULL) {
int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
skb->dev);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
goto drop;
}
}
#ifdef CONFIG_NET_CLS_ROUTE
if (unlikely(skb->dst->tclassid)) {
struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
u32 idx = skb->dst->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes+=skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes+=skb->len;
}
#endif
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;
rt = skb->rtable;
if (rt->rt_type == RTN_MULTICAST)
IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
else if (rt->rt_type == RTN_BROADCAST)
IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
return dst_input(skb);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
This function first calls ip_route_input() to look up the routing table entry for the packet and then calls dst_input():
static inline int dst_input(struct sk_buff *skb)
{
int err;
for (;;) {
err = skb->dst->input(skb);
if (likely(err == 0))
return err;
/* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
if (unlikely(err != NET_XMIT_BYPASS))
return err;
}
}
dst_input() invokes the route's input handler; for packets destined to this host, ip_route_input_slow() sets it to ip_local_deliver(). Next, ip_local_deliver():
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
This function first reassembles IP fragments if needed and then calls ip_local_deliver_finish() through the NF_HOOK macro (NF_INET_LOCAL_IN):
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
__skb_pull(skb, ip_hdrlen(skb));
/* Point into the IP datagram, just past the header. */
skb_reset_transport_header(skb);
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
int hash, raw;
struct net_protocol *ipprot;
resubmit:
raw = raw_local_deliver(skb, protocol);
hash = protocol & (MAX_INET_PROTOS - 1);
ipprot = rcu_dereference(inet_protos[hash]);
if (ipprot != NULL && (net == &init_net || ipprot->netns_ok)) {
int ret;
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
ret = ipprot->handler(skb);
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
} else
IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
kfree_skb(skb);
}
}
out:
rcu_read_unlock();
return 0;
}
ip_local_deliver_finish() pulls off the IP header and points the transport-layer header at the TCP header, then dispatches the packet via the ipprot->handler function pointer of the registered transport protocol; for TCP this handler is tcp_v4_rcv().
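tcp_v4_rcv() sits in that inet_protos[] table because inet_init() registers it with inet_add_protocol(&tcp_protocol, IPPROTO_TCP). The sketch below (hedged: 2.6-era API, invented names) registers a handler the same way for IP protocol number 253, which is reserved for experimentation:
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/protocol.h>
#define DEMO_IPPROTO 253   /* experimental IP protocol number */
static int demo_handler(struct sk_buff *skb)
{
    printk(KERN_DEBUG "demo: protocol %d packet, %u bytes\n",
           DEMO_IPPROTO, skb->len);
    kfree_skb(skb);
    return 0;
}
static struct net_protocol demo_protocol = {
    .handler   = demo_handler,
    .no_policy = 1,
};
static int __init demo_init(void)
{
    return inet_add_protocol(&demo_protocol, DEMO_IPPROTO);
}
static void __exit demo_exit(void)
{
    inet_del_protocol(&demo_protocol, DEMO_IPPROTO);
}
module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");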
3.3 Transport layer
From the network layer, ip_local_deliver_finish() calls into the first transport-layer receive function, tcp_v4_rcv().
tcp_v4_rcv() is too long to quote in full; its key step is to try to put the packet on the prequeue with tcp_prequeue(), falling back to tcp_v4_do_rcv() otherwise:
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
tcp_prequeue() is:
static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!sysctl_tcp_low_latency && tp->ucopy.task) {
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
if (tp->ucopy.memory > sk->sk_rcvbuf) {
struct sk_buff *skb1;
BUG_ON(sock_owned_by_user(sk));
while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
sk->sk_backlog_rcv(sk, skb1);
NET_INC_STATS_BH(LINUX_MIB_TCPPREQUEUEDROPPED);
}
tp->ucopy.memory = 0;
} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
wake_up_interruptible(sk->sk_sleep);
if (!inet_csk_ack_scheduled(sk))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
(3 * TCP_RTO_MIN) / 4,
TCP_RTO_MAX);
}
return 1;
}
return 0;
}
When the prequeue grows beyond the socket's receive buffer, the queued packets are drained through the backlog receive handler sk->sk_backlog_rcv, which for TCP points to tcp_v4_do_rcv() (the same function tcp_v4_rcv() falls back to directly). Turning to tcp_v4_do_rcv():
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
.......
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
TCP_CHECK_TIMER(sk);
return 0;
}
.......
}
In the established state it calls tcp_rcv_established() to process the segment. Turning to tcp_rcv_established():
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
...
if (eaten)
__kfree_skb(skb);
else
sk->sk_data_ready(sk, 0);
return 0;
......
}
The function copies the data (or queues it on the receive queue) and, once that is done, calls sk->sk_data_ready to wake up the server process so it can read the data; this pointer actually refers to sock_def_readable().
3.4 Application layer
The server process has just been woken up by tcp_rcv_established() in the transport layer. On the application side we pick the story up at the recv() call in server.c.
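For reference, a minimal (hypothetical) server.c around that recv() call, matching the client sketch in section 2.1 (port 8888 is again an assumption):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
int main(void)
{
    struct sockaddr_in addr;
    char buf[128];
    ssize_t n;
    int cfd, lfd = socket(AF_INET, SOCK_STREAM, 0);
    if (lfd == -1) {
        perror("socket");
        exit(1);
    }
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8888);                 /* must match the client */
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) == -1 || listen(lfd, 5) == -1) {
        perror("bind/listen");
        exit(1);
    }
    cfd = accept(lfd, NULL, NULL);
    if (cfd == -1) {
        perror("accept");
        exit(1);
    }
    if ((n = recv(cfd, buf, sizeof(buf) - 1, 0)) > 0) {   /* the call we trace */
        buf[n] = '\0';
        printf("received: %s", buf);
    }
    close(cfd);
    close(lfd);
    return 0;
}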
Like send(), recv() enters the kernel through a system call; the system call number leads to sys_recv(), which forwards to sys_recvfrom(), and from there sock_recvmsg() and __sock_recvmsg() are reached.
Turning to __sock_recvmsg():
static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, int flags)
{
int err;
struct sock_iocb *si = kiocb_to_siocb(iocb);
si->sock = sock;
si->scm = NULL;
si->msg = msg;
si->size = size;
si->flags = flags;
err = security_socket_recvmsg(sock, msg, size, flags);
if (err)
return err;
return sock->ops->recvmsg(iocb, sock, msg, size, flags);
}
This function calls sock->ops->recvmsg, which for TCP sockets points to sock_common_recvmsg():
int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, int flags)
{
struct sock *sk = sock->sk;
int addr_len = 0;
int err;
err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
}
sock_common_recvmsg() in turn calls sk->sk_prot->recvmsg, which points to tcp_recvmsg() and completes the reception. tcp_recvmsg() is long; in essence it loops over the socket's receive queue and copies the data of each packet into the user buffer. This is how the server's recv() call obtains the data that was placed on the receive queue.
4 Sequence analysis
Putting the code above together yields the call sequence shown below:
The whole flow consists of three stages:
- Sending: from the system call down to handing the data to the NIC for transmission.
- Receiving: from the hardware interrupt raised by the NIC, through the interrupt handler and the softirq, until the data is placed on the transport layer's receive queue.
- Fetching the data: the receiving application calls recv(), which goes through a system call and copies the data out of the receive queue.