Today I plan to spend the day at the hospital keeping my parents company while they see the doctor; fortunately their condition is stable for now. A lifetime of hard labor, only to end up worn down by illness — perhaps that is the fate of most Chinese farmers.
Cherish the days spent with your parents. Listen to my father tell the stories of the years he spent building roads and bridges. And while I'm at it, write a blog post.
1. The NIC raises an interrupt, and the driver's interrupt/NAPI poll path performs packet reception; take the igb driver as an example.
It normally calls igb_clean_rx_irq, which copies the packet and performs the packet split. Its main logic is:
/* This memory barrier is needed to keep us from reading
 * any other fields out of the rx_desc until we know the
 * RXD_STAT_DD bit is set
 */
rmb();

/* retrieve a buffer from the ring */
skb = igb_fetch_rx_buffer(rx_ring, rx_desc, skb);
        /* inside igb_fetch_rx_buffer: allocate a skb to store the frags */
        --> skb = netdev_alloc_skb_ip_align(rx_ring->netdev, IGB_RX_HDR_LEN);

/* exit if we failed to retrieve a buffer */
if (!skb)
    break;
...
napi_gro_receive(...)
The first memory allocation: __netdev_alloc_skb allocates the sk_buff as well as the data area.
/**
 *  __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *  @dev: network device to receive on
 *  @len: length to allocate
 *  @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *  Allocate a new &sk_buff and assign it a usage count of one. The
 *  buffer has NET_SKB_PAD headroom built in. Users should allocate
 *  the headroom they think they need without accounting for the
 *  built in space. The built in space is used for optimisations.
 *
 *  %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
                                   gfp_t gfp_mask)
{
    struct page_frag_cache *nc;
    unsigned long flags;
    struct sk_buff *skb;
    bool pfmemalloc;
    void *data;

    len += NET_SKB_PAD;

    if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
        (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
        skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
        if (!skb)
            goto skb_fail;
        goto skb_success;
    }

    len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
    len = SKB_DATA_ALIGN(len);

    if (sk_memalloc_socks())
        gfp_mask |= __GFP_MEMALLOC;

    local_irq_save(flags);

    nc = this_cpu_ptr(&netdev_alloc_cache);
    data = __alloc_page_frag(nc, len, gfp_mask);
    pfmemalloc = nc->pfmemalloc;

    local_irq_restore(flags);

    if (unlikely(!data))
        return NULL;

    skb = __build_skb(data, len);
    if (unlikely(!skb)) {
        skb_free_frag(data);
        return NULL;
    }

    /* use OR instead of assignment to avoid clearing of bits in mask */
    if (pfmemalloc)
        skb->pfmemalloc = 1;
    skb->head_frag = 1;

skb_success:
    skb_reserve(skb, NET_SKB_PAD);
    skb->dev = dev;

skb_fail:
    return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
From the code we can see:
1. When allocating the data area, the code first checks whether len exceeds a page:
- if len is larger than SKB_WITH_OVERHEAD(PAGE_SIZE), or GFP_DMA / __GFP_DIRECT_RECLAIM is requested, it falls back to __alloc_skb, whose data buffer comes from kmalloc (the slab allocator);
- if len fits in a page and there is no DMA constraint, the data is carved out of the per-CPU netdev_alloc_cache page-fragment cache, which is backed directly by the page allocator.
2. The sk_buff descriptor itself is allocated from a slab cache (skbuff_head_cache, inside __build_skb).
Why is local_irq_save taken before the data allocation — because we are running in softirq, or because of the per-CPU variable? Most likely the latter: the per-CPU netdev_alloc_cache can also be used from hard-IRQ context, so local interrupts are disabled to keep an interrupt on the same CPU from touching the fragment cache mid-update.
3. skb_reserve(skb, NET_SKB_PAD) reserves NET_SKB_PAD bytes of headroom in the data area for protocol headers.
That is, the buffer layout becomes: head __reserved headroom__ data/tail ______ end (end = tail + size); see the pointer-arithmetic sketch below.
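To make that pointer arithmetic concrete, here is a minimal user-space sketch. It is not kernel code: the struct, the buffer size and the pad value are illustrative stand-ins that only mimic how __build_skb leaves the four pointers and how skb_reserve moves them.

#include <stdio.h>
#include <stdlib.h>

/* Simplified user-space model of the four skb buffer pointers.
 * This is NOT the kernel struct sk_buff; it only mimics the layout logic. */
struct toy_skb {
    unsigned char *head;    /* start of the allocated buffer */
    unsigned char *data;    /* start of the packet data      */
    unsigned char *tail;    /* end of the packet data        */
    unsigned char *end;     /* end of the allocated buffer   */
};

/* Mirrors skb_reserve(): move data and tail forward, creating headroom. */
static void toy_skb_reserve(struct toy_skb *skb, int len)
{
    skb->data += len;
    skb->tail += len;
}

int main(void)
{
    enum { SIZE = 2048, TOY_NET_SKB_PAD = 64 };  /* illustrative values */
    unsigned char *buf = malloc(SIZE);
    struct toy_skb skb = { buf, buf, buf, buf + SIZE }; /* as __build_skb leaves it */

    toy_skb_reserve(&skb, TOY_NET_SKB_PAD);

    printf("headroom = %ld, data len = %ld, tailroom = %ld\n",
           (long)(skb.data - skb.head),   /* 64   */
           (long)(skb.tail - skb.data),   /* 0    */
           (long)(skb.end - skb.tail));   /* 1984 */
    free(buf);
    return 0;
}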
The skb itself is initialized as follows:
/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc()
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 * Before IO, driver allocates only data buffer where NIC put incoming frame
 * Driver should add room at head (NET_SKB_PAD) and
 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 * After IO, driver calls build_skb(), to allocate sk_buff and populate it
 * before giving packet to stack.
 * RX rings only contains data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
    struct skb_shared_info *shinfo;
    struct sk_buff *skb;
    unsigned int size = frag_size ? : ksize(data);

    skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
    if (!skb)
        return NULL;

    size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

    memset(skb, 0, offsetof(struct sk_buff, tail));
    skb->truesize = SKB_TRUESIZE(size);
    atomic_set(&skb->users, 1);
    skb->head = data;
    skb->data = data;
    skb_reset_tail_pointer(skb);    /* skb->tail = skb->data; */
    skb->end = skb->tail + size;
    skb->mac_header = (typeof(skb->mac_header))~0U;
    skb->transport_header = (typeof(skb->transport_header))~0U;

    /* make sure we initialize shinfo sequentially */
    shinfo = skb_shinfo(skb);
    memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
    atomic_set(&shinfo->dataref, 1);
    kmemcheck_annotate_variable(shinfo->destructor_arg);

    return skb;
}
For the initialization of the remaining skb fields (protocol, pkt_type, etc. — the RX checksum state itself is filled in by the driver), see eth_type_trans:
/**
 * eth_type_trans - determine the packet's protocol ID.
 * @skb: received socket data
 * @dev: receiving network device
 *
 * The rule here is that we
 * assume 802.3 if the type field is short enough to be a length.
 * This is normal practice and works for any 'now in use' protocol.
 */
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
    unsigned short _service_access_point;
    const unsigned short *sap;
    const struct ethhdr *eth;

    skb->dev = dev;
    skb_reset_mac_header(skb);

    eth = (struct ethhdr *)skb->data;
    skb_pull_inline(skb, ETH_HLEN);

    if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
        if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
            skb->pkt_type = PACKET_BROADCAST;
        else
            skb->pkt_type = PACKET_MULTICAST;
    } else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
                                                 dev->dev_addr)))
        skb->pkt_type = PACKET_OTHERHOST;

    /*
     * Some variants of DSA tagging don't have an ethertype field
     * at all, so we check here whether one of those tagging
     * variants has been configured on the receiving interface,
     * and if so, set skb->protocol without looking at the packet.
     */
    if (unlikely(netdev_uses_dsa(dev)))
        return htons(ETH_P_XDSA);

    if (likely(eth_proto_is_802_3(eth->h_proto)))
        return eth->h_proto;

    /*
     * This is a magic hack to spot IPX packets. Older Novell breaks
     * the protocol design and runs IPX over 802.3 without an 802.2 LLC
     * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
     * won't work for fault tolerant netware but does for the rest.
     */
    sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point);
    if (sap && *sap == 0xFFFF)
        return htons(ETH_P_802_3);

    /*
     * Real 802.2 LLC
     */
    return htons(ETH_P_802_2);
}
EXPORT_SYMBOL(eth_type_trans);
It resets the MAC header and assigns skb->protocol, skb->dev, skb->pkt_type and so on.
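The pkt_type decision is driven purely by the destination MAC address. Below is a hedged user-space sketch of that classification logic; the enum values and function names are illustrative, not the kernel's PACKET_* constants or helpers.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* User-space sketch of the pkt_type decision made in eth_type_trans():
 * broadcast / multicast / host / otherhost, judged only by the destination MAC. */
enum { PKT_HOST, PKT_BROADCAST, PKT_MULTICAST, PKT_OTHERHOST };

static int classify(const uint8_t dst[6], const uint8_t dev_addr[6])
{
    static const uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

    if (dst[0] & 0x01)                       /* group (multicast) bit set */
        return memcmp(dst, bcast, 6) == 0 ? PKT_BROADCAST : PKT_MULTICAST;
    if (memcmp(dst, dev_addr, 6) != 0)       /* unicast, but not for us */
        return PKT_OTHERHOST;
    return PKT_HOST;
}

int main(void)
{
    uint8_t dev[6]  = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };
    uint8_t dst1[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
    uint8_t dst2[6] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };

    printf("%d %d\n", classify(dst1, dev), classify(dst2, dev)); /* 1 0 */
    return 0;
}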
2. After the frame has been read out of NIC memory via DMA, it is handed to the protocol stack in the form of an skb; the first stop is netif_receive_skb_internal:
static int netif_receive_skb_internal(struct sk_buff *skb)
{
    int ret;

    net_timestamp_check(netdev_tstamp_prequeue, skb);

    if (skb_defer_rx_timestamp(skb))
        return NET_RX_SUCCESS;

    rcu_read_lock();
#ifdef CONFIG_RPS
    if (static_key_false(&rps_needed)) {
        struct rps_dev_flow voidflow, *rflow = &voidflow;
        int cpu = get_rps_cpu(skb->dev, skb, &rflow);

        if (cpu >= 0) {
            ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
            rcu_read_unlock();
            return ret;
        }
    }
#endif
    ret = __netif_receive_skb(skb);
    rcu_read_unlock();
    return ret;
}
It takes rcu_read_lock. Why is an RCU read lock needed here — are there RCU-protected variables inside? Yes: the downstream lookups (the RPS maps, the registered rx_handler, and the ptype_all/ptype_base protocol-handler lists walked in __netif_receive_skb_core) are all RCU-protected, so the whole delivery path runs inside one RCU read-side critical section.
3. Layer-2 receive processing: __netif_receive_skb_core
It involves the following processing:
skb_reset_network_header(skb);
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
static inline void skb_reset_network_header(struct sk_buff *skb)
{
    skb->network_header = skb->data - skb->head;
}

static inline void skb_reset_transport_header(struct sk_buff *skb)
{
    skb->transport_header = skb->data - skb->head;
}

static inline void skb_reset_mac_len(struct sk_buff *skb)
{
    skb->mac_len = skb->network_header - skb->mac_header;
}
These reset the IP/TCP header positions (stored as offsets relative to skb->head) and compute the length of the MAC header.
So when was the MAC header position set? Right after the driver received the packet, in eth_type_trans; at that point skb->protocol and the RX checksum state were also computed. A small sketch of the offset-from-head scheme follows.
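Why store offsets instead of raw pointers? If the buffer is ever reallocated or moved (e.g. pskb_expand_head), head + offset still points at the right header. The user-space sketch below models that idea with toy names; only the arithmetic mirrors the kernel helpers.

#include <stdio.h>
#include <stdint.h>

/* Toy model: header positions kept as offsets from head, recovered on demand. */
struct toy_skb {
    unsigned char *head;
    unsigned char *data;
    uint16_t mac_header;        /* offset of the MAC header from head */
    uint16_t network_header;    /* offset of the IP header from head  */
};

static void toy_reset_network_header(struct toy_skb *skb)
{
    skb->network_header = (uint16_t)(skb->data - skb->head);
}

static unsigned char *toy_network_header(const struct toy_skb *skb)
{
    return skb->head + skb->network_header; /* same idea as skb_network_header() */
}

int main(void)
{
    unsigned char buf[256];
    /* 64 bytes of headroom, 14-byte MAC header already pulled */
    struct toy_skb skb = { buf, buf + 64 + 14, 64, 0 };

    toy_reset_network_header(&skb);
    printf("mac_len = %d\n", skb.network_header - skb.mac_header);  /* 14, like skb_reset_mac_len() */
    printf("network header at offset %ld\n", (long)(toy_network_header(&skb) - buf));
    return 0;
}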
- Handling the VLAN (802.1Q) protocol
if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
    skb = skb_vlan_untag(skb);
    if (unlikely(!skb))
        goto out;
}
cpu_to_be16: be16 means big-endian. Why not use network byte order directly? Because network byte order is big-endian — #define cpu_to_be16(data) HTONS((data)) ends up producing exactly the network-order value; the cpu_to_be16/htons spelling simply carries the __be16 annotation so that sparse can type-check endianness.
Note that the values stored inside the skb (skb->protocol, etc.) are kept in network byte order. The snippet below demonstrates the equivalence.
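A quick user-space check that comparing a network-order field against htons(ETH_P_8021Q) is the same comparison the kernel writes as cpu_to_be16(ETH_P_8021Q); the ETH_P_8021Q define here is spelled out locally for the sake of a self-contained example.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

#define ETH_P_8021Q 0x8100

int main(void)
{
    /* last two bytes of a MAC header: the EtherType, exactly as seen on the wire */
    uint8_t wire[2] = { 0x81, 0x00 };
    uint16_t proto;

    memcpy(&proto, wire, 2);   /* proto now holds the value in network order */

    printf("match: %d\n", proto == htons(ETH_P_8021Q)); /* 1 on both LE and BE hosts */
    printf("host order value: 0x%04x\n", ntohs(proto)); /* 0x8100 */
    return 0;
}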
struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
{
    struct vlan_hdr *vhdr;
    u16 vlan_tci;

    if (unlikely(skb_vlan_tag_present(skb))) {
        /* vlan_tci is already set-up so leave this for another time */
        return skb;
    }

    skb = skb_share_check(skb, GFP_ATOMIC);
    if (unlikely(!skb))
        goto err_free;

    if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
        goto err_free;

    vhdr = (struct vlan_hdr *)skb->data;
    vlan_tci = ntohs(vhdr->h_vlan_TCI);
    __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);

    skb_pull_rcsum(skb, VLAN_HLEN);
    vlan_set_encap_proto(skb, vhdr);

    skb = skb_reorder_vlan_header(skb);
    if (unlikely(!skb))
        goto err_free;

    skb_reset_network_header(skb);
    skb_reset_transport_header(skb);
    skb_reset_mac_len(skb);

    return skb;

err_free:
    kfree_skb(skb);
    return NULL;
}
For the skb_vlan_untag function:
- It first checks whether the skb is shared, i.e. has more than one user, judged from skb->users; if users != 1 the skb has to be cloned (skb_share_check).
- Note that pskb_may_pull can involve frag_list, skb->data, skb->data_len and so on; that is analyzed in detail further down.
- skb_pull_rcsum → skb->data += VLAN_HLEN, and at the same time the checksum is adjusted via skb_postpull_rcsum.
- Right after that the VLAN information is extracted, the checksum adjusted, and the MAC header offset shifted: skb->mac_header += VLAN_HLEN;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
These re-set the network-layer and transport-layer offsets and mac_len; note that at this point skb->data points to the start of the network-layer header. A user-space sketch of the tag-stripping byte shuffle follows.
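To see what "untagging" means at the byte level, here is a hedged user-space sketch: the 12 bytes of destination+source MAC are slid forward over the 4-byte 802.1Q tag, leaving an ordinary untagged frame. It only illustrates the byte shuffling; the kernel additionally stashes the TCI in skb->vlan_tci and fixes up checksums and header offsets.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

enum { TOY_ETH_ALEN = 6, TOY_VLAN_HLEN = 4 };

static size_t vlan_untag(uint8_t *frame, size_t len)
{
    /* frame: dst(6) src(6) 0x8100(2) TCI(2) inner-ethertype(2) payload... */
    uint16_t tci = (uint16_t)(frame[14] << 8 | frame[15]);

    memmove(frame + TOY_VLAN_HLEN, frame, 2 * TOY_ETH_ALEN); /* shift MACs over the tag */
    printf("stripped tag, vlan id = %u\n", tci & 0x0fff);
    return len - TOY_VLAN_HLEN;   /* caller should now start at frame + TOY_VLAN_HLEN */
}

int main(void)
{
    uint8_t frame[64] = {
        /* dst */ 1, 2, 3, 4, 5, 6, /* src */ 7, 8, 9, 10, 11, 12,
        0x81, 0x00, 0x00, 0x64,   /* 802.1Q tag, VLAN id 100 */
        0x08, 0x00                /* inner EtherType: IPv4 */
    };
    size_t newlen = vlan_untag(frame, sizeof(frame));
    printf("new length %zu, ethertype now at offset 16: 0x%02x%02x\n",
           newlen, frame[16], frame[17]);
    return 0;
}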
4. IP-layer processing, taking IPv4 (ip_rcv) as an example
The logic is as follows:
- skb_share_check(skb, ...)
- pskb_may_pull(skb, ...)
- iph = ip_hdr(skb);
- checksum verification: ip_fast_csum, plus pskb_trim_rcsum (a user-space reference implementation of this checksum is sketched below)
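Since the steps above hinge on the IP header checksum, here is a portable user-space version of what ip_fast_csum verifies (the RFC 1071 one's-complement sum); the kernel routine is an arch-optimized equivalent. A received header is valid when the sum over the whole header, checksum field included, folds to 0xffff, i.e. the function below returns 0.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t ip_checksum(const void *hdr, size_t len)
{
    const uint8_t *p = hdr;
    uint32_t sum = 0;

    while (len > 1) {                  /* sum 16-bit words in network order */
        sum += (uint32_t)(p[0] << 8 | p[1]);
        p += 2;
        len -= 2;
    }
    if (len)                           /* odd trailing byte */
        sum += (uint32_t)(p[0] << 8);
    while (sum >> 16)                  /* fold carries back in */
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;
}

int main(void)
{
    /* Example 20-byte IPv4 header with its checksum field (bytes 10-11) filled in. */
    uint8_t iph[20] = {
        0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40, 0x00,
        0x40, 0x11, 0xb8, 0x61, 0xc0, 0xa8, 0x00, 0x01,
        0xc0, 0xa8, 0x00, 0xc7
    };
    printf("verify: 0x%04x (0 means OK)\n", ip_checksum(iph, sizeof(iph)));
    return 0;
}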
Finally, processing enters ip_rcv_finish:
Its main job is the route lookup: ip_route_input_noref / skb_valid_dst,
plus parsing the IP options (ip_options),
and finally, based on the resulting rtable, deciding whether to forward the packet or deliver it to the local stack (see the dispatch sketch below).
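That "forward or deliver locally" decision is, in essence, a function pointer chosen by the route lookup and invoked via dst_input at the end of ip_rcv_finish. The user-space sketch below models only that dispatch shape; all the struct and function names are toy stand-ins, not kernel types.

#include <stdio.h>
#include <string.h>

struct toy_pkt;

struct toy_dst {
    int (*input)(struct toy_pkt *pkt);  /* ip_local_deliver or ip_forward in the kernel */
};

struct toy_pkt {
    const char *daddr;
    struct toy_dst *dst;
};

static int toy_local_deliver(struct toy_pkt *pkt) { printf("%s -> local stack\n", pkt->daddr); return 0; }
static int toy_forward(struct toy_pkt *pkt)       { printf("%s -> forwarded\n",  pkt->daddr); return 0; }

/* the "route lookup": pick the input handler based on the destination */
static struct toy_dst *toy_route_input(struct toy_pkt *pkt, const char *local_addr)
{
    static struct toy_dst local_dst = { toy_local_deliver };
    static struct toy_dst fwd_dst   = { toy_forward };
    return (strcmp(pkt->daddr, local_addr) == 0) ? &local_dst : &fwd_dst;
}

static int toy_dst_input(struct toy_pkt *pkt) { return pkt->dst->input(pkt); } /* like dst_input() */

int main(void)
{
    struct toy_pkt a = { "192.168.1.10", NULL }, b = { "10.0.0.1", NULL };
    a.dst = toy_route_input(&a, "192.168.1.10");
    b.dst = toy_route_input(&b, "192.168.1.10");
    toy_dst_input(&a);  /* local stack */
    toy_dst_input(&b);  /* forwarded   */
    return 0;
}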
Analysis:
skb_share_check may call skb_clone; when cloning, the sk_buffs hanging off next/prev are not cloned.
Remember that the clone only copies the sk_buff descriptor — it does not allocate a new copy of the data area (a small model of this follows the snippet below).
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

    n->next = n->prev = NULL;
    n->sk = NULL;
    ...
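The user-space sketch below models the sharing semantics: the clone gets its own descriptor (its data cursor can diverge), but both descriptors point at the same buffer, whose shared-info reference count is bumped — mirroring skb_clone incrementing dataref in skb_shared_info. Types and names are illustrative only.

#include <stdio.h>
#include <stdlib.h>

struct toy_shinfo { int dataref; };

struct toy_skb {
    unsigned char *head;        /* shared data buffer       */
    unsigned char *data;        /* private cursor per clone */
    unsigned int   len;
    struct toy_shinfo *shinfo;  /* lives at the end of the shared buffer in the kernel */
};

static struct toy_skb *toy_clone(const struct toy_skb *skb)
{
    struct toy_skb *n = malloc(sizeof(*n));
    *n = *skb;                  /* copy the descriptor, not the data */
    n->shinfo->dataref++;       /* one more user of the same buffer  */
    return n;
}

int main(void)
{
    struct toy_shinfo si = { 1 };
    unsigned char buf[64] = "payload";
    struct toy_skb orig = { buf, buf, 7, &si };

    struct toy_skb *clone = toy_clone(&orig);
    clone->data += 3;           /* pulling on the clone does not move the original's data */

    printf("same buffer: %d, dataref: %d, orig data: %s, clone data: %s\n",
           clone->head == orig.head, si.dataref,
           (char *)orig.data, (char *)clone->data);
    free(clone);
    return 0;
}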
pskb_may_pull — let's look at how it is designed, taking pskb_may_pull(skb, sizeof(struct iphdr)) as the example:
static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
    if (likely(len <= skb_headlen(skb)))  /* usually len fits in the linear area */
        return 1;
    if (unlikely(len > skb->len))         /* asking for more than the packet holds: error */
        return 0;
    /* len exceeds the linear area but is within the total data length */
    return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
}

/* skb_headlen = skb->len - skb->data_len: length of the current (unpaged) linear data */
static inline unsigned int skb_headlen(const struct sk_buff *skb)
{
    /* skb->len is the total data length, fragments included;
     * skb->data_len is the paged (fragment) length, i.e. what lives in skb_shared_info */
    return skb->len - skb->data_len;
}
Case: if the length to pull is larger than the linear area but smaller than the total data length, __pskb_pull_tail is called to advance the linear area's tail; the amount it has to move is len minus the linear length (a worked example with concrete numbers follows).
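A worked example of that decision with made-up numbers — a 1500-byte packet whose first 128 bytes are linear and whose remaining 1372 bytes live in frags:

#include <stdio.h>

int main(void)
{
    unsigned int len = 1500, data_len = 1372;
    unsigned int headlen = len - data_len;   /* skb_headlen() == 128 */
    unsigned int pull = 200;                 /* bytes the caller wants linear */

    if (pull <= headlen)
        printf("fast path: already linear\n");
    else if (pull > len)
        printf("error: asking for more than the packet holds\n");
    else
        printf("__pskb_pull_tail(skb, %u): copy %u bytes from the frags "
               "into the linear area\n", pull - headlen, pull - headlen);
    return 0;
}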
/**
 *  __pskb_pull_tail - advance tail of skb header
 *  @skb: buffer to reallocate
 *  @delta: number of bytes to advance tail
 *
 *  The function makes a sense only on a fragmented &sk_buff,
 *  it expands header moving its tail forward and copying necessary
 *  data from fragmented part.
 *
 *  &sk_buff MUST have reference count of 1.
 *
 *  Returns %NULL (and &sk_buff does not change) if pull failed
 *  or value of new tail of skb in the case of success.
 *
 *  All the pointers pointing into skb header may change and must be
 *  reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
    /* If skb has not enough free space at tail, get new one
     * plus 128 bytes for future expansions. If we have enough
     * room at tail, reallocate without expansion only if skb is cloned.
     */
    int i, k, eat = (skb->tail + delta) - skb->end;

    /* eat > 0 means moving tail forward by delta would run past end, i.e.
     * how much extra room is needed beyond the current buffer. Also
     * reallocate if the skb has been cloned (multiple shared copies). */
    if (eat > 0 || skb_cloned(skb)) {
        if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
                             GFP_ATOMIC))
            return NULL;
    }

    /* The linear area now has enough room.
     * Layout: head --- data --- tail --- end, plus frags.
     * delta == bytes to pull, headlen == skb->len - skb->data_len (linear data).
     * Copy delta bytes starting at offset skb_headlen(skb) into the space after skb->tail. */
    if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
        BUG();

    /* Optimization: no fragments, no reasons to preestimate
     * size of pulled pages. Superb.
     */
    if (!skb_has_frag_list(skb))
        goto pull_pages;

    /* The data has been copied into skb->data, so the copied parts of
     * frags / frag_list must be released. First estimate how much of it
     * comes from the frags[] array. */
    /* Estimate size of pulled pages. */
    eat = delta;
    /* find the last page needed to satisfy eat bytes */
    for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
        int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

        if (size >= eat)
            goto pull_pages;
        eat -= size;
    }

    /* If we need update frag list, we are in troubles.
     * Certainly, it possible to add an offset to skb data,
     * but taking into account that pulling is expected to
     * be very rare operation, it is worth to fight against
     * further bloating skb head and crucify ourselves here instead.
     * Pure masohism, indeed. 8)8)
     */
    /* eat is still non-zero: part of the data came from frag_list, release it */
    if (eat) {
        struct sk_buff *list = skb_shinfo(skb)->frag_list;
        struct sk_buff *clone = NULL;
        struct sk_buff *insp = NULL;

        do {
            BUG_ON(!list);

            if (list->len <= eat) {
                /* Eaten as whole. */
                eat -= list->len;
                list = list->next;
                insp = list;
            } else {
                /* Eaten partially. */

                if (skb_shared(list)) {
                    /* Sucks! We need to fork list. :-( */
                    clone = skb_clone(list, GFP_ATOMIC);
                    if (!clone)
                        return NULL;
                    insp = list->next;
                    list = clone;
                } else {
                    /* This may be pulled without
                     * problems. */
                    insp = list;
                }
                if (!pskb_pull(list, eat)) {
                    kfree_skb(clone);
                    return NULL;
                }
                break;
            }
        } while (eat);

        /* list started at the frag_list head and was walked up to the last
         * skb that held enough data. Free pulled out fragments. */
        while ((list = skb_shinfo(skb)->frag_list) != insp) {
            skb_shinfo(skb)->frag_list = list->next;
            kfree_skb(list);
        }
        /* And insert new clone at head. */
        if (clone) {
            clone->next = list;
            skb_shinfo(skb)->frag_list = clone;
        }
    }
    /* Success! Now we may commit changes to skb data. */

pull_pages:
    eat = delta;
    k = 0;
    /* release the pages in frags[] that were consumed */
    for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
        int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

        if (size <= eat) {
            skb_frag_unref(skb, i);
            eat -= size;
        } else {
            skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
            if (eat) {
                skb_shinfo(skb)->frags[k].page_offset += eat;
                skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
                eat = 0;
            }
            k++;
        }
    }
    skb_shinfo(skb)->nr_frags = k;

    skb->tail     += delta;
    skb->data_len -= delta;

    return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);

/**
 *  skb_copy_bits - copy bits from skb to kernel buffer
 *  @skb: source skb
 *  @offset: offset in source
 *  @to: destination buffer
 *  @len: number of bytes to copy
 *
 *  Copy the specified number of bytes from the source skb to the
 *  destination buffer.
 *
 *  CAUTION ! :
 *      If its prototype is ever changed,
 *      check arch/{*}/net/{*}.S files,
 *      since it is called from BPF assembly code.
 */
/* Called above as skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta):
 * offset == linear data length, to == skb->tail */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
    int start = skb_headlen(skb);   /* length of the linear data block */
    struct sk_buff *frag_iter;
    int i, copy;

    if (offset > (int)skb->len - len)  /* skb->len is the total length; fail if offset + len > skb->len */
        goto fault;

    /* Copy header: if the linear area covers (part of) the requested range, copy from it first */
    if ((copy = start - offset) > 0) {
        if (copy > len)     /* the whole requested range lies in the linear block */
            copy = len;
        skb_copy_from_linear_data_offset(skb, offset, to, copy);
        if ((len -= copy) == 0)
            return 0;
        offset += copy;     /* the remaining data lives in the paged frags or in frag_list */
        to     += copy;
    }

    for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
        int end;
        skb_frag_t *f = &skb_shinfo(skb)->frags[i];

        WARN_ON(start > offset + len);  /* ??? */

        /* linear area + frags == total data length */
        end = start + skb_frag_size(f);
        if ((copy = end - offset) > 0) {
            u8 *vaddr;

            if (copy > len)
                copy = len;

            vaddr = kmap_atomic(skb_frag_page(f));  /* virtual address of the frag's page */
            memcpy(to,
                   vaddr + f->page_offset + offset - start, /* offset - start == data already copied */
                   copy);
            kunmap_atomic(vaddr);

            if ((len -= copy) == 0)
                return 0;
            offset += copy;
            to     += copy;
        }
        start = end;    /* advance start past the data already accounted for */
    }

    /* If we still have not copied enough, walk frag_list and keep copying,
     * recursing into this very function. */
    skb_walk_frags(skb, frag_iter) {
        int end;

        WARN_ON(start > offset + len);

        end = start + frag_iter->len;
        if ((copy = end - offset) > 0) {
            if (copy > len)
                copy = len;
            if (skb_copy_bits(frag_iter, offset - start, to, copy))
                goto fault;
            if ((len -= copy) == 0)
                return 0;
            offset += copy;
            to     += copy;
        }
        start = end;
    }

    if (!len)
        return 0;

fault:
    return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_bits);
Analysis of pskb_trim_rcsum:
len = ntohs(iph->tot_len);
pskb_trim_rcsum(skb, len)
/**
 *  pskb_trim_rcsum - trim received skb and update checksum
 *  @skb: buffer to trim
 *  @len: new length
 *
 *  This is exactly the same as pskb_trim except that it ensures the
 *  checksum of received packets are still valid after the operation.
 */
static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
    if (likely(len >= skb->len))
        return 0;
    if (skb->ip_summed == CHECKSUM_COMPLETE)
        skb->ip_summed = CHECKSUM_NONE; /* mark the checksum as not computed */
    return __pskb_trim(skb, len);
}

static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
{
    if (skb->data_len)
        return ___pskb_trim(skb, len);
    /* linear data only: just reset skb->len and the tail pointer */
    skb->len = len;
    skb_set_tail_pointer(skb, len);
    return 0;
}
pskb_trim mainly strips the excess bytes so that only the IP datagram remains, and invalidates CHECKSUM_COMPLETE so a stale checksum is not trusted afterwards (a concrete case is sketched below; the full ___pskb_trim follows after it).
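One common reason this trim is needed: a small IP datagram gets padded by the sender up to the minimum Ethernet payload, so after the MAC header is pulled, skb->len can exceed ntohs(iph->tot_len) and the padding must be cut off before the stack parses the payload. The numbers in this tiny sketch are illustrative.

#include <stdio.h>

int main(void)
{
    unsigned int skb_len = 46;       /* what arrived after pulling the 14-byte MAC header */
    unsigned int iph_tot_len = 40;   /* ntohs(iph->tot_len): 20B IP header + 20B TCP header */

    if (skb_len > iph_tot_len)
        printf("trim %u padding bytes: pskb_trim_rcsum(skb, %u)\n",
               skb_len - iph_tot_len, iph_tot_len);
    return 0;
}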
/* Trims skb to length len. It can change skb pointers.
 * For an skb carrying non-linear data, cut the data length down to len
 * (so that skb->len == len afterwards); the excess data is released.
 */
int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
    struct sk_buff **fragp;
    struct sk_buff *frag;
    int offset = skb_headlen(skb);
    int nfrags = skb_shinfo(skb)->nr_frags;
    int i;
    int err;

    if (skb_cloned(skb) &&
        unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
        return err;

    i = 0;
    if (offset >= len)
        goto drop_pages;
    /* offset is the linear buffer length, len the target length. If
     * offset >= len, part of the linear data is already excess and all of
     * the non-linear data is excess, so the non-linear part must be freed. */

    for (; i < nfrags; i++) {
        int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);

        if (end < len) {
            offset = end;
            continue;   /* this frag lies entirely within len: keep it */
        }

        /* partially excess: shrink this frag */
        skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);

drop_pages:
        skb_shinfo(skb)->nr_frags = i;  /* update the number of unmapped pages */

        for (; i < nfrags; i++)
            skb_frag_unref(skb, i);     /* release the rest */

        if (skb_has_frag_list(skb))
            skb_drop_fraglist(skb);     /* everything in frag_list is excess: free it all */
        goto done;
    }

    /* All data in the unmapped pages is valid; trim the frag_list instead */
    for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
         fragp = &frag->next) {
        int end = offset + frag->len;

        /* If this skb is shared, clone it and put the clone on the list,
         * because it is about to be modified. */
        if (skb_shared(frag)) {
            struct sk_buff *nfrag;

            nfrag = skb_clone(frag, GFP_ATOMIC);
            if (unlikely(!nfrag))
                return -ENOMEM;

            nfrag->next = frag->next;
            consume_skb(frag);
            frag = nfrag;
            *fragp = frag;
        }

        if (end < len) {
            offset = end;
            continue;
        }

        if (end > len &&
            unlikely((err = pskb_trim(frag, len - offset))))
            return err;

        if (frag->next)
            skb_drop_list(&frag->next);
        break;
    }

done:
    /* If the target length is larger than the linear buffer, only the
     * non-linear length shrinks; either way, both branches leave
     * skb->len equal to len. */
    if (len > skb_headlen(skb)) {
        skb->data_len -= skb->len - len;
        skb->len = len;
    } else {
        skb->len = len;
        skb->data_len = 0;
        skb_set_tail_pointer(skb, len);
    }

    return 0;
}