ip分片重组 ip_defrag

概述

在ip_local_deliver中,如果检测到是分片包,则需要进行分片重组;

其涉及的函数调用关系如下所示:

 /**
 * ip_local_deliver
 *  |-->ip_is_fragment //判断是否为分片包
 *  |-->ip_defrag //分片缓存&重组
 *    |-->ip_find //查找ipq
 *    |  |-->inet_frag_find //查找frag_queue
 *    |
 *    |-->ip_frag_queue //分片接收组合
 *      |-->ip_frag_reasm //接收完整的分片组成新的ip包
 */
函数源码分析
ip_local_deliver
 /**
  * ip_local_deliver - deliver an IP packet to the higher protocol layers
  * @skb: received packet
  *
  * A packet that is a fragment is first handed to ip_defrag(). When
  * ip_defrag() returns non-zero, processing stops here and 0 is returned.
  * Otherwise the packet traverses the NF_INET_LOCAL_IN netfilter hook with
  * ip_local_deliver_finish() as the continuation.
  *
  * Return: 0 when ip_defrag() returned non-zero, else the NF_HOOK() result.
  */
int ip_local_deliver(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);

	/* Reassemble IP fragments. */
	if (ip_is_fragment(ip_hdr(skb))) {
		/* Non-zero result: nothing to deliver from this call. */
		if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
			return 0;
	}

	/* Pass through the LOCAL_IN hook point. */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
		       net, NULL, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
ip_is_fragment
 /**
  * ip_is_fragment - test whether an IPv4 header belongs to a fragment
  * @iph: IPv4 header to inspect
  *
  * The first fragment has offset 0 and carries IP_MF; later fragments have a
  * non-zero offset (e.g. multiples of 1480 bytes for a 1500-byte MTU and a
  * 20-byte header), and every fragment except the last carries IP_MF. Hence a
  * header is a fragment when either the MF bit or any offset bit is set.
  *
  * Return: true if IP_MF or any IP_OFFSET bit is set in @iph->frag_off.
  */
static inline bool ip_is_fragment(const struct iphdr *iph)
{
	return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
}
ip_defrag
 /**
  * ip_defrag - process an incoming IP datagram fragment
  * @net:  network namespace the fragment was received in
  * @skb:  the fragment
  * @user: defrag user id forwarded to ip_find()
  *
  * Looks up (or creates) the reassembly queue for the fragment and, with the
  * queue lock held, hands the fragment to ip_frag_queue().
  *
  * Return: the ip_frag_queue() result, or -ENOMEM (after freeing @skb) when no
  * queue could be obtained.
  */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct net_device *ndev;
	struct ipq *queue;
	int l3_ifindex;
	int rc;

	/* Fall back to the dst device when the skb carries no device. */
	ndev = skb->dev ? skb->dev : skb_dst(skb)->dev;
	l3_ifindex = l3mdev_master_ifindex_rcu(ndev);

	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
	skb_orphan(skb);

	/* Lookup (or create) queue header. */
	queue = ip_find(net, ip_hdr(skb), user, l3_ifindex);
	if (!queue) {
		/* No queue available: count the failure and drop the fragment. */
		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* Add the fragment to the queue; reassemble if it completes it. */
	spin_lock(&queue->q.lock);
	rc = ip_frag_queue(queue, skb);
	spin_unlock(&queue->q.lock);

	ipq_put(queue);
	return rc;
}
ip_find
 /**
  * ip_find - find or create the "incomplete datagrams" queue for a fragment
  * @net:  network namespace
  * @iph:  IPv4 header of the fragment
  * @user: defrag user id stored in the lookup key
  * @vif:  interface index stored in the lookup key
  *
  * Return: the struct ipq embedding the queue returned by inet_frag_find(),
  * or NULL when inet_frag_find() returned NULL or an error pointer.
  */
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
			   u32 user, int vif)
{
	struct ip4_create_arg arg;
	struct inet_frag_queue *fq;
	unsigned int bucket_hash;

	/* Lookup key: the IP header plus the caller-supplied context. */
	arg.iph = iph;
	arg.user = user;
	arg.vif = vif;

	/* Hash over id, source address, destination address and protocol. */
	bucket_hash = ipqhashfn(iph->id, iph->saddr, iph->daddr,
				iph->protocol);

	/* Find the queue for this hash, creating it if necessary. */
	fq = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, bucket_hash);
	if (!IS_ERR_OR_NULL(fq))
		return container_of(fq, struct ipq, q);

	inet_frag_maybe_warn_overflow(fq, pr_fmt());
	return NULL;
}
inet_frag_find
 /**
  * inet_frag_find - look up a fragment queue, creating it when absent
  * @nf:   per-namespace fragment state the queue must belong to
  * @f:    fragment hash table and callbacks
  * @key:  opaque key passed to @f->match() and inet_frag_create()
  * @hash: hash value selecting the bucket
  *
  * Return: the matching queue with its refcnt incremented; otherwise the
  * result of inet_frag_create() when the searched chain is no deeper than
  * INETFRAGS_MAXDEPTH; otherwise ERR_PTR(-ENOBUFS).
  */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
				       struct inet_frags *f, void *key,
				       unsigned int hash)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *q;
	int depth = 0;

	/* Fragment memory is above the low threshold: schedule eviction. */
	if (frag_mem_limit(nf) > nf->low_thresh)
		inet_frag_schedule_worker(f);

	/* Select the hash bucket. */
	hash &= (INETFRAGS_HASHSZ - 1);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_for_each_entry(q, &hb->chain, list) {
		if (q->net == nf && f->match(q, key)) {
			/* Found: take a reference for the caller. */
			atomic_inc(&q->refcnt);
			spin_unlock(&hb->chain_lock);
			return q;
		}
		/* Track how deep the chain search went. */
		depth++;
	}
	spin_unlock(&hb->chain_lock);

	/* Not found and the chain is within the depth limit: create it. */
	if (depth <= INETFRAGS_MAXDEPTH)
		return inet_frag_create(nf, f, key);

	/* Chain too deep: request a rebuild if the rebuild interval allows. */
	if (inet_frag_may_rebuild(f)) {
		if (!f->rebuild)
			f->rebuild = true;
		inet_frag_schedule_worker(f);
	}

	return ERR_PTR(-ENOBUFS);
}
inet_frag_worker
 /**
  * inet_frag_worker - work item that evicts fragment queues
  * @work: the frags_work member embedded in a struct inet_frags
  *
  * Scans up to INETFRAGS_EVICT_BUCKETS buckets, starting at the bucket saved
  * by the previous run, and stops early once more than INETFRAGS_EVICT_MAX
  * queues have been evicted. Afterwards the hash is rebuilt if a rebuild was
  * requested and inet_frag_may_rebuild() allows it.
  */
static void inet_frag_worker(struct work_struct *work)
{
	/* Number of buckets to scan during this run. */
	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
	unsigned int i, evicted = 0;
	struct inet_frags *f;

	f = container_of(work, struct inet_frags, frags_work);

	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

	local_bh_disable();

	/* Resume from the bucket following the last one processed. */
	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
		evicted += inet_evict_bucket(f, &f->hash[i]);
		/* Advance to the next bucket, wrapping around the table. */
		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
		if (evicted > INETFRAGS_EVICT_MAX)
			break;
	}

	/* Remember where the next run has to start. */
	f->next_bucket = i;

	local_bh_enable();

	if (f->rebuild && inet_frag_may_rebuild(f))
		inet_frag_secret_rebuild(f);
}
inet_evict_bucket
 /**
  * inet_evict_bucket - evict eligible fragment queues from one hash bucket
  * @f:  fragment hash table and callbacks
  * @hb: bucket to scan
  *
  * Under the bucket lock, every queue for which inet_fragq_should_evict() is
  * true and whose timer could be deleted is collected on a private list.
  * After the lock is dropped, @f->frag_expire() is invoked on each of them.
  *
  * Return: number of queues selected for eviction.
  */
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *fq;
	struct hlist_node *n;
	unsigned int evicted = 0;
	HLIST_HEAD(expired);

	spin_lock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
		/* Queue does not need to be evicted. */
		if (!inet_fragq_should_evict(fq))
			continue;

		/*
		 * Timer could not be deleted; skip this queue.
		 * NOTE(review): presumably the timer handler is already
		 * running and will expire the queue itself - confirm.
		 */
		if (!del_timer(&fq->timer))
			continue;

		/* Collect the queue on the temporary eviction list. */
		hlist_add_head(&fq->list_evictor, &expired);
		++evicted;
	}

	spin_unlock(&hb->chain_lock);

	/* Run the expire callback on everything collected above. */
	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
		f->frag_expire((unsigned long) fq);

	return evicted;
}
inet_frag_secret_rebuild
 static void inet_frag_secret_rebuild(struct inet_frags *f)
{
int i; write_seqlock_bh(&f->rnd_seqlock); /* 无需重建 */
if (!inet_frag_may_rebuild(f))
goto out; /* 获取新的用于计算hash的随机值 */
get_random_bytes(&f->rnd, sizeof(u32)); /* 遍历hash表 */
for (i = ; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
struct hlist_node *n; /* 取的桶节点 */
hb = &f->hash[i];
spin_lock(&hb->chain_lock); /* 遍历桶节点下面的链表 */
hlist_for_each_entry_safe(q, n, &hb->chain, list) { /* 计算hash */
unsigned int hval = inet_frag_hashfn(f, q); /* 节点不属于当前桶 */
if (hval != i) {
struct inet_frag_bucket *hb_dest; /* 从当前桶中删除该节点 */
hlist_del(&q->list); /* Relink to new hash chain. */
/* 找到目标桶 */
hb_dest = &f->hash[hval]; /* This is the only place where we take
* another chain_lock while already holding
* one. As this will not run concurrently,
* we cannot deadlock on hb_dest lock below, if its
* already locked it will be released soon since
* other caller cannot be waiting for hb lock
* that we've taken above.
*/
spin_lock_nested(&hb_dest->chain_lock,
SINGLE_DEPTH_NESTING);
/* 节点加入目标桶的链表中 */
hlist_add_head(&q->list, &hb_dest->chain);
spin_unlock(&hb_dest->chain_lock);
}
}
spin_unlock(&hb->chain_lock);
} /* 设置重建标记和重建时间 */
f->rebuild = false;
f->last_rebuild_jiffies = jiffies;
out:
write_sequnlock_bh(&f->rnd_seqlock);
}
ip_frag_queue
 /**
  * ip_frag_queue - add a new fragment to an existing reassembly queue
  * @qp:  reassembly queue the fragment belongs to
  * @skb: the fragment
  *
  * Validates the fragment's position, trims any overlap with neighbouring
  * fragments, links it into the offset-ordered fragment chain and updates the
  * queue accounting. Once the first and last fragments are both present and
  * the received byte count equals the datagram length, the datagram is
  * reassembled via ip_frag_reasm().
  *
  * Return: the ip_frag_reasm() result when reassembly was attempted,
  * -EINPROGRESS when the fragment was queued, or a negative errno
  * (-ENOENT, -EINVAL, -ENOMEM, or the ip_frag_reinit() /
  * pskb_trim_rcsum() error) after freeing @skb.
  */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct sk_buff *prev, *next;
	struct net_device *dev;
	unsigned int fragsize;
	int flags, offset;
	int ihl, end;
	int err = -ENOENT;
	u8 ecn;

	/* Queue is already marked complete. */
	if (qp->q.flags & INET_FRAG_COMPLETE)
		goto err;

	/*
	 * If the skb is not flagged IPSKB_FRAG_COMPLETE and
	 * ip_frag_too_far() trips, try to re-initialise the queue; if that
	 * fails, kill the queue and drop the fragment.
	 */
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
		ipq_kill(qp);
		goto err;
	}

	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
	offset = ntohs(ip_hdr(skb)->frag_off);
	/* Upper bits are the flags, the rest is the fragment offset. */
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
	ihl = ip_hdrlen(skb);

	/* Determine the position of this fragment: payload end offset is the
	 * start offset plus the data length without the IP header.
	 */
	end = offset + skb->len - skb_network_offset(skb) - ihl;
	err = -EINVAL;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrupted.
		 */
		if (end < qp->q.len ||
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
			goto err;
		/* Mark the last fragment as seen; it fixes the total length. */
		qp->q.flags |= INET_FRAG_LAST_IN;
		qp->q.len = end;
	} else {
		/* Non-final fragment whose end is not 8-byte aligned. */
		if (end & 7) {
			/* Truncate to an 8-byte boundary; checksum is stale. */
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
		if (end > qp->q.len) {
			/* Some bits beyond end -> corruption. */
			if (qp->q.flags & INET_FRAG_LAST_IN)
				goto err;
			/* Track the largest end offset seen so far. */
			qp->q.len = end;
		}
	}
	/* Fragment carries no data. */
	if (end == offset)
		goto err;

	err = -ENOMEM;
	/* Strip everything up to and including the IP header. */
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
		goto err;

	/* Trim the payload to the (possibly truncated) length. */
	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
	prev = qp->q.fragments_tail;
	/* Fast path: empty chain, or this fragment goes after the tail. */
	if (!prev || FRAG_CB(prev)->offset < offset) {
		next = NULL;
		goto found;
	}
	/* Otherwise walk the chain from the head. */
	prev = NULL;
	for (next = qp->q.fragments; next != NULL; next = next->next) {
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

found:
	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		/* Bytes by which the previous fragment overlaps this one. */
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			/* Skip the overlapping head of the new fragment. */
			offset += i;
			err = -EINVAL;
			/* Nothing left after removing the overlap. */
			if (end <= offset)
				goto err;
			err = -ENOMEM;
			if (!pskb_pull(skb, i))
				goto err;
			/* Data was removed, the checksum is no longer valid. */
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

	err = -ENOMEM;

	/* Resolve overlap with the following fragment(s). */
	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
			/* Fewer bytes are now accounted as received. */
			qp->q.meat -= i;
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

			/* Old fragment is completely overridden with
			 * new one drop it.
			 */
			next = next->next;

			/* Unlink the overridden fragment from the chain. */
			if (prev)
				prev->next = next;
			else
				qp->q.fragments = next;

			/* Undo its byte and memory accounting, then free it. */
			qp->q.meat -= free_it->len;
			sub_frag_mem_limit(qp->q.net, free_it->truesize);
			kfree_skb(free_it);
		}
	}

	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (!next)
		qp->q.fragments_tail = skb;
	if (prev)
		prev->next = skb;
	else
		qp->q.fragments = skb;

	/* Remember the input interface and detach the device from the skb. */
	dev = skb->dev;
	if (dev) {
		qp->iif = dev->ifindex;
		skb->dev = NULL;
	}
	qp->q.stamp = skb->tstamp;
	/* Account the received bytes and the memory used. */
	qp->q.meat += skb->len;
	qp->ecn |= ecn;
	add_frag_mem_limit(qp->q.net, skb->truesize);
	/* Offset zero means the first fragment has arrived. */
	if (offset == 0)
		qp->q.flags |= INET_FRAG_FIRST_IN;

	fragsize = skb->len + ihl;

	/* Track the largest fragment size seen. */
	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

	/* Track the largest fragment size seen with IP_DF set. */
	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;

	/* First and last fragments present and all bytes received. */
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;

		/* Hide the dst reference during reassembly, then restore it. */
		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, prev, dev);
		skb->_skb_refdst = orefdst;
		return err;
	}

	/* Fragment stays cached in the queue: release its route reference. */
	skb_dst_drop(skb);
	return -EINPROGRESS;

err:
	kfree_skb(skb);
	return err;
}
ip_frag_reasm
 /* Build a new IP datagram from all its fragments. */

 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments;
int len;
int ihlen;
int err;
u8 ecn; /* 移除队列 */
ipq_kill(qp); ecn = ip_frag_ecn_table[qp->ecn];
if (unlikely(ecn == 0xff)) {
err = -EINVAL;
goto out_fail;
}
/* Make the one we just received the head. */ /* 如果前一片存在 */
if (prev) {
/* 头部为当前片 */
head = prev->next; /* 克隆当前片 */
fp = skb_clone(head, GFP_ATOMIC);
if (!fp)
goto out_nomem; /* 设置下一片指针 */
fp->next = head->next; /* 下一片为空则记录尾指针 */
if (!fp->next)
qp->q.fragments_tail = fp; //加入当前片
prev->next = fp; /* 替换头部,释放原有头部 */
skb_morph(head, qp->q.fragments);
head->next = qp->q.fragments->next; consume_skb(qp->q.fragments);
qp->q.fragments = head;
} WARN_ON(!head);
WARN_ON(FRAG_CB(head)->offset != ); /* Allocate a new buffer for the datagram. */ /* 计算新的ip包空间 */
ihlen = ip_hdrlen(head);
len = ihlen + qp->q.len; err = -E2BIG; /* 长度超过最大值 */
if (len > )
goto out_oversize; /* Head of list must not be cloned. */
/* 头部不能是克隆的 */
if (skb_unclone(head, GFP_ATOMIC))
goto out_nomem; /* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */ /* 如果头部有fraglist ,将其分开成两个部分,头不能有frag_list*/
if (skb_has_frag_list(head)) {
struct sk_buff *clone;
int i, plen = ; clone = alloc_skb(, GFP_ATOMIC);
if (!clone)
goto out_nomem; /* 分开后的链接到头部下一个 */
clone->next = head->next;
head->next = clone; /* 将fraglist给clone */
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head); /* 重新调整长度等 */
for (i = ; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
head->data_len -= clone->len;
head->len -= clone->len;
clone->csum = ;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize);
} /* 将后面的分片链接到frag_list上 */
skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head)); /*统计分片长度等信息 */
for (fp=head->next; fp; fp = fp->next) {
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
}
sub_frag_mem_limit(qp->q.net, head->truesize); /* 设置新的ip包字段值 */
head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
/* 分片流程中如果frag_max_size比MTU小,则使用frag_max_size作为分片MTU */
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head);
iph->tot_len = htons(len);
iph->tos |= ecn; /* When we set IP_DF on a refragmented skb we must also force a
* call to ip_fragment to avoid forwarding a DF-skb of size s while
* original sender only sent fragments of size f (where f < s).
*
* We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
* frag seen to avoid sending tiny DF-fragments in case skb was built
* from one very small df-fragment and one large non-df frag.
*/
/* 
设置了DF标记,则输出过程中需要强制进入分片流程,
来限制DF分片的大小, 不能超过原始原始的大小 最大分片长度==最大不分片的长度,打标记DF/IPSKB_FRAG_PMTU,
    以避免发送小的DF分片和大的非DF分片 
*/
if (qp->max_df_size == qp->q.max_size) {
/* 设置FRAG_PMTU */
IPCB(head)->flags |= IPSKB_FRAG_PMTU;
/* 设置不分片 */
iph->frag_off = htons(IP_DF);
}
/* 否则不设置标记 */
else {
iph->frag_off = ;
} /* 计算校验和 */
ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); /* 重置队列标记 */
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
return ; out_nomem:
net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
err = -ENOMEM;
goto out_fail;
out_oversize:
net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
return err;
}
上一篇:MySQL MGR模式介绍


下一篇:使用pt-archiver工具进行MySQL数据库迁移