基于Linux-2.6.30版本,具体实现在net/ipv4/netfilter目录下,入口文件为net/ipv4/netfilter/iptable_filter.c,入口/出口函数为模块的init函数iptable_filter_init()和exit函数iptable_filter_fini()。
iptable_filter_init()函数流程如下
1、register_pernet_subsys(&iptable_filter_net_ops),其作用是注册per-net namespace的初始化/退出操作,初步看是用于在每个网络命名空间中注册filter表(表中保存报文匹配规则及其目标),暂不分析。
2、nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 该调用即是将filter类型table的hook注册到netfilter的核心框架中。其中ipt_ops即是记录了具体hook处理实现的内容,如下
1 static struct nf_hook_ops ipt_ops[] __read_mostly = { 2 3 { 4 5 .hook = ipt_local_in_hook, 6 7 .owner = THIS_MODULE, 8 9 .pf = PF_INET, 10 11 .hooknum = NF_INET_LOCAL_IN, 12 13 .priority = NF_IP_PRI_FILTER, 14 15 }, 16 17 { 18 19 .hook = ipt_hook, 20 21 .owner = THIS_MODULE, 22 23 .pf = PF_INET, 24 25 .hooknum = NF_INET_FORWARD, 26 27 .priority = NF_IP_PRI_FILTER, 28 29 }, 30 31 { 32 33 .hook = ipt_local_out_hook, 34 35 .owner = THIS_MODULE, 36 37 .pf = PF_INET, 38 39 .hooknum = NF_INET_LOCAL_OUT, 40 41 .priority = NF_IP_PRI_FILTER, 42 43 }, 44 45 };
以chain INPUT的实现为例,
hook成员即表示具体的hook处理函数:报文经过本hook点(此处为LOCAL_IN)时,在向上层protocol layer上送之前,该函数会被调用,由它完成报文与规则的匹配检查。具体见后面进一步分析。
pf即protocol family,表示处理报文的协议类型。
hooknum其实表示本pf下的hook类型,此处处理入方向的报文,与iptables命令工具中INPUT、FORWARD、OUTPUT基本相对应。
priority表示本条chain的优先级。
需要特别注意一下,后续实际注册chain处理规则时,即是利用pf、hooknum、priority将各个chain保存到nf_hooks中的对应位置。后续在netfilter的核心系统中,即根据pf/hooknum查找与之对应的hook,并按照priority指定的优先级依次调用各个hook函数。
现在看看具体filter hook函数是怎么调用的,对net/ipv4/中的代码进行grep,结果如下
1 [root@arch ipv4]# grep -n NF_HOOK *.c 2 3 arp.c:666: NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); 4 5 arp.c:934: return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); 6 7 ip_forward.c:114: return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, 8 9 ip_input.c:268: return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, 10 11 ip_input.c:440: return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, 12 13 ip_output.c:272: NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, 14 15 ip_output.c:288: NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL, 16 17 ip_output.c:292: return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev, 18 19 ip_output.c:306: return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, 20 21 ipmr.c:1319: NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev, 22 23 raw.c:375: err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, 24 25 xfrm4_input.c:63: NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, 26 27 xfrm4_output.c:89: return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, 28 29 [root@arch ipv4]# pwd 30 31 /root/linux-2.6.30/net/ipv4 32 33 [root@arch ipv4]#
补充说明:NF_HOOK宏即是netfilter系统在报文处理的地方插入hook的功能宏,其实现如下
1 #ifdef CONFIG_NETFILTER 2 3 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) 4 5 NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, INT_MIN) 6 7 #else 8 9 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) 10 11 #endif
在上面的grep结果中,ip_input.c在INPUT方向上利用NF_HOOK放置报文处理hook的实现为
1 /* 2 3 * Deliver IP Packets to the higher protocol layers. 4 5 */ 6 7 int ip_local_deliver(struct sk_buff *skb) 8 9 { 10 11 /* 12 13 * Reassemble IP fragments. 14 15 */ 16 17 18 19 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 20 21 if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) 22 23 return 0; 24 25 } 26 27 28 29 return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, 30 31 ip_local_deliver_finish); 32 33 } 34 35 NF_HOOK_THRESH宏实现如下 36 37 #ifdef CONFIG_NETFILTER 38 39 #define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh) 40 41 ({int __ret; 42 43 if ((__ret=nf_hook_thresh(pf, hook, (skb), indev, outdev, okfn, thresh, 1)) == 1)44 45 __ret = (okfn)(skb); 46 47 __ret;}) 48 49 #endif
可见,协议栈是在将ip报文向上一层的协议处理层上送报文的时刻,调用netfilter的hook函数的。若系统没有配置netfilter,NF_HOOK实际将直接调用协议栈自身的上送函数;反之若配置了netfilter,则先经过netfilter处理之后再根据结果做区分处理。
nf_hook_thresh()的实现又做了进一步区分实现,如下
1 #ifdef CONFIG_NETFILTER 2 3 static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, 4 5 struct sk_buff *skb, 6 7 struct net_device *indev, 8 9 struct net_device *outdev, 10 11 int (*okfn)(struct sk_buff *), int thresh, 12 13 int cond) 14 15 { 16 17 if (!cond) 18 19 return 1; 20 21 #ifndef CONFIG_NETFILTER_DEBUG 22 23 if (list_empty(&nf_hooks[pf][hook])) 24 25 return 1; 26 27 #endif 28 29 return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh); 30 31 } 32 33 #else 34 35 static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, 36 37 struct sk_buff *skb, 38 39 struct net_device *indev, 40 41 struct net_device *outdev, 42 43 int (*okfn)(struct sk_buff *), int thresh, 44 45 int cond) 46 47 { 48 49 return okfn(skb); 50 51 } 52 53 #endif
可见最终在netfilter系统中,hook调用的入口是nf_hook_slow,其实现很直观,如下
1 /* Returns 1 if okfn() needs to be executed by the caller, 2 3 * -EPERM for NF_DROP, 0 otherwise. */ 4 5 int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, 6 7 struct net_device *indev, 8 9 struct net_device *outdev, 10 11 int (*okfn)(struct sk_buff *), 12 13 int hook_thresh) 14 15 { 16 17 struct list_head *elem; 18 19 unsigned int verdict; 20 21 int ret = 0; 22 23 24 25 /* We may already have this, but read-locks nest anyway */ 26 27 rcu_read_lock(); 28 29 30 31 elem = &nf_hooks[pf][hook]; // pf -> NFPROTO_IPV4, NFPROTO_ARP,NFPROTO_BRIDGE,PF_INET etc 32 33 next_hook: // hook -> 0~7 34 35 verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, 36 37 outdev, &elem, okfn, hook_thresh); 38 39 if (verdict == NF_ACCEPT || verdict == NF_STOP) { 40 41 ret = 1; 42 43 } else if (verdict == NF_DROP) { 44 45 kfree_skb(skb); 46 47 ret = -EPERM; 48 49 } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { 50 51 if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn, 52 53 verdict >> NF_VERDICT_BITS)) 54 55 goto next_hook; 56 57 } 58 59 rcu_read_unlock(); 60 61 return ret; 62 63 }
基本流程就是利用pf/hook在nf_hooks[][]中找到前期注册的hook,然后遍历调用hook中各个处理函数,根据hook处理函数返回结果,对当前报文做区分处理。其实按优先级进行遍历,不体现在调用的地方,而是在注册的地方,调用时只是按照已经按优先级排序好的顺序依次调用各个函数而已。nf_iterate()函数实现如下
1 unsigned int nf_iterate(struct list_head *head, 2 3 struct sk_buff *skb, 4 5 unsigned int hook, 6 7 const struct net_device *indev, 8 9 const struct net_device *outdev, 10 11 struct list_head **i, 12 13 int (*okfn)(struct sk_buff *), 14 15 int hook_thresh) 16 17 { 18 19 unsigned int verdict; 20 21 22 23 /* 24 25 * The caller must not block between calls to this 26 27 * function because of risk of continuing from deleted element. 28 29 */ 30 31 list_for_each_continue_rcu(*i, head) { 32 33 struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; 34 35 36 37 if (hook_thresh > elem->priority) 38 39 continue; 40 41 42 43 /* Optimization: we don‘t need to hold module 44 45 reference here, since function can‘t sleep. --RR */ 46 47 verdict = elem->hook(hook, skb, indev, outdev, okfn); 48 49 if (verdict != NF_ACCEPT) { 50 51 #ifdef CONFIG_NETFILTER_DEBUG 52 53 if (unlikely((verdict & NF_VERDICT_MASK) 54 55 > NF_MAX_VERDICT)) { 56 57 NFDEBUG("Evil return from %p(%u).\n", 58 59 elem->hook, hook); 60 61 continue; 62 63 } 64 65 #endif 66 67 if (verdict != NF_REPEAT) 68 69 return verdict; 70 71 *i = (*i)->prev; 72 73 } 74 75 } 76 77 return NF_ACCEPT; 78 79 }
注意红色部分,可以看出,仅当当前hook函数返回结果为NF_ACCEPT时,才继续处理本类型hook中下一个优先级的hook函数;若本次的hook函数返回NF_REPEAT,则将当前packet在本次hook函数上再执行一次;其他情况直接返回hook函数执行结果。
nf_iterate()返回到nf_hook_slow()函数之后,nf_hook_slow()即根据执行结果做区分处理:若是ACCEPT或STOP,本packet在netfilter中的处理过程完毕,后续继续调用packet上报函数;
若是DROP,本packet将被释放丢弃,不再调用packet上报函数
若包含QUEUE,则将packet送入当前chain对应的队列中,再做对应的处理,在iptables的命令帮助中,说明如下
QUEUE means to pass the packet to userspace. (How the packet can be received by a userspace process differs by the particular queue handler. 2.4.x and 2.6.x kernels up to 2.6.13 include the ip_queue queue handler. Kernels 2.6.14 and later additionally include the nfnetlink_queue queue handler. Packets with a target of QUEUE will be sent to queue number '0' in this case. Please also see the NFQUEUE target as described later in this man page.)
基本的报文处理流程即如上面所述,接下来是具体的过滤处理实现流程,用以决定对packet的处理结果。以IPv4报文入方向过滤处理为例,具体即是
1 static struct nf_hook_ops ipt_ops[] __read_mostly = { 2 3 { 4 5 .hook = ipt_local_in_hook, 6 7 .owner = THIS_MODULE, 8 9 .pf = PF_INET, 10 11 .hooknum = NF_INET_LOCAL_IN, 12 13 .priority = NF_IP_PRI_FILTER, 14 15 } 16 17 ... 18 19 };
即ipt_local_in_hook函数,其实现即是将packet与之前添加的规则做匹配检查,并返回预先设置的结果。