内核将未缓存的IPv6路由项组成一个链表rt6_uncached_list,其为一个每处理器变量。
/* Per-CPU bucket of IPv6 route entries that are not owned by the fib6 tree. */
struct uncached_list {
/* Taken with spin_lock_bh() by the add/del helpers around list updates. */
spinlock_t lock;
/* List head; rt6_info entries are linked in through their rt6i_uncached member. */
struct list_head head;
};
/* One uncached_list instance per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
在IPv6路由初始化函数中,初始化rt6_uncached_list链表头和自旋锁。
int __init ip6_route_init(void)
{
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
INIT_LIST_HEAD(&ul->head);
spin_lock_init(&ul->lock);
}
函数rt6_uncached_list_add和函数rt6_uncached_list_del分别用于将路由缓存rt添加到rt6_uncached_list链表上,或者从链表中删除。
/*
 * Link @rt onto the uncached list of the CPU this code is running on.
 *
 * The chosen per-CPU list is remembered in rt->rt6i_uncached_list so that
 * the entry can later be unlinked under the same lock. No statistics are
 * touched here; callers bump fib_rt_uncache themselves.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *pcpu_list = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = pcpu_list;

	spin_lock_bh(&pcpu_list->lock);
	list_add_tail(&rt->rt6i_uncached, &pcpu_list->head);
	spin_unlock_bh(&pcpu_list->lock);
}
/*
 * Unlink @rt from the uncached list it was added to, if any.
 *
 * When the entry is linked, it is removed under the owning list's lock and
 * the per-netns fib_rt_uncache counter is decremented. When the entry's
 * list node is empty (never added), this is a no-op.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	struct uncached_list *owner;
	struct net *net;

	/* Not on any uncached list: nothing to undo. */
	if (list_empty(&rt->rt6i_uncached))
		return;

	owner = rt->rt6i_uncached_list;
	net = dev_net(rt->dst.dev);

	spin_lock_bh(&owner->lock);
	list_del(&rt->rt6i_uncached);
	atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
	spin_unlock_bh(&owner->lock);
}
初始化
在IPv6路由缓存分配函数中,初始化其rt6i_uncached链表指针。
/*
 * Reset the rt6_info-specific part of a freshly allocated route.
 *
 * Everything that follows the embedded dst_entry is zeroed and the
 * rt6i_uncached node is initialised as an empty list, which is what
 * list_empty() checks elsewhere rely on.
 * NOTE(review): the pointer arithmetic assumes dst is the first member of
 * struct rt6_info -- confirm against the struct definition.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	memset(&rt->dst + 1, 0, sizeof(*rt) - sizeof(rt->dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
/*
 * Allocate a dst with the per-netns ip6_dst_ops.
 *
 * On success the rt6_info part is initialised and the netns fib_rt_alloc
 * counter is incremented. Returns NULL when dst_alloc() fails.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags)
{
	struct rt6_info *rt;

	rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1,
		       DST_OBSOLETE_FORCE_CHK, flags);
	if (!rt)
		return NULL;

	rt6_info_init(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

	return rt;
}
路由查找
与IPv4不同,IPv6的出口路由和入口路由都使用函数ip6_pol_route实现,区别在于传入的接口索引参数不同。
/*
 * Input-path policy route lookup: delegates to ip6_pol_route() using the
 * flow's incoming interface index (fl6->flowi6_iif).
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
					       struct fib6_table *table,
					       struct flowi6 *fl6,
					       const struct sk_buff *skb,
					       int flags)
{
	int ifindex = fl6->flowi6_iif;

	return ip6_pol_route(net, table, ifindex, fl6, skb, flags);
}
/*
 * Output-path policy route lookup: delegates to ip6_pol_route() using the
 * flow's outgoing interface index (fl6->flowi6_oif).
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
						struct fib6_table *table,
						struct flowi6 *fl6,
						const struct sk_buff *skb,
						int flags)
{
	int ifindex = fl6->flowi6_oif;

	return ip6_pol_route(net, table, ifindex, fl6, skb, flags);
}
首先,函数rt6_find_cached_rt在fib查询结果的exception表中查找缓存的路由,如果找到,则返回此值。
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags)
{
struct fib6_result res = {};
struct rt6_info *rt = NULL;
WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && !rcu_read_lock_held());
strict |= flags & RT6_LOOKUP_F_IFACE;
strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
if (net->ipv6.devconf_all->forwarding == 0)
strict |= RT6_LOOKUP_F_REACHABLE;
rcu_read_lock();
fib6_table_lookup(net, table, oif, fl6, &res, strict);
if (res.f6i == net->ipv6.fib6_null_entry)
goto out;
fib6_select_path(net, &res, fl6, oif, false, skb, strict);
/*Search through exception table */
rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
否则,判断流结构flowi6是否设置了FLOWI_FLAG_KNOWN_NH标志,并且下一跳没有设置网关地址族(fib_nh_gw_family为0),这种在已知下一跳的前提下查找路由的情况并不常见。而且,由于fl6结构的目的地址成员daddr使用的是下一跳地址,而不是skb报文中的目的地址,此时创建的路由缓存项不会缓存在fib6树中,而是将其添加到uncached_list链表。
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
!res.nh->fib_nh_gw_family)) {
/* Create a RTF_CACHE clone which will not be
* owned by the fib6 tree. It is for the special case where
* the daddr in the skb during the neighbor look-up is different
* from the fl6->daddr used to look-up route here.
*/
rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
if (rt) {
/* 1 refcnt is taken during ip6_rt_cache_alloc().
* As rt6_uncached_list_add() does not consume refcnt,
* this refcnt is always returned to the caller even
* if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
rt6_uncached_list_add(rt);
atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
rcu_read_unlock();
return rt;
}
如果以上两种情况都没有成立,分配每处理器路由缓存项,其过程中将缓存路由项,不必加到uncached_list链表。
} else {
/* Get a percpu copy */
local_bh_disable();
rt = rt6_get_pcpu_route(&res);
if (!rt)
rt = rt6_make_pcpu_route(net, &res);
local_bh_enable();
}
例如,对于SRv6,在路由查询前,如果下一跳地址有效,则将流结构的目的地址设置为下一跳地址,并且设置FLOWI_FLAG_KNOWN_NH标志。
static int seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
u32 tbl_id, bool local_delivery)
{
struct net *net = dev_net(skb->dev);
struct ipv6hdr *hdr = ipv6_hdr(skb);
int flags = RT6_LOOKUP_F_HAS_SADDR;
struct dst_entry *dst = NULL;
struct rt6_info *rt;
struct flowi6 fl6;
int dev_flags = 0;
fl6.flowi6_iif = skb->dev->ifindex;
fl6.daddr = nhaddr ? *nhaddr : hdr->daddr;
fl6.saddr = hdr->saddr;
fl6.flowlabel = ip6_flowinfo(hdr);
fl6.flowi6_mark = skb->mark;
fl6.flowi6_proto = hdr->nexthdr;
if (nhaddr)
fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
if (!tbl_id) {
dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags);
} else {
struct fib6_table *table;
table = fib6_get_table(net, tbl_id);
if (!table)
goto out;
rt = ip6_pol_route(net, table, 0, &fl6, skb, flags);
dst = &rt->dst;
ICMPv6
对于使用ICMPv6的IPv6邻居发现和MLD协议(MLD相当于IPv4中的IGMP),利用icmp6_dst_alloc分配路由缓存项。这类报文仅限于本地网络,报文的下一跳地址和目的地址相同,因此这里不查询fib6表,而是直接分配缓存项;这导致新分配的路由在fib树中没有缓存位置,所以将其添加到uncached_list链表。
struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6)
{
struct dst_entry *dst;
struct rt6_info *rt;
struct inet6_dev *idev = in6_dev_get(dev);
struct net *net = dev_net(dev);
if (unlikely(!idev))
return ERR_PTR(-ENODEV);
rt = ip6_dst_alloc(net, dev, 0);
if (unlikely(!rt)) {
in6_dev_put(idev);
dst = ERR_PTR(-ENOMEM);
goto out;
}
rt->dst.input = ip6_input;
rt->dst.output = ip6_output;
rt->rt6i_gateway = fl6->daddr;
rt->rt6i_dst.addr = fl6->daddr;
rt->rt6i_dst.plen = 128;
rt->rt6i_idev = idev;
dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
/* Add this dst into uncached_list so that rt6_disable_ip() can
* do proper release of the net_device
*/
rt6_uncached_list_add(rt);
atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
uncached路由缓存清除
当接口被注销或者down时,由函数rt6_uncached_list_flush_dev清除设备相关的uncached路由缓存。
static int addrconf_ifdown(struct net_device *dev, bool unregister)
{
unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
struct net *net = dev_net(dev);
struct inet6_dev *idev;
rt6_disable_ip(dev, event);
/*
 * Tear down IPv6 routing state tied to @dev for netdevice @event.
 * addrconf_ifdown() calls this with NETDEV_UNREGISTER or NETDEV_DOWN.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
rt6_sync_down_dev(dev, event);
/* Handle uncached routes in dev's netns that still reference dev. */
rt6_uncached_list_flush_dev(dev_net(dev), dev);
/* NOTE(review): presumably drops nd_tbl neighbour entries for dev -- confirm. */
neigh_ifdown(&nd_tbl, dev);
}
遍历所有的rt6_uncached_list中的路由缓存,将其中与操作设备相等的缓存项的设备换成黑洞设备blackhole_netdev,并且将路由项的inet6_dev换成回环接口对应的inet6_dev。
可见,实际上并没有将路由缓存项从uncached_list链表中删除。
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
struct net_device *loopback_dev = net->loopback_dev;
if (dev == loopback_dev)
return;
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
struct rt6_info *rt;
spin_lock_bh(&ul->lock);
list_for_each_entry(rt, &ul->head, rt6i_uncached) {
struct inet6_dev *rt_idev = rt->rt6i_idev;
struct net_device *rt_dev = rt->dst.dev;
if (rt_idev->dev == dev) {
rt->rt6i_idev = in6_dev_get(loopback_dev);
in6_dev_put(rt_idev);
}
if (rt_dev == dev) {
rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(rt_dev);
}
在销毁路由缓存时,由函数rt6_uncached_list_del检测其是否在uncached_list链表上,为真将其移除,并且递减net->ipv6.rt6_stats->fib_rt_uncache计数,这是在rt6_uncached_list_del内部完成。与此不同,此计数的递增是在rt6_uncached_list_add外部完成。
/*
 * Destroy callback for an IPv6 dst: releases what the rt6_info embedding
 * @dst still holds (metrics, uncached-list membership, inet6_dev reference
 * and the fib6_info it was created from).
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *)dst;
struct fib6_info *from;
struct inet6_dev *idev;
ip_dst_metrics_put(dst);
/* Unlinks rt and decrements fib_rt_uncache only if rt is on an uncached list. */
rt6_uncached_list_del(rt);
idev = rt->rt6i_idev;
if (idev) {
/* Clear the pointer before dropping the reference it represented. */
rt->rt6i_idev = NULL;
in6_dev_put(idev);
}
/* Take and clear rt->from in one exchange, then release that fib6_info. */
from = xchg((__force struct fib6_info **)&rt->from, NULL);
fib6_info_release(from);
}
uncached路由缓存项判断
对于出口路由查找,按照代码中的注释,如果路由缓存已经加入uncached_list链表,说明缓存引用计数已经递增。但是,uncached_list的添加并没有增加路由缓存引用计数,只有在初始分配时设置的引用计数1。
对于uncached的路由缓存,引用计数为1即可,在使用完成之后执行释放操作,这里不需要再次增加引用计数,感觉这个注释有问题,但是代码逻辑并没有问题。对于需要缓存的路由,执行dst_hold_safe增加引用计数,计数应当为2,这样在使用完之后,进行释放时,将计数减为1,并没有实际释放,路由缓存还在。
struct dst_entry *ip6_route_output_flags(struct net *net,
const struct sock *sk, struct flowi6 *fl6, int flags)
{
struct dst_entry *dst;
struct rt6_info *rt6;
rcu_read_lock();
dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
rt6 = (struct rt6_info *)dst;
/* For dst cached in uncached_list, refcnt is already taken. */
if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
dst = &net->ipv6.ip6_null_entry->dst;
dst_hold(dst);
}
以下为路由合法性检测函数ip6_dst_check。如果路由缓存项具有RTF_PCPU标志(其可能在函数ip6_rt_pcpu_alloc中设置),并且同时设置了缓存的from成员(fib6_info结构),则调用rt6_dst_from_check执行检查。另外,根据之前的介绍,有两种情况将路由缓存加到了uncached_list链表上:一是ICMPv6报文发送时生成的缓存;二是已知下一跳地址的情况下创建的路由缓存。对于第一种情况,路由缓存直接生成,from成员为空,调用rt6_check进行检查;对于第二种情况,调用rt6_dst_from_check检查。
/*
 * Validate a cached IPv6 dst against @cookie.
 *
 * Routes with a non-zero sernum are judged by rt6_is_valid() alone.
 * Otherwise, under RCU, a route that has a ->from and is either a per-CPU
 * copy (RTF_PCPU) or sits on an uncached list is checked with
 * rt6_dst_from_check(); every other route goes through rt6_check().
 *
 * Returns @dst when rt6_is_valid() accepts it, NULL when it does not, or
 * whatever the delegated checker returns.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
	struct dst_entry *verdict;
	struct fib6_info *origin;

	if (rt->sernum) {
		if (rt6_is_valid(rt))
			return dst;
		return NULL;
	}

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */
	origin = rcu_dereference(rt->from);

	verdict = (origin && ((rt->rt6i_flags & RTF_PCPU) ||
			      unlikely(!list_empty(&rt->rt6i_uncached))))
			? rt6_dst_from_check(rt, origin, cookie)
			: rt6_check(rt, origin, cookie);

	rcu_read_unlock();

	return verdict;
}
如果路由缓存没有设置RT6_LOOKUP_F_DST_NOREF标志,即其使用了引用计数,由函数ip6_rt_put递减计数。另外,如果缓存位于uncached_list链表上,表明不需要缓存此路由,也执行引用计数的递减。
/* Release @rt unless the lookup was done with RT6_LOOKUP_F_DST_NOREF and
 * the route is not on an uncached list. Uncached routes are always
 * released, whatever @flags says.
 */
static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
{
	/* NOREF lookup of a route that is not on an uncached list: keep it. */
	if ((flags & RT6_LOOKUP_F_DST_NOREF) &&
	    list_empty(&rt->rt6i_uncached))
		return;

	ip6_rt_put(rt);
}
内核版本 5.10