未缓存的IPv6路由项链表

内核将未缓存的IPv6路由项组成一个链表rt6_uncached_list,其为一个每处理器变量。

/* A list of rt6_info entries that are not owned by the fib6 tree, paired
 * with the spinlock that serialises access to that list.
 */
struct uncached_list {
    /* Held (via spin_lock_bh()) while entries are added to, removed from,
     * or walked on @head.
     */
    spinlock_t      lock;
    /* List head; entries are linked in through rt6_info::rt6i_uncached. */
    struct list_head    head;
};

/* One uncached_list per CPU; each CPU's list head and lock are initialised
 * in ip6_route_init().
 */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

在IPv6路由初始化函数中,初始化rt6_uncached_list链表头和自旋锁。

int __init ip6_route_init(void)
{

    for_each_possible_cpu(cpu) {
        struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

        INIT_LIST_HEAD(&ul->head);
        spin_lock_init(&ul->lock);
    }

函数rt6_uncached_list_add和函数rt6_uncached_list_del分别用于将路由缓存rt添加到rt6_uncached_list链表上,或者从链表中删除。

/* Link @rt onto the calling CPU's uncached list.
 *
 * The list that was used is recorded in rt->rt6i_uncached_list so that
 * rt6_uncached_list_del() can later take the matching lock.  This helper
 * does not touch rt6_stats->fib_rt_uncache; callers increment that counter
 * themselves.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
    struct uncached_list *this_cpu_list;

    this_cpu_list = raw_cpu_ptr(&rt6_uncached_list);

    /* Remember which per-CPU list owns this entry before publishing it. */
    rt->rt6i_uncached_list = this_cpu_list;

    spin_lock_bh(&this_cpu_list->lock);
    list_add_tail(&rt->rt6i_uncached, &this_cpu_list->head);
    spin_unlock_bh(&this_cpu_list->lock);
}
/* Unlink @rt from the uncached list it was added to, if it is on one.
 *
 * An entry whose rt6i_uncached node is empty was never added and is left
 * alone.  Otherwise the entry is removed under the owning list's lock and
 * rt6_stats->fib_rt_uncache is decremented for the netns of rt->dst.dev.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
    struct uncached_list *owner;
    struct net *net;

    if (list_empty(&rt->rt6i_uncached))
        return;

    /* The list recorded at add time, not necessarily this CPU's list. */
    owner = rt->rt6i_uncached_list;
    net = dev_net(rt->dst.dev);

    spin_lock_bh(&owner->lock);
    list_del(&rt->rt6i_uncached);
    atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
    spin_unlock_bh(&owner->lock);
}

初始化

在IPv6路由缓存分配函数中,初始化其rt6i_uncached链表指针。

/* Reset the rt6_info-specific part of a freshly allocated route.
 *
 * Everything in *rt that follows the embedded dst_entry is zeroed; the
 * dst_entry itself is left untouched.  The rt6i_uncached node is then set
 * up as an empty list, so list_empty() on it is true until the route is
 * added to an uncached list.
 */
static void rt6_info_init(struct rt6_info *rt)
{
    size_t tail_len = sizeof(*rt) - sizeof(rt->dst);

    /* &rt->dst + 1 is the first byte after the embedded dst_entry. */
    memset(&rt->dst + 1, 0, tail_len);
    INIT_LIST_HEAD(&rt->rt6i_uncached);
}
/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags)
{
    struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
                    1, DST_OBSOLETE_FORCE_CHK, flags);

    if (rt) {
        rt6_info_init(rt);
        atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
    }

    return rt;
}

路由查找

与IPv4不同,IPv6的出口路由和入口路由都使用函数ip6_pol_route实现,区别在于传入的接口索引参数不同。

/* Input-side policy route lookup.
 *
 * Thin wrapper around ip6_pol_route() that passes the flow's input
 * interface index (fl6->flowi6_iif) as the interface argument.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
                        struct fib6_table *table,
                        struct flowi6 *fl6,
                        const struct sk_buff *skb,
                        int flags)
{
    int in_ifindex = fl6->flowi6_iif;

    return ip6_pol_route(net, table, in_ifindex, fl6, skb, flags);
}
/* Output-side policy route lookup.
 *
 * Thin wrapper around ip6_pol_route() that passes the flow's output
 * interface index (fl6->flowi6_oif) as the interface argument.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
                         struct fib6_table *table,
                         struct flowi6 *fl6,
                         const struct sk_buff *skb,
                         int flags)
{
    int out_ifindex = fl6->flowi6_oif;

    return ip6_pol_route(net, table, out_ifindex, fl6, skb, flags);
}

首先,函数rt6_find_cached_rt在fib查询结果的exception表中查找缓存的路由,如果找到,则返回此值。

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                   int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags)
{
    struct fib6_result res = {};
    struct rt6_info *rt = NULL;

    WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && !rcu_read_lock_held());

    strict |= flags & RT6_LOOKUP_F_IFACE;
    strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
    if (net->ipv6.devconf_all->forwarding == 0)
        strict |= RT6_LOOKUP_F_REACHABLE;

    rcu_read_lock();

    fib6_table_lookup(net, table, oif, fl6, &res, strict);
    if (res.f6i == net->ipv6.fib6_null_entry)
        goto out;

    fib6_select_path(net, &res, fl6, oif, false, skb, strict);

    /*Search through exception table */
    rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);

否则,判断流结构flowi6是否设置了FLOWI_FLAG_KNOWN_NH,并且查询结果的下一跳没有设置网关地址族(fib_nh_gw_family为0),这种已知下一跳的前提下查找路由的情况不常见。而且,由于fl6结构的目的地址成员daddr使用的是下一跳地址,而不是skb报文中的目的地址,此时创建的路由缓存项不会缓存在fib6树中,而是将其添加到uncached_list链表。

    } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                !res.nh->fib_nh_gw_family)) {
        /* Create a RTF_CACHE clone which will not be
         * owned by the fib6 tree.  It is for the special case where
         * the daddr in the skb during the neighbor look-up is different
         * from the fl6->daddr used to look-up route here.
         */
        rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

        if (rt) {
            /* 1 refcnt is taken during ip6_rt_cache_alloc().
             * As rt6_uncached_list_add() does not consume refcnt,
             * this refcnt is always returned to the caller even
             * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
             */
            rt6_uncached_list_add(rt);
            atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
            rcu_read_unlock();

            return rt;
        }

如果以上两种情况都不成立,则先由rt6_get_pcpu_route获取每处理器路由缓存项,获取不到时再由rt6_make_pcpu_route分配;其过程中会缓存该路由项,因此不必加到uncached_list链表。

    } else {
        /* Get a percpu copy */
        local_bh_disable();
        rt = rt6_get_pcpu_route(&res);
        if (!rt)
            rt = rt6_make_pcpu_route(net, &res);

        local_bh_enable();
    }

例如,对于SRv6,其中路由查询前,如果下一跳地址有效,流结构的目的地设置为下一跳地址,并且设置FLOWI_FLAG_KNOWN_NH标志。

static int seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
            u32 tbl_id, bool local_delivery)
{
    struct net *net = dev_net(skb->dev);
    struct ipv6hdr *hdr = ipv6_hdr(skb);
    int flags = RT6_LOOKUP_F_HAS_SADDR;
    struct dst_entry *dst = NULL;
    struct rt6_info *rt;
    struct flowi6 fl6;
    int dev_flags = 0;

    fl6.flowi6_iif = skb->dev->ifindex;
    fl6.daddr = nhaddr ? *nhaddr : hdr->daddr;
    fl6.saddr = hdr->saddr;
    fl6.flowlabel = ip6_flowinfo(hdr);
    fl6.flowi6_mark = skb->mark;
    fl6.flowi6_proto = hdr->nexthdr;

    if (nhaddr)
        fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

    if (!tbl_id) {
        dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags);
    } else {
        struct fib6_table *table;

        table = fib6_get_table(net, tbl_id);
        if (!table)
            goto out;

        rt = ip6_pol_route(net, table, 0, &fl6, skb, flags);
        dst = &rt->dst;

ICMPv6

对于使用ICMPv6的IPv6邻居发现、IGMP和MLD协议,利用icmp6_dst_alloc分配路由缓存项。对于这类报文,仅限于本地网络,报文的下一跳地址和目的地址相同,这里不查询fib6表,直接分配缓存项,导致新分配的路由在fib树中没有缓存位置,所以将其添加到uncached_list链表。

struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6)
{
    struct dst_entry *dst;
    struct rt6_info *rt;
    struct inet6_dev *idev = in6_dev_get(dev);
    struct net *net = dev_net(dev);

    if (unlikely(!idev))
        return ERR_PTR(-ENODEV);

    rt = ip6_dst_alloc(net, dev, 0);
    if (unlikely(!rt)) {
        in6_dev_put(idev);
        dst = ERR_PTR(-ENOMEM);
        goto out;
    }
    rt->dst.input = ip6_input;
    rt->dst.output  = ip6_output;
    rt->rt6i_gateway  = fl6->daddr;
    rt->rt6i_dst.addr = fl6->daddr;
    rt->rt6i_dst.plen = 128;
    rt->rt6i_idev     = idev;
    dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

    /* Add this dst into uncached_list so that rt6_disable_ip() can
     * do proper release of the net_device
     */
    rt6_uncached_list_add(rt);
    atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

uncached路由缓存清除

当接口被注销或者down时,由函数rt6_uncached_list_flush_dev清除设备相关的uncached路由缓存。

static int addrconf_ifdown(struct net_device *dev, bool unregister)
{
    unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
    struct net *net = dev_net(dev);
    struct inet6_dev *idev;

    rt6_disable_ip(dev, event);

/* Tear down IPv6 routing state that refers to @dev.
 *
 * @event is forwarded unchanged to rt6_sync_down_dev(); addrconf_ifdown()
 * passes NETDEV_UNREGISTER or NETDEV_DOWN.  After that, uncached routes
 * that reference @dev are flushed, followed by neigh_ifdown() on nd_tbl.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
    struct net *net = dev_net(dev);

    rt6_sync_down_dev(dev, event);
    rt6_uncached_list_flush_dev(net, dev);
    neigh_ifdown(&nd_tbl, dev);
}

遍历所有的rt6_uncached_list中的路由缓存,将其中与操作设备相等的缓存项的设备换成黑洞设备blackhole_netdev,并且将路由项的inet6_dev换成回环接口对应的inet6_dev。

可见,实际上并没有将路由缓存项从uncached_list链表中删除。

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
    struct net_device *loopback_dev = net->loopback_dev;

    if (dev == loopback_dev)
        return;

    for_each_possible_cpu(cpu) {
        struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
        struct rt6_info *rt;

        spin_lock_bh(&ul->lock);
        list_for_each_entry(rt, &ul->head, rt6i_uncached) {
            struct inet6_dev *rt_idev = rt->rt6i_idev;
            struct net_device *rt_dev = rt->dst.dev;

            if (rt_idev->dev == dev) {
                rt->rt6i_idev = in6_dev_get(loopback_dev);
                in6_dev_put(rt_idev);
            }

            if (rt_dev == dev) {
                rt->dst.dev = blackhole_netdev;
                dev_hold(rt->dst.dev);
                dev_put(rt_dev);
            }

在销毁路由缓存时,由函数rt6_uncached_list_del检测其是否在uncached_list链表上,为真将其移除,并且递减net->ipv6.rt6_stats->fib_rt_uncache计数,这是在rt6_uncached_list_del内部完成。与此不同,此计数的递增是在rt6_uncached_list_add外部完成。

/* Release everything an IPv6 dst holds when it is destroyed.
 *
 * In order: drop the metrics, unlink the route from its uncached list (a
 * no-op when it is not on one), drop the inet6_dev reference, and release
 * the fib6_info the route was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
    struct rt6_info *route = (struct rt6_info *)dst;
    struct inet6_dev *in6dev;

    ip_dst_metrics_put(dst);
    rt6_uncached_list_del(route);

    /* Read rt6i_idev only after the entry is off the uncached list:
     * rt6_uncached_list_flush_dev() may replace it while the entry is
     * still listed.
     */
    in6dev = route->rt6i_idev;
    if (in6dev) {
        route->rt6i_idev = NULL;
        in6_dev_put(in6dev);
    }

    /* Atomically detach ->from, then release whatever was there. */
    fib6_info_release(xchg((__force struct fib6_info **)&route->from, NULL));
}

uncached路由缓存项判断

对于出口路由查找,按照代码中的注释,如果路由缓存已经加入uncached_list链表,说明缓存引用计数已经递增。但是,uncached_list的添加并没有增加路由缓存引用计数,只有在初始分配时设置的引用计数1。

对于uncached的路由缓存,引用计数为1即可,在使用完成之后执行释放操作,这里不需要再次增加引用计数,感觉这个注释有问题,但是代码逻辑并没有问题。对于需要缓存的路由,执行dst_hold_safe增加引用计数,计数应当为2,这样在使用完之后,进行释放时,将计数减为1,并没有实际释放,路由缓存还在。

struct dst_entry *ip6_route_output_flags(struct net *net,
                     const struct sock *sk, struct flowi6 *fl6, int flags)
{
        struct dst_entry *dst;
        struct rt6_info *rt6;

        rcu_read_lock();
        dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
        rt6 = (struct rt6_info *)dst;
        /* For dst cached in uncached_list, refcnt is already taken. */
        if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
                dst = &net->ipv6.ip6_null_entry->dst;
                dst_hold(dst);
        }

以下为路由合法性检测函数ip6_dst_check,如果路由缓存项具有RTF_PCPU标志(其可能在函数ip6_rt_pcpu_alloc中设置),并且同时设置了缓存的from成员(fib6_info结构),则调用rt6_dst_from_check执行检查。另外,根据之前的介绍,有两种情况将路由缓存加到了uncached_list链表上:一是ICMPv6报文发送时生成的缓存;二是已知下一跳地址的情况下创建的路由缓存。对于第一种情况,路由缓存直接生成,from成员为空,调用rt6_check进行检查;对于第二种情况,调用rt6_dst_from_check检查。

/* Validate a cached IPv6 dst against @cookie.
 *
 * Routes with a non-zero sernum are judged by rt6_is_valid() alone.
 * Otherwise, under rcu_read_lock(), a route that has a ->from and is either
 * an RTF_PCPU copy or sits on an uncached list is checked with
 * rt6_dst_from_check(); every other route is checked with rt6_check().
 *
 * Returns @dst (or whatever the check helper returns) when still valid,
 * NULL otherwise.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
    struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
    struct dst_entry *checked;
    struct fib6_info *origin;

    if (rt->sernum) {
        if (rt6_is_valid(rt))
            return dst;
        return NULL;
    }

    rcu_read_lock();

    /* All IPV6 dsts are created with ->obsolete set to the value
     * DST_OBSOLETE_FORCE_CHK which forces validation calls down
     * into this function always.
     */
    origin = rcu_dereference(rt->from);

    if (origin && ((rt->rt6i_flags & RTF_PCPU) ||
               unlikely(!list_empty(&rt->rt6i_uncached)))) {
        checked = rt6_dst_from_check(rt, origin, cookie);
    } else {
        checked = rt6_check(rt, origin, cookie);
    }

    rcu_read_unlock();

    return checked;
}

如果路由查找标志flags中没有设置RT6_LOOKUP_F_DST_NOREF,即查找时对路由缓存增加了引用计数,则由函数ip6_rt_put递减计数。另外,如果缓存位于uncached_list链表上,表明此路由不会被缓存,其引用计数总是返回给调用者,因此也执行引用计数的递减。

/* Release @rt only when the caller actually holds a reference on it:
 * either the lookup was done without RT6_LOOKUP_F_DST_NOREF, or the dst
 * is on an uncached list.
 */
static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
{
    /* NOREF lookup of a route that is not on an uncached list: nothing
     * to drop.
     */
    if ((flags & RT6_LOOKUP_F_DST_NOREF) && list_empty(&rt->rt6i_uncached))
        return;

    ip6_rt_put(rt);
}

内核版本 5.10

上一篇:VMware CeotOS 网络配置


下一篇:centOS中ifcfg-ens33配置详解