soft lockup检测机制

soft lockup检测机制

soft lockup是如何检测的?它的实现位于kernel/watchdog.c

它主要是会起一个hrtimer定时器,周期性产生hrtimer interrupt,这个irq handler函数是watchdog_timer_fn()

在这个irq handler里会往migration/* kernel thread queue一个work,这个work所做的事情就是更新per cpu变量watchdog_touch_ts的值为当前系统时间戳,queue这个work是不会等这个work完成的,queue进去就会return。这个work即是softlockup_fn()

如果当前cpu卡住了,没有发生线程调度,migration/*线程将不会得到运行,所以这个work将一直不会得到处理,所以watchdog_touch_ts的时间戳将一直得不到更新,等后续hrtimer interrupt产生时,比较当前系统时间戳和watchdog_touch_ts时间戳,如果这两个的间隔大于了某一个阈值,将会打印出如下的log:

BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]

所以soft lockup表示某一个cpu卡住了,卡在某一个线程上,一直没有发生调度切换。

 

上述hrtimer和migration/*都是per cpu的,migration/*内核线程是和cpu绑定的,所以cpu core有多少个,这样的线程就会有多少个,用ps -Af |grep migration查看结果如下:

console:/proc/13 # ps -Af |grep migration
root            13     2 0 02:29:43 ?     00:00:00 [migration/0]
root            16     2 0 02:29:43 ?     00:00:00 [migration/1]
root            21     2 0 02:29:43 ?     00:00:00 [migration/2]
root            26     2 0 02:29:43 ?     00:00:00 [migration/3]

 

4.19/kernel/watchdog.c

/*
 * Store the current timestamp (as returned by get_timestamp()) into this
 * CPU's watchdog_touch_ts per-CPU variable.
 */
static void __touch_watchdog(void)
{
    unsigned long stamp = get_timestamp();

    __this_cpu_write(watchdog_touch_ts, stamp);
}

 

/*
 * Work function handed to stop_one_cpu_nowait() by watchdog_timer_fn().
 *
 * Copies this CPU's hrtimer_interrupts counter into soft_lockup_hrtimer_cnt,
 * refreshes the per-CPU touch timestamp, and then signals this CPU's
 * softlockup_completion.
 *
 * @data: unused.
 *
 * Always returns 0.
 */
static int softlockup_fn(void *data)
{
    unsigned long irq_count = __this_cpu_read(hrtimer_interrupts);

    __this_cpu_write(soft_lockup_hrtimer_cnt, irq_count);

    /* Refresh the timestamp that is_softlockup() is later compared against. */
    __touch_watchdog();

    /* Mark this round of work as finished. */
    complete(this_cpu_ptr(&softlockup_completion));

    return 0;
}

 

/*
 * Decide whether the given touch timestamp is overdue.
 *
 * @touch_ts: timestamp to compare against the current get_timestamp() value.
 *
 * Returns 0 when the soft watchdog is not enabled, when watchdog_thresh is
 * zero, or when the current time is not after
 * touch_ts + get_softlockup_thresh(). Otherwise returns the elapsed time
 * (now - touch_ts).
 */
static int is_softlockup(unsigned long touch_ts)
{
    unsigned long now = get_timestamp();

    /* Nothing to report while the soft watchdog is switched off. */
    if (!(watchdog_enabled & SOFT_WATCHDOG_ENABLED) || !watchdog_thresh)
        return 0;

    /* Still within the allowed window. */
    if (!time_after(now, touch_ts + get_softlockup_thresh()))
        return 0;

    /* Warn about unreasonable delays. */
    return now - touch_ts;
}

 

 

/*
 * hrtimer callback of the soft-lockup detector (abridged excerpt).
 *
 * On each invocation it:
 *   1. queues softlockup_fn() for the current CPU via stop_one_cpu_nowait(),
 *      but only when the previously queued work has already signalled
 *      softlockup_completion;
 *   2. calls is_softlockup() and, if it reports a non-zero duration, prints
 *      the "BUG: soft lockup" message followed by a register or stack dump.
 *
 * NOTE(review): this excerpt is incomplete. The declarations and
 * initialisation of touch_ts, duration and regs are not shown, and the
 * function's return statement and closing brace are missing; consult the
 * full kernel source before relying on it.
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
    /* kick the softlockup detector */
    if (completion_done(this_cpu_ptr(&softlockup_completion))) {
        /* Re-arm the completion; softlockup_fn() completes it again. */
        reinit_completion(this_cpu_ptr(&softlockup_completion));
        stop_one_cpu_nowait(smp_processor_id(),
                softlockup_fn, NULL,
                this_cpu_ptr(&softlockup_stop_work));
    }

    /* Non-zero means the touch timestamp has not been refreshed in time. */
    duration = is_softlockup(touch_ts);
    if (unlikely(duration)) {
        pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
            smp_processor_id(), duration,
            current->comm, task_pid_nr(current));
        /* Prefer dumping the saved registers when they are available. */
        if (regs)
            show_regs(regs);
        else
            dump_stack();
}

 

上一篇:linux下安装svn服务端


下一篇:Linux_scp