softlockup(watchdog)用于检测系统调度是否正常,即是否出现了软锁(soft lockup)。发生softlockup时,内核无法进行调度,但仍能响应中断,对用户的表现可能为:能ping通,但无法登录系统,无法进行正常操作。
其基本原理为:为每个CPU启动一个内核线程(watchdog/x),此线程为优先级最高的实时线程。该线程每次得到调度时都会更新相应的计数(时间戳),并在首次运行时启动一个周期性的高精度定时器;定时器每次到期时检查相应的时间戳,如果超过指定时间仍未更新,则说明这段时间内该CPU上没有发生调度(因为此线程优先级最高,只要能够调度就一定会运行),此时打印相应告警,或根据配置进入panic流程。
基本代码分析(2.6.32)
rest_init->kernel_init->lockup_detector_init->cpu_callback->watchdog_prepare_cpu(初始化watchdog定时器):
点击(此处)折叠或打开
-
static int watchdog_prepare_cpu(int cpu)
-
{
-
struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
-
WARN_ON(per_cpu(softlockup_watchdog, cpu));
-
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);//初始化高精度定时器
-
hrtimer->function = watchdog_timer_fn;//设置定时器处理函数
-
-
return 0;
-
}
看门狗定时器处理函数:
点击(此处)折叠或打开
-
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
-
{
-
//获取计数watchdog_touch_ts,该计数在watchdog内核线程被调度时更新
-
unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
-
struct pt_regs *regs = get_irq_regs();
-
int duration;
-
-
/* kick the hardlockup detector */
-
//增加中断计数,证明没有发生硬锁(关中断死锁)
-
watchdog_interrupt_count();
-
-
/* kick the softlockup detector */
-
//唤醒wathdog内核线程
-
wake_up_process(__get_cpu_var(softlockup_watchdog));
-
-
/* .. and repeat */
-
//重启定时器
-
hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
-
if (touch_ts == 0) {
-
if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
-
/*
-
* If the time stamp was touched atomically
-
* make sure the scheduler tick is up to date.
-
*/
-
__get_cpu_var(softlockup_touch_sync) = false;
-
sched_clock_tick();
-
}
-
__touch_watchdog();
-
return HRTIMER_RESTART;
-
}
-
-
/* check for a softlockup
-
* This is done by making sure a high priority task is
-
* being scheduled. The task touches the watchdog to
-
* indicate it is getting cpu time. If it hasn't then
-
* this is a good indication some task is hogging the cpu
-
*/
-
//判断是否发生了软锁,原理是判断touch_ts(时间戳)是否超过一定时间没有更新
-
duration = is_softlockup(touch_ts);
-
if (unlikely(duration)) {
-
/* only warn once */
-
if (__get_cpu_var(soft_watchdog_warn) == true)
-
return HRTIMER_RESTART;
-
//发生了软锁后,进行一些列的信息记录和告警。
-
printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
-
smp_processor_id(), duration,
-
current->comm, task_pid_nr(current));
-
print_modules();
-
print_irqtrace_events(current);
-
if (regs)
-
show_regs(regs);
-
else
-
dump_stack();
-
//如果配置了softlockup_panic(proc中配置),则panic
-
if (softlockup_panic)
-
panic("softlockup: hung tasks");
-
__get_cpu_var(soft_watchdog_warn) = true;
-
} else
-
__get_cpu_var(soft_watchdog_warn) = false;
-
-
return HRTIMER_RESTART;
-
}
启动看门狗,即创建watchdog内核线程。
点击(此处)折叠或打开
-
static int watchdog_enable(int cpu)
-
{
-
struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
-
int err = 0;
-
-
/* enable the perf event */
-
err = watchdog_nmi_enable(cpu);
-
-
/* Regardless of err above, fall through and start softlockup */
-
-
/* create the watchdog thread */
-
if (!p) {
-
//创建watchdog内核线程
-
p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
-
if (IS_ERR(p)) {
-
printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
-
if (!err)
-
/* if hardlockup hasn't already set this */
-
err = PTR_ERR(p);
-
goto out;
-
}
-
kthread_bind(p, cpu);
-
per_cpu(watchdog_touch_ts, cpu) = 0;
-
per_cpu(softlockup_watchdog, cpu) = p;
-
wake_up_process(p);
-
}
-
-
out:
-
return err;
-
}
watchdog内核线程执行主函数,主要是要更新计数(时间戳)
点击(此处)折叠或打开
-
static int watchdog(void *unused)
-
{
-
//设置为最高优先级
-
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
-
//设置为实时线程
-
sched_setscheduler(current, SCHED_FIFO, ¶m);
-
-
/* initialize timestamp */
-
//初始化计数(时间戳)
-
__touch_watchdog();
-
-
/* kick off the timer for the hardlockup detector */
-
/* done here because hrtimer_start can only pin to smp_processor_id() */
-
//启动定时器,用于检测是否发生软锁
-
hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
-
HRTIMER_MODE_REL_PINNED);
-
//睡眠
-
set_current_state(TASK_INTERRUPTIBLE);
-
/*
-
* Run briefly once per second to reset the softlockup timestamp.
-
* If this gets delayed for more than 60 seconds then the
-
* debug-printout triggers in watchdog_timer_fn().
-
*/
-
while (!kthread_should_stop()) {
-
//更新计数
-
__touch_watchdog();
-
schedule();
-
-
if (kthread_should_stop())
-
break;
-
-
set_current_state(TASK_INTERRUPTIBLE);
-
}
-
__set_current_state(TASK_RUNNING);
-
-
return 0;
-
}
判断是否发生软锁:is_softlockup
点击(此处)折叠或打开
-
static int is_softlockup(unsigned long touch_ts)
-
{
-
unsigned long now = get_timestamp(smp_processor_id());
-
-
/* Warn about unreasonable delays: */
-
//检测计数多久没有更新了,如果超过了60s,则表示发生了软锁
-
if (time_after(now, touch_ts + softlockup_thresh))
-
return now - touch_ts;
-
-
return 0;
-
}
本文转自 guowang327 51CTO博客,原文链接:http://blog.51cto.com/guowang327/1962741,如需转载请自行联系原作者