#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/kthread.h>
+#include "watchdog_hld.h"
static DEFINE_MUTEX(watchdog_proc_mutex);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
unsigned int __read_mostly softlockup_panic =
__this_cpu_write(watchdog_touch_ts, 0);
}
-/* watchdog detector functions */
-bool is_hardlockup(void)
-{
- unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-
- if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
- return true;
-
- __this_cpu_write(hrtimer_interrupts_saved, hrint);
- return false;
-}
-
static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
/* kick the hardlockup detector */
watchdog_interrupt_count();
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+ /* test for hardlockups on the next cpu */
+ watchdog_check_hardlockup_other_cpu();
+#endif
+
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog));
#include <linux/module.h>
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
+#include "watchdog_hld.h"
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static cpumask_t __read_mostly watchdog_cpus;
+#endif
/* boot commands */
/*
}
EXPORT_SYMBOL(touch_nmi_watchdog);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ cpumask_t cpus = watchdog_cpus;
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+ unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+ return 1;
+
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ return 0;
+}
+
+void watchdog_check_hardlockup_other_cpu(void)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next cpu */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ /*
+ * "smp_rmb" matches "smp_wmb" in "watchdog_nmi_enable" and
+ * "watchdog_nmi_disable" to ensure "watchdog_nmi_touch" is
+ * updated for a cpu before any other cpu sees it is online.
+ */
+ smp_rmb();
+
+ if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+ per_cpu(watchdog_nmi_touch, next_cpu) = false;
+ return;
+ }
+
+ if (is_hardlockup_other_cpu(next_cpu)) {
+ /* only warn once */
+ if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+ return;
+
+ if (hardlockup_panic)
+ panic("Watchdog detected hard LOCKUP on cpu %u",
+ next_cpu);
+ else
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %u",
+ next_cpu);
+
+ per_cpu(hard_watchdog_warn, next_cpu) = true;
+ } else {
+ per_cpu(hard_watchdog_warn, next_cpu) = false;
+ }
+}
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ /*
+ * The new cpu will be marked online before the first hrtimer interrupt
+ * runs on it. If another cpu tests for a hardlockup on the new cpu
+ * before it has run its first hrtimer, it will get a false positive.
+ * Touch the watchdog on the new cpu to delay the first check for at
+ * least 3 sampling periods to guarantee one hrtimer has run on the new
+ * cpu.
+ */
+ per_cpu(watchdog_nmi_touch, cpu) = true;
+
+ /*
+ * Ensure watchdog_nmi_touch is updated for a cpu before any other
+ * cpu sees it is online.
+ */
+ smp_wmb();
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+ return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this cpu will cause the cpu before this one to start
+ * checking the one after this one. If this cpu just finished checking
+ * the next cpu and updating hrtimer_interrupts_saved, and then the
+ * previous cpu checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next cpu to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ per_cpu(watchdog_nmi_touch, next_cpu) = true;
+
+ /*
+ * Ensure watchdog_nmi_touch is updated for a cpu before any other
+ * cpu sees it is online.
+ */
+ smp_wmb();
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+
+#else
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
.disabled = 1,
};
+/* watchdog detector functions */
+bool is_hardlockup(void)
+{
+ unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
+
+ if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
+ return true;
+
+ __this_cpu_write(hrtimer_interrupts_saved, hrint);
+ return false;
+}
+
/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
struct perf_sample_data *data,
cpu0_err = 0;
}
}
+
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
The overhead should be minimal. A periodic hrtimer runs to
generate interrupts and kick the watchdog task every 4 seconds.
An NMI is generated every 10 seconds or so to check for hardlockups.
+ If NMIs are not available on the platform, every 12 seconds the
+ hrtimer interrupt on one cpu will be used to check for hardlockups
+ on the next cpu.
The frequency of hrtimer and NMI events and the soft and hard lockup
thresholds can be controlled through the sysctl watchdog_thresh.
-config HARDLOCKUP_DETECTOR
+config HARDLOCKUP_DETECTOR_NMI
def_bool y
depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
+config HARDLOCKUP_DETECTOR_OTHER_CPU
+ def_bool y
+ depends on LOCKUP_DETECTOR && SMP
+ depends on !HARDLOCKUP_DETECTOR_NMI && !HAVE_NMI_WATCHDOG
+
+config HARDLOCKUP_DETECTOR
+ def_bool y
+ depends on HARDLOCKUP_DETECTOR_NMI || HARDLOCKUP_DETECTOR_OTHER_CPU
+
config BOOTPARAM_HARDLOCKUP_PANIC
bool "Panic (Reboot) On Hard Lockups"
depends on HARDLOCKUP_DETECTOR