nv-tegra.nvidia Code Review - linux-4.9.git/commitdiff
ANDROID: hardlockup: detect hard lockups without NMIs using secondary cpus
author: Colin Cross <ccross@android.com>
Fri, 12 Oct 2018 07:33:59 +0000 (15:33 +0800)
committer: mobile promotions <svcmobile_promotions@nvidia.com>
Thu, 8 Nov 2018 17:15:50 +0000 (09:15 -0800)
Emulate NMIs on systems where they are not available by using timer
interrupts on other cpus.  Each cpu will use its softlockup hrtimer
to check that the next cpu is processing hrtimer interrupts by
verifying that a counter is increasing.

This patch is useful on systems where the hardlockup detector is not
available due to a lack of NMIs, for example most ARM SoCs.
Without this patch any cpu stuck with interrupts disabled can
cause a hardware watchdog reset with no debugging information,
but with this patch the kernel can detect the lockup and panic,
which can result in useful debugging info.

Bug 200459527

Change-Id: I83d6837cafcc6d6e7a70352f5a4d09c0ede1d8a4
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Kary Jin <karyj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1929802
(cherry picked from commit c039614dcce22309387769378d722b4c37bd352d)
Reviewed-on: https://git-master.nvidia.com/r/1934166
GVS: Gerrit_Virtual_Submit
Reviewed-by: Daniel Fu <danifu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
include/linux/nmi.h
kernel/watchdog.c
kernel/watchdog_hld.c
kernel/watchdog_hld.h [new file with mode: 0644]
lib/Kconfig.debug

index 0a3fadc32693a9cf869693f4c406eee5d168e36b..3d7c29d090aa0ce65c4cb89b3d0e4cf09b561723 100644 (file)
  * may be used to reset the timeout - for code which intentionally
  * disables interrupts for a long time. This call is stateless.
  */
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR_NMI)
 #include <asm/nmi.h>
+#endif
+
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void touch_nmi_watchdog(void);
 #else
 static inline void touch_nmi_watchdog(void)
index 63177be0159e9493f6d6ade90efae743aaf117b7..395721ff16de7966751f18e3c3fe787e32dcfd53 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
 #include <linux/kthread.h>
+#include "watchdog_hld.h"
 
 static DEFINE_MUTEX(watchdog_proc_mutex);
 
@@ -78,10 +79,9 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static unsigned long soft_lockup_nmi_warn;
 
 unsigned int __read_mostly softlockup_panic =
@@ -210,18 +210,6 @@ void touch_softlockup_watchdog_sync(void)
        __this_cpu_write(watchdog_touch_ts, 0);
 }
 
-/* watchdog detector functions */
-bool is_hardlockup(void)
-{
-       unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-
-       if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
-               return true;
-
-       __this_cpu_write(hrtimer_interrupts_saved, hrint);
-       return false;
-}
-
 static int is_softlockup(unsigned long touch_ts)
 {
        unsigned long now = get_timestamp();
@@ -268,6 +256,11 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
        /* kick the hardlockup detector */
        watchdog_interrupt_count();
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+       /* test for hardlockups on the next cpu */
+       watchdog_check_hardlockup_other_cpu();
+#endif
+
        /* kick the softlockup detector */
        wake_up_process(__this_cpu_read(softlockup_watchdog));
 
index 12b8dd64078655dd9004d03caa8167da16b57cf5..e5709b8603acbb4307c416843503b3470af44a4a 100644 (file)
 #include <linux/module.h>
 #include <asm/irq_regs.h>
 #include <linux/perf_event.h>
+#include "watchdog_hld.h"
 
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static cpumask_t __read_mostly watchdog_cpus;
+#endif
 
 /* boot commands */
 /*
@@ -68,6 +74,124 @@ void touch_nmi_watchdog(void)
 }
 EXPORT_SYMBOL(touch_nmi_watchdog);
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+       cpumask_t cpus = watchdog_cpus;
+       unsigned int next_cpu;
+
+       next_cpu = cpumask_next(cpu, &cpus);
+       if (next_cpu >= nr_cpu_ids)
+               next_cpu = cpumask_first(&cpus);
+
+       if (next_cpu == cpu)
+               return nr_cpu_ids;
+
+       return next_cpu;
+}
+
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+       unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+       if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+               return 1;
+
+       per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+       return 0;
+}
+
+void watchdog_check_hardlockup_other_cpu(void)
+{
+       unsigned int next_cpu;
+
+       /*
+        * Test for hardlockups every 3 samples.  The sample period is
+        *  watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+        *  watchdog_thresh (over by 20%).
+        */
+       if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+               return;
+
+       /* check for a hardlockup on the next cpu */
+       next_cpu = watchdog_next_cpu(smp_processor_id());
+       if (next_cpu >= nr_cpu_ids)
+               return;
+
+       /*
+        * "smp_rmb" matches "smp_wmb" in "watchdog_nmi_enable" and
+        * "watchdog_nmi_disable" to ensure "watchdog_nmi_touch" is
+        * updated for a cpu before any other cpu sees it is online.
+        */
+       smp_rmb();
+
+       if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+               per_cpu(watchdog_nmi_touch, next_cpu) = false;
+               return;
+       }
+
+       if (is_hardlockup_other_cpu(next_cpu)) {
+               /* only warn once */
+               if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+                       return;
+
+               if (hardlockup_panic)
+                       panic("Watchdog detected hard LOCKUP on cpu %u",
+                               next_cpu);
+               else
+                       WARN(1, "Watchdog detected hard LOCKUP on cpu %u",
+                               next_cpu);
+
+               per_cpu(hard_watchdog_warn, next_cpu) = true;
+       } else {
+               per_cpu(hard_watchdog_warn, next_cpu) = false;
+       }
+}
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+       /*
+        * The new cpu will be marked online before the first hrtimer interrupt
+        * runs on it.  If another cpu tests for a hardlockup on the new cpu
+        * before it has run its first hrtimer, it will get a false positive.
+        * Touch the watchdog on the new cpu to delay the first check for at
+        * least 3 sampling periods to guarantee one hrtimer has run on the new
+        * cpu.
+        */
+       per_cpu(watchdog_nmi_touch, cpu) = true;
+
+       /*
+        * Ensure watchdog_nmi_touch is updated for a cpu before any other
+        * cpu sees it is online.
+        */
+       smp_wmb();
+       cpumask_set_cpu(cpu, &watchdog_cpus);
+       return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+       unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+       /*
+        * Offlining this cpu will cause the cpu before this one to start
+        * checking the one after this one.  If this cpu just finished checking
+        * the next cpu and updating hrtimer_interrupts_saved, and then the
+        * previous cpu checks it within one sample period, it will trigger a
+        * false positive.  Touch the watchdog on the next cpu to prevent it.
+        */
+       if (next_cpu < nr_cpu_ids)
+               per_cpu(watchdog_nmi_touch, next_cpu) = true;
+
+       /*
+        * Ensure watchdog_nmi_touch is updated for the next cpu before any
+        * other cpu sees this cpu go offline.
+        */
+       smp_wmb();
+       cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+
+#else
 static struct perf_event_attr wd_hw_attr = {
        .type           = PERF_TYPE_HARDWARE,
        .config         = PERF_COUNT_HW_CPU_CYCLES,
@@ -76,6 +200,18 @@ static struct perf_event_attr wd_hw_attr = {
        .disabled       = 1,
 };
 
+/* watchdog detector functions */
+bool is_hardlockup(void)
+{
+       unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
+
+       if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
+               return true;
+
+       __this_cpu_write(hrtimer_interrupts_saved, hrint);
+       return false;
+}
+
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
                 struct perf_sample_data *data,
@@ -228,3 +364,5 @@ void watchdog_nmi_disable(unsigned int cpu)
                cpu0_err = 0;
        }
 }
+
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
diff --git a/kernel/watchdog_hld.h b/kernel/watchdog_hld.h
new file mode 100644 (file)
index 0000000..85a9cb5
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef WATCHDOG_HLD_H
+#define WATCHDOG_HLD_H
+
+DECLARE_PER_CPU(unsigned long, hrtimer_interrupts);
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+void watchdog_check_hardlockup_other_cpu(void);
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
+
+#endif
index e661d349ed336116574bb3849247bfdd15d1ecd3..9a7702e3eed84c2268f4898afc85180228265c94 100644 (file)
@@ -771,15 +771,27 @@ config LOCKUP_DETECTOR
          The overhead should be minimal.  A periodic hrtimer runs to
          generate interrupts and kick the watchdog task every 4 seconds.
          An NMI is generated every 10 seconds or so to check for hardlockups.
+         If NMIs are not available on the platform, every 12 seconds the
+         hrtimer interrupt on one cpu will be used to check for hardlockups
+         on the next cpu.
 
          The frequency of hrtimer and NMI events and the soft and hard lockup
          thresholds can be controlled through the sysctl watchdog_thresh.
 
-config HARDLOCKUP_DETECTOR
+config HARDLOCKUP_DETECTOR_NMI
        def_bool y
        depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
        depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 
+config HARDLOCKUP_DETECTOR_OTHER_CPU
+       def_bool y
+       depends on LOCKUP_DETECTOR && SMP
+       depends on !HARDLOCKUP_DETECTOR_NMI && !HAVE_NMI_WATCHDOG
+
+config HARDLOCKUP_DETECTOR
+       def_bool y
+       depends on HARDLOCKUP_DETECTOR_NMI || HARDLOCKUP_DETECTOR_OTHER_CPU
+
 config BOOTPARAM_HARDLOCKUP_PANIC
        bool "Panic (Reboot) On Hard Lockups"
        depends on HARDLOCKUP_DETECTOR