mm: correctly synchronize rss-counters at exit/exec
[linux-2.6.git] / kernel / watchdog.c
index 054a67c..df30ee0 100644 (file)
@@ -3,15 +3,14 @@
  *
  * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
  *
- * this code detects hard lockups: incidents in where on a CPU
- * the kernel does not respond to anything except NMI.
- *
- * Note: Most of this code is borrowed heavily from softlockup.c,
- * so thanks to Ingo for the initial implementation.
- * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
+ * Note: Most of this code is borrowed heavily from the original softlockup
+ * detector, so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
  * to those contributors as well.
  */
 
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/nmi.h>
@@ -28,7 +27,7 @@
 #include <linux/perf_event.h>
 
 int watchdog_enabled = 1;
-int __read_mostly softlockup_thresh = 60;
+int __read_mostly watchdog_thresh = 10;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -91,6 +90,17 @@ static int __init nosoftlockup_setup(char *str)
 __setup("nosoftlockup", nosoftlockup_setup);
 /*  */
 
+/*
+ * Hard-lockup warnings should be triggered after just a few seconds. Soft-
+ * lockups can have false positives under extreme conditions. So we generally
+ * want a higher threshold for soft lockups than for hard lockups. So we couple
+ * the thresholds with a factor: we make the soft threshold twice the amount of
+ * time the hard threshold is.
+ */
+static int get_softlockup_thresh(void)
+{
+       return watchdog_thresh * 2;
+}
 
 /*
  * Returns seconds, approximately.  We don't need nanosecond
@@ -105,12 +115,13 @@ static unsigned long get_timestamp(int this_cpu)
 static unsigned long get_sample_period(void)
 {
        /*
-        * convert softlockup_thresh from seconds to ns
-        * the divide by 5 is to give hrtimer 5 chances to
-        * increment before the hardlockup detector generates
-        * a warning
+        * convert watchdog_thresh from seconds to ns
+        * the divide by 5 is to give hrtimer several chances (two
+        * or three with the current relation between the soft
+        * and hard thresholds) to increment before the
+        * hardlockup detector generates a warning
         */
-       return softlockup_thresh / 5 * NSEC_PER_SEC;
+       return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
 }
 
 /* Commands for resetting the watchdog */
@@ -182,13 +193,14 @@ static int is_softlockup(unsigned long touch_ts)
        unsigned long now = get_timestamp(smp_processor_id());
 
        /* Warn about unreasonable delays: */
-       if (time_after(now, touch_ts + softlockup_thresh))
+       if (time_after(now, touch_ts + get_softlockup_thresh()))
                return now - touch_ts;
 
        return 0;
 }
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
+
 static struct perf_event_attr wd_hw_attr = {
        .type           = PERF_TYPE_HARDWARE,
        .config         = PERF_COUNT_HW_CPU_CYCLES,
@@ -198,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = {
 };
 
 /* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event, int nmi,
+static void watchdog_overflow_callback(struct perf_event *event,
                 struct perf_sample_data *data,
                 struct pt_regs *regs)
 {
@@ -284,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
                if (__this_cpu_read(soft_watchdog_warn) == true)
                        return HRTIMER_RESTART;
 
-               printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+               printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                        smp_processor_id(), duration,
                        current->comm, task_pid_nr(current));
                print_modules();
@@ -309,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
  */
 static int watchdog(void *unused)
 {
-       static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+       struct sched_param param = { .sched_priority = 0 };
        struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
-       sched_setscheduler(current, SCHED_FIFO, &param);
-
        /* initialize timestamp */
        __touch_watchdog();
 
@@ -324,9 +334,11 @@ static int watchdog(void *unused)
 
        set_current_state(TASK_INTERRUPTIBLE);
        /*
-        * Run briefly once per second to reset the softlockup timestamp.
-        * If this gets delayed for more than 60 seconds then the
-        * debug-printout triggers in watchdog_timer_fn().
+        * Run briefly (kicked by the hrtimer callback function) once every
+        * get_sample_period() seconds (4 seconds by default) to reset the
+        * softlockup timestamp. If this gets delayed for more than
+        * 2*watchdog_thresh seconds then the debug-printout triggers in
+        * watchdog_timer_fn().
         */
        while (!kthread_should_stop()) {
                __touch_watchdog();
@@ -337,8 +349,12 @@ static int watchdog(void *unused)
 
                set_current_state(TASK_INTERRUPTIBLE);
        }
+       /*
+        * Drop the policy/priority elevation during thread exit to avoid a
+        * scheduling latency spike.
+        */
        __set_current_state(TASK_RUNNING);
-
+       sched_setscheduler(current, SCHED_NORMAL, &param);
        return 0;
 }
 
@@ -357,23 +373,26 @@ static int watchdog_nmi_enable(int cpu)
        if (event != NULL)
                goto out_enable;
 
-       /* Try to register using hardware perf events */
        wd_attr = &wd_hw_attr;
-       wd_attr->sample_period = hw_nmi_get_sample_period();
-       event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
+       wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+       /* Try to register using hardware perf events */
+       event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
        if (!IS_ERR(event)) {
-               printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
+               pr_info("enabled, takes one hw-pmu counter.\n");
                goto out_save;
        }
 
 
        /* vary the KERN level based on the returned errno */
        if (PTR_ERR(event) == -EOPNOTSUPP)
-               printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+               pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
        else if (PTR_ERR(event) == -ENOENT)
-               printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
+               pr_warning("disabled (cpu%i): hardware events not enabled\n",
+                        cpu);
        else
-               printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
+               pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+                       cpu, PTR_ERR(event));
        return PTR_ERR(event);
 
        /* success path */
@@ -404,41 +423,48 @@ static void watchdog_nmi_disable(int cpu) { return; }
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 /* prepare/enable/disable routines */
-static int watchdog_prepare_cpu(int cpu)
+static void watchdog_prepare_cpu(int cpu)
 {
        struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
 
        WARN_ON(per_cpu(softlockup_watchdog, cpu));
        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hrtimer->function = watchdog_timer_fn;
-
-       return 0;
 }
 
 static int watchdog_enable(int cpu)
 {
        struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
-       int err;
+       int err = 0;
 
        /* enable the perf event */
        err = watchdog_nmi_enable(cpu);
-       if (err)
-               return err;
+
+       /* Regardless of err above, fall through and start softlockup */
 
        /* create the watchdog thread */
        if (!p) {
-               p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
+               struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+               p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
                if (IS_ERR(p)) {
-                       printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
-                       return PTR_ERR(p);
+                       pr_err("softlockup watchdog for %i failed\n", cpu);
+                       if (!err) {
+                               /* if hardlockup hasn't already set this */
+                               err = PTR_ERR(p);
+                               /* and disable the perf event */
+                               watchdog_nmi_disable(cpu);
+                       }
+                       goto out;
                }
+               sched_setscheduler(p, SCHED_FIFO, &param);
                kthread_bind(p, cpu);
                per_cpu(watchdog_touch_ts, cpu) = 0;
                per_cpu(softlockup_watchdog, cpu) = p;
                wake_up_process(p);
        }
 
-       return 0;
+out:
+       return err;
 }
 
 static void watchdog_disable(int cpu)
@@ -462,6 +488,8 @@ static void watchdog_disable(int cpu)
        }
 }
 
+/* sysctl functions */
+#ifdef CONFIG_SYSCTL
 static void watchdog_enable_all_cpus(void)
 {
        int cpu;
@@ -475,7 +503,7 @@ static void watchdog_enable_all_cpus(void)
                        watchdog_enabled = 1;
 
        if (!watchdog_enabled)
-               printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
+               pr_err("failed to be enabled on some cpus\n");
 
 }
 
@@ -491,31 +519,26 @@ static void watchdog_disable_all_cpus(void)
 }
 
 
-/* sysctl functions */
-#ifdef CONFIG_SYSCTL
 /*
- * proc handler for /proc/sys/kernel/nmi_watchdog
+ * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
  */
 
-int proc_dowatchdog_enabled(struct ctl_table *table, int write,
-                    void __user *buffer, size_t *length, loff_t *ppos)
+int proc_dowatchdog(struct ctl_table *table, int write,
+                   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       proc_dointvec(table, write, buffer, length, ppos);
+       int ret;
 
-       if (write) {
-               if (watchdog_enabled)
-                       watchdog_enable_all_cpus();
-               else
-                       watchdog_disable_all_cpus();
-       }
-       return 0;
-}
+       ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       if (ret || !write)
+               goto out;
 
-int proc_dowatchdog_thresh(struct ctl_table *table, int write,
-                            void __user *buffer,
-                            size_t *lenp, loff_t *ppos)
-{
-       return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       if (watchdog_enabled && watchdog_thresh)
+               watchdog_enable_all_cpus();
+       else
+               watchdog_disable_all_cpus();
+
+out:
+       return ret;
 }
 #endif /* CONFIG_SYSCTL */
 
@@ -527,17 +550,16 @@ static int __cpuinit
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
        int hotcpu = (unsigned long)hcpu;
-       int err = 0;
 
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-               err = watchdog_prepare_cpu(hotcpu);
+               watchdog_prepare_cpu(hotcpu);
                break;
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                if (watchdog_enabled)
-                       err = watchdog_enable(hotcpu);
+                       watchdog_enable(hotcpu);
                break;
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
@@ -550,7 +572,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 #endif /* CONFIG_HOTPLUG_CPU */
        }
-       return notifier_from_errno(err);
+
+       /*
+        * hardlockup and softlockup are not important enough
+        * to block cpu bring up.  Just always succeed and
+        * rely on printk output to flag problems.
+        */
+       return NOTIFY_OK;
 }
 
 static struct notifier_block __cpuinitdata cpu_nfb = {