rcu: Deconfuse dynticks entry-exit tracing

[linux-2.6.git] / kernel / rcutree.c
diff --git a/kernel/rcutree.c b/kernel/rcutree.c

index c95fa89ffef9d4bce14b3dc272e55aba5c8055ef..06e40dd53b23847a8aadadd966e26db6a39e76aa 100644 (file)
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -38,7 +38,7 @@
  #include <linux/nmi.h>
  #include <linux/atomic.h>
  #include <linux/bitops.h>
-#include <linux/module.h>
+#include <linux/export.h>
  #include <linux/completion.h>
  #include <linux/moduleparam.h>
  #include <linux/percpu.h>
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
                 NUM_RCU_LVL_3, \
                 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
         }, \
-       .signaled = RCU_GP_IDLE, \
+       .fqs_state = RCU_GP_IDLE, \
         .gpnum = -300, \
         .completed = -300, \
         .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
@@ -131,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
  static void invoke_rcu_core(void);
  static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
  
-#define RCU_KTHREAD_PRIO 1     /* RT priority for per-CPU kthreads. */
-
  /*
   * Track the rcutorture test sequence number and the update version
   * number within a given test.  The rcutorture_testseq is incremented
@@ -197,12 +195,10 @@ void rcu_note_context_switch(int cpu)
  }
  EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  
-#ifdef CONFIG_NO_HZ
  DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
-       .dynticks_nesting = 1,
+       .dynticks_nesting = DYNTICK_TASK_NESTING,
         .dynticks = ATOMIC_INIT(1),
  };
-#endif /* #ifdef CONFIG_NO_HZ */
  
  static int blimit = 10;                /* Maximum callbacks per rcu_do_batch. */
  static int qhimark = 10000;    /* If this many pending, ignore blimit. */
@@ -330,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
                 return 1;
         }
  
-       /* If preemptible RCU, no point in sending reschedule IPI. */
-       if (rdp->preemptible)
-               return 0;
-
-       /* The CPU is online, so send it a reschedule IPI. */
+       /*
+        * The CPU is online, so send it a reschedule IPI.  This forces
+        * it through the scheduler, and (inefficiently) also handles cases
+        * where idle loops fail to inform RCU about the CPU being idle.
+        */
         if (rdp->cpu != smp_processor_id())
                 smp_send_reschedule(rdp->cpu);
         else
@@ -345,51 +341,101 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
  
  #endif /* #ifdef CONFIG_SMP */
  
-#ifdef CONFIG_NO_HZ
+/*
+ * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
+ *
+ * If the new value of the ->dynticks_nesting counter is now zero,
+ * we really have entered idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
+{
+       if (rdtp->dynticks_nesting) {
+               trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
+               return;
+       }
+       trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
+       if (!idle_cpu(smp_processor_id())) {
+               WARN_ON_ONCE(1);        /* must be idle task! */
+               trace_rcu_dyntick("Error on entry: not idle task",
+                                  oldval, rdtp->dynticks_nesting);
+               ftrace_dump(DUMP_ALL);
+       }
+       /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+       smp_mb__before_atomic_inc();  /* See above. */
+       atomic_inc(&rdtp->dynticks);
+       smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
+       WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+}
  
  /**
- * rcu_enter_nohz - inform RCU that current CPU is entering nohz
+ * rcu_idle_enter - inform RCU that current CPU is entering idle
   *
- * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * Enter idle mode, in other words, -leave- the mode in which RCU
   * read-side critical sections can occur.  (Though RCU read-side
- * critical sections can occur in irq handlers in nohz mode, a possibility
- * handled by rcu_irq_enter() and rcu_irq_exit()).
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * We crowbar the ->dynticks_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
   */
-void rcu_enter_nohz(void)
+void rcu_idle_enter(void)
  {
         unsigned long flags;
+       long long oldval;
         struct rcu_dynticks *rdtp;
  
         local_irq_save(flags);
         rdtp = &__get_cpu_var(rcu_dynticks);
-       if (--rdtp->dynticks_nesting) {
-               local_irq_restore(flags);
-               return;
-       }
-       trace_rcu_dyntick("Start");
-       /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
-       smp_mb__before_atomic_inc();  /* See above. */
-       atomic_inc(&rdtp->dynticks);
-       smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
-       WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+       oldval = rdtp->dynticks_nesting;
+       rdtp->dynticks_nesting = 0;
+       rcu_idle_enter_common(rdtp, oldval);
         local_irq_restore(flags);
  }
  
-/*
- * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
+/**
+ * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur.
   *
- * Exit nohz mode, in other words, -enter- the mode in which RCU
- * read-side critical sections normally occur.
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit().  If your
+ * architecture violates this assumption, RCU will give you what you
+ * deserve, good and hard.  But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
   */
-void rcu_exit_nohz(void)
+void rcu_irq_exit(void)
  {
         unsigned long flags;
+       long long oldval;
         struct rcu_dynticks *rdtp;
  
         local_irq_save(flags);
         rdtp = &__get_cpu_var(rcu_dynticks);
-       if (rdtp->dynticks_nesting++) {
-               local_irq_restore(flags);
+       oldval = rdtp->dynticks_nesting;
+       rdtp->dynticks_nesting--;
+       WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+       rcu_idle_enter_common(rdtp, oldval);
+       local_irq_restore(flags);
+}
+
+/*
+ * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
+ *
+ * If the ->dynticks_nesting counter was zero before this update (oldval),
+ * we really have exited idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
+{
+       if (oldval) {
+               trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
                 return;
         }
         smp_mb__before_atomic_inc();  /* Force ordering w/previous sojourn. */
@@ -397,7 +443,72 @@ void rcu_exit_nohz(void)
         /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
         smp_mb__after_atomic_inc();  /* See above. */
         WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
-       trace_rcu_dyntick("End");
+       trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
+       if (!idle_cpu(smp_processor_id())) {
+               WARN_ON_ONCE(1);        /* must be idle task! */
+               trace_rcu_dyntick("Error on exit: not idle task",
+                                 oldval, rdtp->dynticks_nesting);
+               ftrace_dump(DUMP_ALL);
+       }
+}
+
+/**
+ * rcu_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
+ * allow for the possibility of usermode upcalls messing up our count
+ * of interrupt nesting level during the busy period that is just
+ * now starting.
+ */
+void rcu_idle_exit(void)
+{
+       unsigned long flags;
+       struct rcu_dynticks *rdtp;
+       long long oldval;
+
+       local_irq_save(flags);
+       rdtp = &__get_cpu_var(rcu_dynticks);
+       oldval = rdtp->dynticks_nesting;
+       WARN_ON_ONCE(oldval != 0);
+       rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
+       rcu_idle_exit_common(rdtp, oldval);
+       local_irq_restore(flags);
+}
+
+/**
+ * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to
+ * user mode!  This code assumes that the idle loop never does upcalls to
+ * user mode.  If your architecture does do upcalls from the idle loop (or
+ * does anything else that results in unbalanced calls to the irq_enter()
+ * and irq_exit() functions), RCU will give you what you deserve, good
+ * and hard.  But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ */
+void rcu_irq_enter(void)
+{
+       unsigned long flags;
+       struct rcu_dynticks *rdtp;
+       long long oldval;
+
+       local_irq_save(flags);
+       rdtp = &__get_cpu_var(rcu_dynticks);
+       oldval = rdtp->dynticks_nesting;
+       rdtp->dynticks_nesting++;
+       WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+       rcu_idle_exit_common(rdtp, oldval);
         local_irq_restore(flags);
  }
  
@@ -444,27 +555,37 @@ void rcu_nmi_exit(void)
         WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
  }
  
+#ifdef CONFIG_PROVE_RCU
+
  /**
- * rcu_irq_enter - inform RCU of entry to hard irq context
+ * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
   *
- * If the CPU was idle with dynamic ticks active, this updates the
- * rdtp->dynticks to let the RCU handling know that the CPU is active.
+ * If the current CPU is in its idle loop and is in neither an interrupt
+ * nor an NMI handler, return true.
   */
-void rcu_irq_enter(void)
+int rcu_is_cpu_idle(void)
  {
-       rcu_exit_nohz();
+       int ret;
+
+       preempt_disable();
+       ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
+       preempt_enable();
+       return ret;
  }
+EXPORT_SYMBOL(rcu_is_cpu_idle);
+
+#endif /* #ifdef CONFIG_PROVE_RCU */
  
  /**
- * rcu_irq_exit - inform RCU of exit from hard irq context
+ * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
   *
- * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
- * to put let the RCU handling be aware that the CPU is going back to idle
- * with no ticks.
+ * If the current CPU is idle or running at a first-level (not nested)
+ * interrupt from idle, return true.  The caller must have at least
+ * disabled preemption.
   */
-void rcu_irq_exit(void)
+int rcu_is_cpu_rrupt_from_idle(void)
  {
-       rcu_enter_nohz();
+       return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
  }
  
  #ifdef CONFIG_SMP
@@ -514,24 +635,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
  
  #endif /* #ifdef CONFIG_SMP */
  
-#else /* #ifdef CONFIG_NO_HZ */
-
-#ifdef CONFIG_SMP
-
-static int dyntick_save_progress_counter(struct rcu_data *rdp)
-{
-       return 0;
-}
-
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
-{
-       return rcu_implicit_offline_qs(rdp);
-}
-
-#endif /* #ifdef CONFIG_SMP */
-
-#endif /* #else #ifdef CONFIG_NO_HZ */
-
  int rcu_cpu_stall_suppress __read_mostly;
  
  static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -844,36 +947,32 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
         struct rcu_node *rnp = rcu_get_root(rsp);
  
         if (!rcu_scheduler_fully_active ||
-           !cpu_needs_another_gp(rsp, rdp) ||
-           rsp->fqs_active) {
-               if (rcu_scheduler_fully_active &&
-                   cpu_needs_another_gp(rsp, rdp))
-                       rsp->fqs_need_gp = 1;
-               if (rnp->completed == rsp->completed) {
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                       return;
-               }
-               raw_spin_unlock(&rnp->lock);     /* irqs remain disabled. */
+           !cpu_needs_another_gp(rsp, rdp)) {
+               /*
+                * Either the scheduler hasn't yet spawned the first
+                * non-idle task or this CPU does not need another
+                * grace period.  Either way, don't start a new grace
+                * period.
+                */
+               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               return;
+       }
  
+       if (rsp->fqs_active) {
                 /*
-                * Propagate new ->completed value to rcu_node structures
-                * so that other CPUs don't have to wait until the start
-                * of the next grace period to process their callbacks.
+                * This CPU needs a grace period, but force_quiescent_state()
+                * is running.  Tell it to start one on this CPU's behalf.
                  */
-               rcu_for_each_node_breadth_first(rsp, rnp) {
-                       raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-                       rnp->completed = rsp->completed;
-                       raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-               }
-               local_irq_restore(flags);
+               rsp->fqs_need_gp = 1;
+               raw_spin_unlock_irqrestore(&rnp->lock, flags);
                 return;
         }
  
         /* Advance to a new grace period and initialize state. */
         rsp->gpnum++;
         trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
-       WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
-       rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
+       WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
+       rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
         rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
         record_gp_stall_check_time(rsp);
  
@@ -883,7 +982,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
                 rnp->qsmask = rnp->qsmaskinit;
                 rnp->gpnum = rsp->gpnum;
                 rnp->completed = rsp->completed;
-               rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
+               rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
                 rcu_start_gp_per_cpu(rsp, rnp, rdp);
                 rcu_preempt_boost_start_gp(rnp);
                 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
@@ -933,7 +1032,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
  
         rnp = rcu_get_root(rsp);
         raw_spin_lock(&rnp->lock);              /* irqs already disabled. */
-       rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+       rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
         raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
         raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
  }
@@ -949,6 +1048,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
         __releases(rcu_get_root(rsp)->lock)
  {
         unsigned long gp_duration;
+       struct rcu_node *rnp = rcu_get_root(rsp);
+       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  
         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
  
@@ -960,9 +1061,42 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
         gp_duration = jiffies - rsp->gp_start;
         if (gp_duration > rsp->gp_max)
                 rsp->gp_max = gp_duration;
-       rsp->completed = rsp->gpnum;
+
+       /*
+        * We know the grace period is complete, but to everyone else
+        * it appears to still be ongoing.  But it is also the case
+        * that to everyone else it looks like there is nothing that
+        * they can do to advance the grace period.  It is therefore
+        * safe for us to drop the lock in order to mark the grace
+        * period as completed in all of the rcu_node structures.
+        *
+        * But if this CPU needs another grace period, it will take
+        * care of this while initializing the next grace period.
+        * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
+        * because the callbacks have not yet been advanced: Those
+        * callbacks are waiting on the grace period that just now
+        * completed.
+        */
+       if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
+               raw_spin_unlock(&rnp->lock);     /* irqs remain disabled. */
+
+               /*
+                * Propagate new ->completed value to rcu_node structures
+                * so that other CPUs don't have to wait until the start
+                * of the next grace period to process their callbacks.
+                */
+               rcu_for_each_node_breadth_first(rsp, rnp) {
+                       raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+                       rnp->completed = rsp->gpnum;
+                       raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+               }
+               rnp = rcu_get_root(rsp);
+               raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+       }
+
+       rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
         trace_rcu_grace_period(rsp->name, rsp->completed, "end");
-       rsp->signaled = RCU_GP_IDLE;
+       rsp->fqs_state = RCU_GP_IDLE;
         rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
  }
  
@@ -1192,7 +1326,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
         else
                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
         if (need_report & RCU_OFL_TASKS_EXP_GP)
-               rcu_report_exp_rnp(rsp, rnp);
+               rcu_report_exp_rnp(rsp, rnp, true);
         rcu_node_kthread_setaffinity(rnp, -1);
  }
  
@@ -1305,16 +1439,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
   * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
   * Also schedule RCU core processing.
   *
- * This function must be called with hardirqs disabled.  It is normally
+ * This function must be called from hardirq context.  It is normally
   * invoked from the scheduling-clock interrupt.  If rcu_pending returns
   * false, there is no point in invoking rcu_check_callbacks().
   */
  void rcu_check_callbacks(int cpu, int user)
  {
         trace_rcu_utilization("Start scheduler-tick");
-       if (user ||
-           (idle_cpu(cpu) && rcu_scheduler_active &&
-            !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+       if (user || rcu_is_cpu_rrupt_from_idle()) {
  
                 /*
                  * Get here if this CPU took its interrupt from user
@@ -1428,7 +1560,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
                 goto unlock_fqs_ret;  /* no GP in progress, time updated. */
         }
         rsp->fqs_active = 1;
-       switch (rsp->signaled) {
+       switch (rsp->fqs_state) {
         case RCU_GP_IDLE:
         case RCU_GP_INIT:
  
@@ -1444,7 +1576,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
                 force_qs_rnp(rsp, dyntick_save_progress_counter);
                 raw_spin_lock(&rnp->lock);  /* irqs already disabled */
                 if (rcu_gp_in_progress(rsp))
-                       rsp->signaled = RCU_FORCE_QS;
+                       rsp->fqs_state = RCU_FORCE_QS;
                 break;
  
         case RCU_FORCE_QS:
@@ -1530,9 +1662,6 @@ static void rcu_process_callbacks(struct softirq_action *unused)
                                 &__get_cpu_var(rcu_sched_data));
         __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
         rcu_preempt_process_callbacks();
-
-       /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
-       rcu_needs_cpu_flush();
         trace_rcu_utilization("End RCU core");
  }
  
@@ -1887,9 +2016,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
         for (i = 0; i < RCU_NEXT_SIZE; i++)
                 rdp->nxttail[i] = &rdp->nxtlist;
         rdp->qlen = 0;
-#ifdef CONFIG_NO_HZ
         rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
-#endif /* #ifdef CONFIG_NO_HZ */
+       WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
+       WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
         rdp->cpu = cpu;
         rdp->rsp = rsp;
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1916,6 +2045,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
         rdp->qlen_last_fqs_check = 0;
         rdp->n_force_qs_snap = rsp->n_force_qs;
         rdp->blimit = blimit;
+       WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
+       WARN_ON_ONCE((atomic_read(&rdp->dynticks->dynticks) & 0x1) != 1);
         raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
  
         /*