]> nv-tegra.nvidia Code Review - linux-2.6.git/blobdiff - kernel/rcutiny_plugin.h
cgroups: fix a css_set not found bug in cgroup_attach_proc
[linux-2.6.git] / kernel / rcutiny_plugin.h
index 95f9239df5120581ed770dd76be8d15575c4d74c..f259c676195fcab24fb36af68130519a931778e0 100644 (file)
  */
 
 #include <linux/kthread.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt)        stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+       struct rcu_head *rcucblist;     /* List of pending callbacks (CBs). */
+       struct rcu_head **donetail;     /* ->next pointer of last "done" CB. */
+       struct rcu_head **curtail;      /* ->next pointer of last CB. */
+       RCU_TRACE(long qlen);           /* Number of pending CBs. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+       .donetail       = &rcu_sched_ctrlblk.rcucblist,
+       .curtail        = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+       .donetail       = &rcu_bh_ctrlblk.rcucblist,
+       .curtail        = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #ifdef CONFIG_TINY_PREEMPT_RCU
 
@@ -48,17 +80,50 @@ struct rcu_preempt_ctrlblk {
        struct list_head *gp_tasks;
                                /* Pointer to the first task blocking the */
                                /*  current grace period, or NULL if there */
-                               /*  is not such task. */
+                               /*  is no such task. */
        struct list_head *exp_tasks;
                                /* Pointer to first task blocking the */
                                /*  current expedited grace period, or NULL */
                                /*  if there is no such task.  If there */
                                /*  is no current expedited grace period, */
                                /*  then there cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+       struct list_head *boost_tasks;
+                               /* Pointer to first task that needs to be */
+                               /*  priority-boosted, or NULL if no priority */
+                               /*  boosting is needed.  If there is no */
+                               /*  current or expedited grace period, there */
+                               /*  can be no such task. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
        u8 gpnum;               /* Current grace period. */
        u8 gpcpu;               /* Last grace period blocked by the CPU. */
        u8 completed;           /* Last grace period completed. */
                                /*  If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
+       unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+       unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+       unsigned long n_tasks_boosted;
+                               /* Total number of tasks boosted. */
+       unsigned long n_exp_boosts;
+                               /* Number of tasks boosted for expedited GP. */
+       unsigned long n_normal_boosts;
+                               /* Number of tasks boosted for normal GP. */
+       unsigned long n_balk_blkd_tasks;
+                               /* Refused to boost: no blocked tasks. */
+       unsigned long n_balk_exp_gp_tasks;
+                               /* Refused to boost: nothing blocking GP. */
+       unsigned long n_balk_boost_tasks;
+                               /* Refused to boost: already boosting. */
+       unsigned long n_balk_notyet;
+                               /* Refused to boost: not yet time. */
+       unsigned long n_balk_nos;
+                               /* Refused to boost: not sure why, though. */
+                               /*  This can happen due to race conditions. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
 };
 
 static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -123,6 +188,197 @@ static int rcu_preempt_gp_in_progress(void)
        return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
 }
 
+/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, returning
+ * NULL instead if the end of the list has been reached.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t)
+{
+       struct list_head *np;
+
+       np = t->rcu_node_entry.next;
+       if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+               np = NULL;
+       return np;
+}
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistics for TINY_PREEMPT_RCU.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+       seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+                  rcu_preempt_ctrlblk.rcb.qlen,
+                  rcu_preempt_ctrlblk.n_grace_periods,
+                  rcu_preempt_ctrlblk.gpnum,
+                  rcu_preempt_ctrlblk.gpcpu,
+                  rcu_preempt_ctrlblk.completed,
+                  "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+                  "N."[!rcu_preempt_ctrlblk.gp_tasks],
+                  "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+       seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+                  "             ",
+                  "B."[!rcu_preempt_ctrlblk.boost_tasks],
+                  rcu_preempt_ctrlblk.n_tasks_boosted,
+                  rcu_preempt_ctrlblk.n_exp_boosts,
+                  rcu_preempt_ctrlblk.n_normal_boosts,
+                  (int)(jiffies & 0xffff),
+                  (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+       seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
+                  "             balk",
+                  rcu_preempt_ctrlblk.n_balk_blkd_tasks,
+                  rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
+                  rcu_preempt_ctrlblk.n_balk_boost_tasks,
+                  rcu_preempt_ctrlblk.n_balk_notyet,
+                  rcu_preempt_ctrlblk.n_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->exp_tasks or,
+ * failing that, ->boost_tasks.  Return nonzero if either is still non-NULL.
+ */
+static int rcu_boost(void)
+{
+       unsigned long flags;
+       struct rt_mutex mtx;
+       struct task_struct *t;
+       struct list_head *tb;
+
+       if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
+           rcu_preempt_ctrlblk.exp_tasks == NULL)
+               return 0;  /* Nothing to boost. */
+
+       raw_local_irq_save(flags);
+
+       /*
+        * Recheck with irqs disabled: all tasks in need of boosting
+        * might exit their RCU read-side critical sections on their own
+        * if we are preempted just before disabling irqs.
+        */
+       if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
+           rcu_preempt_ctrlblk.exp_tasks == NULL) {
+               raw_local_irq_restore(flags);
+               return 0;
+       }
+
+       /*
+        * Preferentially boost tasks blocking expedited grace periods.
+        * This cannot starve the normal grace periods because a second
+        * expedited grace period must boost all blocked tasks, including
+        * those blocking the pre-existing normal grace period.
+        */
+       if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
+               tb = rcu_preempt_ctrlblk.exp_tasks;
+               RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+       } else {
+               tb = rcu_preempt_ctrlblk.boost_tasks;
+               RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+       }
+       RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+
+       /*
+        * We boost task t by manufacturing an rt_mutex that appears to
+        * be held by task t.  We leave a pointer to that rt_mutex where
+        * task t can find it, and task t will release the mutex when it
+        * exits its outermost RCU read-side critical section.  Then
+        * simply acquiring this artificial rt_mutex will boost task
+        * t's priority.  (Thanks to tglx for suggesting this approach!)
+        */
+       t = container_of(tb, struct task_struct, rcu_node_entry);
+       rt_mutex_init_proxy_locked(&mtx, t);
+       t->rcu_boost_mutex = &mtx;
+       t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+       raw_local_irq_restore(flags);
+       rt_mutex_lock(&mtx);
+       rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
+
+       return rcu_preempt_ctrlblk.boost_tasks != NULL ||
+              rcu_preempt_ctrlblk.exp_tasks != NULL;
+}
+
+/*
+ * Check to see if it is now time to start boosting RCU readers blocking
+ * the current grace period, and, if so, tell the rcu_kthread_task to
+ * start boosting them.  If there is an expedited boost in progress,
+ * we wait for it to complete.
+ *
+ * If no readers are blocking either the current grace period or an
+ * expedited grace period, return 0 to let the caller know, otherwise
+ * return 1.  This value is independent of whether boosting was done.
+ */
+static int rcu_initiate_boost(void)
+{
+       if (!rcu_preempt_blocked_readers_cgp() &&
+           rcu_preempt_ctrlblk.exp_tasks == NULL) {
+               RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
+               return 0;
+       }
+       if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
+           (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+            rcu_preempt_ctrlblk.boost_tasks == NULL &&
+            ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
+               if (rcu_preempt_ctrlblk.exp_tasks == NULL)
+                       rcu_preempt_ctrlblk.boost_tasks =
+                               rcu_preempt_ctrlblk.gp_tasks;
+               invoke_rcu_kthread();
+       } else
+               RCU_TRACE(rcu_initiate_boost_trace());
+       return 1;
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+       rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * If there is no RCU priority boosting, we don't boost.
+ */
+static int rcu_boost(void)
+{
+       return 0;
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate boosting,
+ * but we do indicate whether there are blocked readers blocking the
+ * current grace period.
+ */
+static int rcu_initiate_boost(void)
+{
+       return rcu_preempt_blocked_readers_cgp();
+}
+
+/*
+ * If there is no RCU priority boosting, nothing to do at grace-period start.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
 /*
  * Record a preemptible-RCU quiescent state for the specified CPU.  Note
  * that this just means that the task currently running on the CPU is
@@ -150,11 +406,14 @@ static void rcu_preempt_cpu_qs(void)
        rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
        current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 
+       /* If there is no GP then there is nothing more to do.  */
+       if (!rcu_preempt_gp_in_progress())
+               return;
        /*
-        * If there is no GP, or if blocked readers are still blocking GP,
-        * then there is nothing more to do.
+        * Check up on boosting.  If there are readers blocking the
+        * current grace period, leave.
         */
-       if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+       if (rcu_initiate_boost())
                return;
 
        /* Advance callbacks. */
@@ -168,7 +427,7 @@ static void rcu_preempt_cpu_qs(void)
 
        /* If there are done callbacks, cause them to be invoked. */
        if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
-               invoke_rcu_cbs();
+               invoke_rcu_kthread();
 }
 
 /*
@@ -180,12 +439,16 @@ static void rcu_preempt_start_gp(void)
 
                /* Official start of GP. */
                rcu_preempt_ctrlblk.gpnum++;
+               RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
 
                /* Any blocked RCU readers block new GP. */
                if (rcu_preempt_blocked_readers_any())
                        rcu_preempt_ctrlblk.gp_tasks =
                                rcu_preempt_ctrlblk.blkd_tasks.next;
 
+               /* Set up for RCU priority boosting. */
+               rcu_preempt_boost_start_gp();
+
                /* If there is no running reader, CPU is done with GP. */
                if (!rcu_preempt_running_reader())
                        rcu_preempt_cpu_qs();
@@ -306,15 +569,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
                 */
                empty = !rcu_preempt_blocked_readers_cgp();
                empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
-               np = t->rcu_node_entry.next;
-               if (np == &rcu_preempt_ctrlblk.blkd_tasks)
-                       np = NULL;
-               list_del(&t->rcu_node_entry);
+               np = rcu_next_node_entry(t);
+               list_del_init(&t->rcu_node_entry);
                if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
                        rcu_preempt_ctrlblk.gp_tasks = np;
                if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
                        rcu_preempt_ctrlblk.exp_tasks = np;
-               INIT_LIST_HEAD(&t->rcu_node_entry);
+#ifdef CONFIG_RCU_BOOST
+               if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
+                       rcu_preempt_ctrlblk.boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
 
                /*
                 * If this was the last task on the current list, and if
@@ -333,6 +597,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
                if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
                        rcu_report_exp_done();
        }
+#ifdef CONFIG_RCU_BOOST
+       /* Unboost self if was boosted. */
+       if (special & RCU_READ_UNLOCK_BOOSTED) {
+               t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+               rt_mutex_unlock(t->rcu_boost_mutex);
+               t->rcu_boost_mutex = NULL;
+       }
+#endif /* #ifdef CONFIG_RCU_BOOST */
        local_irq_restore(flags);
 }
 
@@ -376,7 +648,7 @@ static void rcu_preempt_check_callbacks(void)
                rcu_preempt_cpu_qs();
        if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
            rcu_preempt_ctrlblk.rcb.donetail)
-               invoke_rcu_cbs();
+               invoke_rcu_kthread();
        if (rcu_preempt_gp_in_progress() &&
            rcu_cpu_blocking_cur_gp() &&
            rcu_preempt_running_reader())
@@ -419,6 +691,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
        local_irq_save(flags);
        *rcu_preempt_ctrlblk.nexttail = head;
        rcu_preempt_ctrlblk.nexttail = &head->next;
+       RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
        rcu_preempt_start_gp();  /* checks to see if GP needed. */
        local_irq_restore(flags);
 }
@@ -530,12 +803,16 @@ void synchronize_rcu_expedited(void)
        rpcp->exp_tasks = rpcp->blkd_tasks.next;
        if (rpcp->exp_tasks == &rpcp->blkd_tasks)
                rpcp->exp_tasks = NULL;
-       local_irq_restore(flags);
 
        /* Wait for tail of ->blkd_tasks list to drain. */
-       if (rcu_preempted_readers_exp())
+       if (!rcu_preempted_readers_exp())
+               local_irq_restore(flags);
+       else {
+               rcu_initiate_boost();
+               local_irq_restore(flags);
                wait_event(sync_rcu_preempt_exp_wq,
                           !rcu_preempted_readers_exp());
+       }
 
        /* Clean up and exit. */
        barrier(); /* ensure expedited GP seen before counter increment. */
@@ -569,11 +846,32 @@ void exit_rcu(void)
        if (t->rcu_read_lock_nesting == 0)
                return;
        t->rcu_read_lock_nesting = 1;
-       rcu_read_unlock();
+       __rcu_read_unlock();
 }
 
 #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Because preemptible RCU does not exist, it is not necessary to
+ * dump out its statistics.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Because preemptible RCU does not exist, it is never necessary to
+ * boost preempted RCU readers.
+ */
+static int rcu_boost(void)
+{
+       return 0;
+}
+
 /*
  * Because preemptible RCU does not exist, it never has any callbacks
  * to check.
@@ -614,3 +912,96 @@ void __init rcu_scheduler_starting(void)
 }
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define RCU_BOOST_PRIO 1
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_initiate_boost_trace(void)
+{
+       if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+               rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
+       else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
+                rcu_preempt_ctrlblk.exp_tasks == NULL)
+               rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
+       else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
+               rcu_preempt_ctrlblk.n_balk_boost_tasks++;
+       else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
+               rcu_preempt_ctrlblk.n_balk_notyet++;
+       else
+               rcu_preempt_ctrlblk.n_balk_nos++;
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+       unsigned long flags;
+
+       raw_local_irq_save(flags);
+       rcp->qlen -= n;
+       raw_local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+       show_tiny_preempt_stats(m);
+       seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+       seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+       return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+       .owner = THIS_MODULE,
+       .open = show_tiny_stats_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+       struct dentry *retval;
+
+       rcudir = debugfs_create_dir("rcu", NULL);
+       if (!rcudir)
+               goto free_out;
+       retval = debugfs_create_file("rcudata", 0444, rcudir,
+                                    NULL, &show_tiny_stats_fops);
+       if (!retval)
+               goto free_out;
+       return 0;
+free_out:
+       debugfs_remove_recursive(rcudir);
+       return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+       debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+#endif /* #ifdef CONFIG_RCU_TRACE */