cgroups: add per-thread subsystem callbacks
Ben Blum [Thu, 26 May 2011 23:25:19 +0000 (16:25 -0700)]
Add cgroup subsystem callbacks for per-thread attachment in atomic contexts

Add can_attach_task(), pre_attach(), and attach_task() as new callbacks
for cgroups's subsystem interface.  Unlike can_attach and attach, these
are for per-thread operations, to be called potentially many times when
attaching an entire threadgroup.

Also, the old "bool threadgroup" interface is removed, as replaced by
this.  All subsystems are modified for the new interface - of note is
cpuset, which requires from/to nodemasks for attach to be globally scoped
(though per-cpuset would work too) to persist from its pre_attach to
attach_task and attach.

This is a pre-patch for cgroup-procs-writable.patch.

Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Documentation/cgroups/cgroups.txt
block/blk-cgroup.c
include/linux/cgroup.h
kernel/cgroup.c
kernel/cgroup_freezer.c
kernel/cpuset.c
kernel/sched.c
mm/memcontrol.c
security/device_cgroup.c

index aedf1bd..b3bd3bd 100644 (file)
@@ -575,7 +575,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
 called multiple times against a cgroup.
 
 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-              struct task_struct *task, bool threadgroup)
+              struct task_struct *task)
 (cgroup_mutex held by caller)
 
 Called prior to moving a task into a cgroup; if the subsystem
@@ -584,9 +584,14 @@ task is passed, then a successful result indicates that *any*
 unspecified task can be moved into the cgroup. Note that this isn't
 called on a fork. If this method returns 0 (success) then this should
 remain valid while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future. If threadgroup is
-true, then a successful result indicates that all threads in the given
-thread's threadgroup can be moved together.
+attach() or cancel_attach() will be called in future.
+
+int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk);
+(cgroup_mutex held by caller)
+
+As can_attach, but for operations that must be run once per task to be
+attached (possibly many when using cgroup_attach_proc). Called after
+can_attach.
 
 void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
               struct task_struct *task, bool threadgroup)
@@ -598,15 +603,24 @@ function, so that the subsystem can implement a rollback. If not, not necessary.
 This will be called only about subsystems whose can_attach() operation have
 succeeded.
 
+void pre_attach(struct cgroup *cgrp);
+(cgroup_mutex held by caller)
+
+For any non-per-thread attachment work that needs to happen before
+attach_task. Needed by cpuset.
+
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-           struct cgroup *old_cgrp, struct task_struct *task,
-           bool threadgroup)
+           struct cgroup *old_cgrp, struct task_struct *task)
 (cgroup_mutex held by caller)
 
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
-If threadgroup is true, the subsystem should take care of all threads
-in the specified thread's threadgroup. Currently does not support any
+
+void attach_task(struct cgroup *cgrp, struct task_struct *tsk);
+(cgroup_mutex held by caller)
+
+As attach, but for operations that must be run once per task to be attached,
+like can_attach_task. Called before attach. Currently does not support any
 subsystem that might need the old_cgrp for every thread in the group.
 
 void fork(struct cgroup_subsy *ss, struct task_struct *task)
index 07371cf..bcaf16e 100644 (file)
@@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
                                                  struct cgroup *);
-static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
-                             struct task_struct *, bool);
-static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
-                          struct cgroup *, struct task_struct *, bool);
+static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
+static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
 
@@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
 struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkiocg_create,
-       .can_attach = blkiocg_can_attach,
-       .attach = blkiocg_attach,
+       .can_attach_task = blkiocg_can_attach_task,
+       .attach_task = blkiocg_attach_task,
        .destroy = blkiocg_destroy,
        .populate = blkiocg_populate,
 #ifdef CONFIG_BLK_CGROUP
@@ -1616,9 +1614,7 @@ done:
  * of the main cic data structures.  For now we allow a task to change
  * its cgroup only if it's the only owner of its ioc.
  */
-static int blkiocg_can_attach(struct cgroup_subsys *subsys,
-                               struct cgroup *cgroup, struct task_struct *tsk,
-                               bool threadgroup)
+static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
        struct io_context *ioc;
        int ret = 0;
@@ -1633,9 +1629,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys,
        return ret;
 }
 
-static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
-                               struct cgroup *prev, struct task_struct *tsk,
-                               bool threadgroup)
+static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
        struct io_context *ioc;
 
index 5ac7ebc..1e6cde2 100644 (file)
@@ -467,12 +467,14 @@ struct cgroup_subsys {
        int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
        void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
        int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                         struct task_struct *tsk, bool threadgroup);
+                         struct task_struct *tsk);
+       int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
        void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                         struct task_struct *tsk, bool threadgroup);
+                             struct task_struct *tsk);
+       void (*pre_attach)(struct cgroup *cgrp);
+       void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
        void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                       struct cgroup *old_cgrp, struct task_struct *tsk,
-                       bool threadgroup);
+                      struct cgroup *old_cgrp, struct task_struct *tsk);
        void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
        void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
                        struct cgroup *old_cgrp, struct task_struct *task);
index 909a355..38fb0ad 100644 (file)
@@ -1759,7 +1759,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
        for_each_subsys(root, ss) {
                if (ss->can_attach) {
-                       retval = ss->can_attach(ss, cgrp, tsk, false);
+                       retval = ss->can_attach(ss, cgrp, tsk);
                        if (retval) {
                                /*
                                 * Remember on which subsystem the can_attach()
@@ -1771,6 +1771,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                                goto out;
                        }
                }
+               if (ss->can_attach_task) {
+                       retval = ss->can_attach_task(cgrp, tsk);
+                       if (retval) {
+                               failed_ss = ss;
+                               goto out;
+                       }
+               }
        }
 
        task_lock(tsk);
@@ -1805,8 +1812,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        write_unlock(&css_set_lock);
 
        for_each_subsys(root, ss) {
+               if (ss->pre_attach)
+                       ss->pre_attach(cgrp);
+               if (ss->attach_task)
+                       ss->attach_task(cgrp, tsk);
                if (ss->attach)
-                       ss->attach(ss, cgrp, oldcgrp, tsk, false);
+                       ss->attach(ss, cgrp, oldcgrp, tsk);
        }
        set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
        synchronize_rcu();
@@ -1829,7 +1840,7 @@ out:
                                 */
                                break;
                        if (ss->cancel_attach)
-                               ss->cancel_attach(ss, cgrp, tsk, false);
+                               ss->cancel_attach(ss, cgrp, tsk);
                }
        }
        return retval;
index e7bebb7..e691818 100644 (file)
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
  */
 static int freezer_can_attach(struct cgroup_subsys *ss,
                              struct cgroup *new_cgroup,
-                             struct task_struct *task, bool threadgroup)
+                             struct task_struct *task)
 {
        struct freezer *freezer;
 
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
        if (freezer->state != CGROUP_THAWED)
                return -EBUSY;
 
+       return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
        rcu_read_lock();
-       if (__cgroup_freezing_or_frozen(task)) {
+       if (__cgroup_freezing_or_frozen(tsk)) {
                rcu_read_unlock();
                return -EBUSY;
        }
        rcu_read_unlock();
-
-       if (threadgroup) {
-               struct task_struct *c;
-
-               rcu_read_lock();
-               list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
-                       if (__cgroup_freezing_or_frozen(c)) {
-                               rcu_read_unlock();
-                               return -EBUSY;
-                       }
-               }
-               rcu_read_unlock();
-       }
-
        return 0;
 }
 
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
        .populate       = freezer_populate,
        .subsys_id      = freezer_subsys_id,
        .can_attach     = freezer_can_attach,
+       .can_attach_task = freezer_can_attach_task,
+       .pre_attach     = NULL,
+       .attach_task    = NULL,
        .attach         = NULL,
        .fork           = freezer_fork,
        .exit           = NULL,
index 2bb8c2e..55b297d 100644 (file)
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
        return val;
 }
 
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
 static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-                            struct task_struct *tsk, bool threadgroup)
+                            struct task_struct *tsk)
 {
-       int ret;
        struct cpuset *cs = cgroup_cs(cont);
 
        if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
        if (tsk->flags & PF_THREAD_BOUND)
                return -EINVAL;
 
-       ret = security_task_setscheduler(tsk);
-       if (ret)
-               return ret;
-       if (threadgroup) {
-               struct task_struct *c;
-
-               rcu_read_lock();
-               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       ret = security_task_setscheduler(c);
-                       if (ret) {
-                               rcu_read_unlock();
-                               return ret;
-                       }
-               }
-               rcu_read_unlock();
-       }
        return 0;
 }
 
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
-                              struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+       return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+       struct cpuset *cs = cgroup_cs(cont);
+
+       if (cs == &top_cpuset)
+               cpumask_copy(cpus_attach, cpu_possible_mask);
+       else
+               guarantee_online_cpus(cs, cpus_attach);
+
+       guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
 {
        int err;
+       struct cpuset *cs = cgroup_cs(cont);
+
        /*
         * can_attach beforehand should guarantee that this doesn't fail.
         * TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
        err = set_cpus_allowed_ptr(tsk, cpus_attach);
        WARN_ON_ONCE(err);
 
-       cpuset_change_task_nodemask(tsk, to);
+       cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
        cpuset_update_task_spread_flag(cs, tsk);
-
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-                         struct cgroup *oldcont, struct task_struct *tsk,
-                         bool threadgroup)
+                         struct cgroup *oldcont, struct task_struct *tsk)
 {
        struct mm_struct *mm;
        struct cpuset *cs = cgroup_cs(cont);
        struct cpuset *oldcs = cgroup_cs(oldcont);
-       static nodemask_t to;           /* protected by cgroup_mutex */
 
-       if (cs == &top_cpuset) {
-               cpumask_copy(cpus_attach, cpu_possible_mask);
-       } else {
-               guarantee_online_cpus(cs, cpus_attach);
-       }
-       guarantee_online_mems(cs, &to);
-
-       /* do per-task migration stuff possibly for each in the threadgroup */
-       cpuset_attach_task(tsk, &to, cs);
-       if (threadgroup) {
-               struct task_struct *c;
-               rcu_read_lock();
-               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       cpuset_attach_task(c, &to, cs);
-               }
-               rcu_read_unlock();
-       }
-
-       /* change mm; only needs to be done once even if threadgroup */
-       to = cs->mems_allowed;
+       /*
+        * Change mm, possibly for multiple threads in a threadgroup. This is
+        * expensive and may sleep.
+        */
+       cpuset_attach_nodemask_from = oldcs->mems_allowed;
+       cpuset_attach_nodemask_to = cs->mems_allowed;
        mm = get_task_mm(tsk);
        if (mm) {
-               mpol_rebind_mm(mm, &to);
+               mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
                if (is_memory_migrate(cs))
-                       cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
+                       cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+                                         &cpuset_attach_nodemask_to);
                mmput(mm);
        }
 }
@@ -1911,6 +1904,9 @@ struct cgroup_subsys cpuset_subsys = {
        .create = cpuset_create,
        .destroy = cpuset_destroy,
        .can_attach = cpuset_can_attach,
+       .can_attach_task = cpuset_can_attach_task,
+       .pre_attach = cpuset_pre_attach,
+       .attach_task = cpuset_attach_task,
        .attach = cpuset_attach,
        .populate = cpuset_populate,
        .post_clone = cpuset_post_clone,
index 2d12893..5e43e9d 100644 (file)
@@ -8764,42 +8764,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        return 0;
 }
 
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                     struct task_struct *tsk, bool threadgroup)
-{
-       int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
-       if (retval)
-               return retval;
-       if (threadgroup) {
-               struct task_struct *c;
-               rcu_read_lock();
-               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       retval = cpu_cgroup_can_attach_task(cgrp, c);
-                       if (retval) {
-                               rcu_read_unlock();
-                               return retval;
-                       }
-               }
-               rcu_read_unlock();
-       }
-       return 0;
-}
-
 static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                 struct cgroup *old_cont, struct task_struct *tsk,
-                 bool threadgroup)
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
        sched_move_task(tsk);
-       if (threadgroup) {
-               struct task_struct *c;
-               rcu_read_lock();
-               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       sched_move_task(c);
-               }
-               rcu_read_unlock();
-       }
 }
 
 static void
@@ -8887,8 +8855,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
        .name           = "cpu",
        .create         = cpu_cgroup_create,
        .destroy        = cpu_cgroup_destroy,
-       .can_attach     = cpu_cgroup_can_attach,
-       .attach         = cpu_cgroup_attach,
+       .can_attach_task = cpu_cgroup_can_attach_task,
+       .attach_task    = cpu_cgroup_attach_task,
        .exit           = cpu_cgroup_exit,
        .populate       = cpu_cgroup_populate,
        .subsys_id      = cpu_cgroup_subsys_id,
index d5fd3dc..fc25992 100644 (file)
@@ -4953,8 +4953,7 @@ static void mem_cgroup_clear_mc(void)
 
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                                struct cgroup *cgroup,
-                               struct task_struct *p,
-                               bool threadgroup)
+                               struct task_struct *p)
 {
        int ret = 0;
        struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -4993,8 +4992,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 
 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
                                struct cgroup *cgroup,
-                               struct task_struct *p,
-                               bool threadgroup)
+                               struct task_struct *p)
 {
        mem_cgroup_clear_mc();
 }
@@ -5112,8 +5110,7 @@ retry:
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *cont,
                                struct cgroup *old_cont,
-                               struct task_struct *p,
-                               bool threadgroup)
+                               struct task_struct *p)
 {
        struct mm_struct *mm;
 
@@ -5131,22 +5128,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 #else  /* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                                struct cgroup *cgroup,
-                               struct task_struct *p,
-                               bool threadgroup)
+                               struct task_struct *p)
 {
        return 0;
 }
 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
                                struct cgroup *cgroup,
-                               struct task_struct *p,
-                               bool threadgroup)
+                               struct task_struct *p)
 {
 }
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *cont,
                                struct cgroup *old_cont,
-                               struct task_struct *p,
-                               bool threadgroup)
+                               struct task_struct *p)
 {
 }
 #endif
index 8d9c48f..cd1f779 100644 (file)
@@ -62,8 +62,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
 struct cgroup_subsys devices_subsys;
 
 static int devcgroup_can_attach(struct cgroup_subsys *ss,
-               struct cgroup *new_cgroup, struct task_struct *task,
-               bool threadgroup)
+               struct cgroup *new_cgroup, struct task_struct *task)
 {
        if (current != task && !capable(CAP_SYS_ADMIN))
                        return -EPERM;