nv-tegra.nvidia Code Review - linux-2.6.git/blobdiff - kernel/cgroup.c
media: tegra: Tegra videobuf2
[linux-2.6.git] / kernel / cgroup.c
index e31b220a743deda483fe4ec300254affd76f87e0..54a36fe288f0e5406305b28f483618955450f9f3 100644 (file)
  */
 
 #include <linux/cgroup.h>
+#include <linux/cred.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/init_task.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/mm.h>
@@ -57,8 +59,9 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 static DEFINE_MUTEX(cgroup_mutex);
 
@@ -157,7 +160,7 @@ struct css_id {
 };
 
 /*
- * cgroup_event represents events which userspace want to recieve.
+ * cgroup_event represents events which userspace want to receive.
  */
 struct cgroup_event {
        /*
@@ -267,6 +270,33 @@ static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
+/*
+ * A queue for waiters to do rmdir() on a cgroup. A task will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && a subsys still holds
+ * a reference on css->refcnt. In general, this refcnt is expected to go down
+ * to zero soon.
+ *
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
+{
+       if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+               wake_up_all(&cgroup_rmdir_waitq);
+}
+
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+       css_get(css);
+}
+
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+       cgroup_wakeup_rmdir_waiter(css->cgroup);
+       css_put(css);
+}
+
 /* Link structure for associating css_set objects with cgroups */
 struct cg_cgroup_link {
        /*
@@ -326,10 +356,35 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
        return &css_set_table[index];
 }
 
+static void free_css_set_work(struct work_struct *work)
+{
+       struct css_set *cg = container_of(work, struct css_set, work);
+       struct cg_cgroup_link *link;
+       struct cg_cgroup_link *saved_link;
+
+       write_lock(&css_set_lock);
+       list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+                                cg_link_list) {
+               struct cgroup *cgrp = link->cgrp;
+               list_del(&link->cg_link_list);
+               list_del(&link->cgrp_link_list);
+               if (atomic_dec_and_test(&cgrp->count)) {
+                       check_for_release(cgrp);
+                       cgroup_wakeup_rmdir_waiter(cgrp);
+               }
+               kfree(link);
+       }
+       write_unlock(&css_set_lock);
+
+       kfree(cg);
+}
+
 static void free_css_set_rcu(struct rcu_head *obj)
 {
        struct css_set *cg = container_of(obj, struct css_set, rcu_head);
-       kfree(cg);
+
+       INIT_WORK(&cg->work, free_css_set_work);
+       schedule_work(&cg->work);
 }
 
 /* We don't maintain the lists running through each css_set to its
@@ -338,10 +393,16 @@ static void free_css_set_rcu(struct rcu_head *obj)
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-static void __put_css_set(struct css_set *cg, int taskexit)
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cg)
+{
+       atomic_inc(&cg->refcount);
+}
+
+static void put_css_set(struct css_set *cg)
 {
-       struct cg_cgroup_link *link;
-       struct cg_cgroup_link *saved_link;
        /*
         * Ensure that the refcount doesn't hit zero while any readers
         * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -355,47 +416,13 @@ static void __put_css_set(struct css_set *cg, int taskexit)
                return;
        }
 
-       /* This css_set is dead. unlink it and release cgroup refcounts */
        hlist_del(&cg->hlist);
        css_set_count--;
 
-       list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-                                cg_link_list) {
-               struct cgroup *cgrp = link->cgrp;
-               list_del(&link->cg_link_list);
-               list_del(&link->cgrp_link_list);
-               if (atomic_dec_and_test(&cgrp->count) &&
-                   notify_on_release(cgrp)) {
-                       if (taskexit)
-                               set_bit(CGRP_RELEASABLE, &cgrp->flags);
-                       check_for_release(cgrp);
-               }
-
-               kfree(link);
-       }
-
        write_unlock(&css_set_lock);
        call_rcu(&cg->rcu_head, free_css_set_rcu);
 }
 
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cg)
-{
-       atomic_inc(&cg->refcount);
-}
-
-static inline void put_css_set(struct css_set *cg)
-{
-       __put_css_set(cg, 0);
-}
-
-static inline void put_css_set_taskexit(struct css_set *cg)
-{
-       __put_css_set(cg, 1);
-}
-
 /*
  * compare_css_sets - helper function for find_existing_css_set().
  * @cg: candidate css_set being tested
@@ -725,9 +752,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
  * another.  It does so using cgroup_mutex, however there are
  * several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
+ * task->cgroups without the expense of grabbing a system global
  * mutex.  Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroups pointer we use
  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
  * the task_struct routinely used for such matters.
  *
@@ -812,13 +839,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
        return ret;
 }
 
-static void free_cgroup_rcu(struct rcu_head *obj)
-{
-       struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
-
-       kfree(cgrp);
-}
-
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
        /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                 */
                BUG_ON(!list_empty(&cgrp->pidlists));
 
-               call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
+               kfree_rcu(cgrp, rcu_head);
        }
        iput(inode);
 }
@@ -923,33 +943,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
        remove_dir(dentry);
 }
 
-/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
-       if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
-               wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
-       css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
-       cgroup_wakeup_rmdir_waiter(css->cgroup);
-       css_put(css);
-}
-
 /*
  * Call with cgroup_mutex held. Drops reference counts on modules, including
  * any duplicate ones that parse_cgroupfs_options took. If this function
@@ -1185,10 +1178,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
        /*
         * If the 'all' option was specified select all the subsystems,
-        * otherwise 'all, 'none' and a subsystem name options were not
-        * specified, let's default to 'all'
+        * otherwise if none of the 'none', 'name=' or subsystem name options
+        * were specified, let's default to 'all'
         */
-       if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+       if (all_ss || (!one_ss && !opts->none && !opts->name)) {
                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss == NULL)
@@ -1526,6 +1519,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                struct cgroup *root_cgrp = &root->top_cgroup;
                struct inode *inode;
                struct cgroupfs_root *existing_root;
+               const struct cred *cred;
                int i;
 
                BUG_ON(sb->s_root != NULL);
@@ -1605,7 +1599,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                BUG_ON(!list_empty(&root_cgrp->children));
                BUG_ON(root->number_of_cgroups != 1);
 
+               cred = override_creds(&init_cred);
                cgroup_populate_dir(root_cgrp);
+               revert_creds(cred);
                mutex_unlock(&cgroup_mutex);
                mutex_unlock(&inode->i_mutex);
        } else {
@@ -1709,7 +1705,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
        char *start;
        struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
-                                                     rcu_read_lock_held() ||
                                                      cgroup_lock_is_held());
 
        if (!dentry || cgrp == dummytop) {
@@ -1735,7 +1730,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
                        break;
 
                dentry = rcu_dereference_check(cgrp->dentry,
-                                              rcu_read_lock_held() ||
                                               cgroup_lock_is_held());
                if (!cgrp->parent)
                        continue;
@@ -1748,6 +1742,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM or -ESRCH. If set, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+                              struct task_struct *tsk, bool guarantee)
+{
+       struct css_set *oldcg;
+       struct css_set *newcg;
+
+       /*
+        * get old css_set. we need to take task_lock and refcount it, because
+        * an exiting task can change its css_set to init_css_set and drop its
+        * old one without taking cgroup_mutex.
+        */
+       task_lock(tsk);
+       oldcg = tsk->cgroups;
+       get_css_set(oldcg);
+       task_unlock(tsk);
+
+       /* locate or allocate a new css_set for this task. */
+       if (guarantee) {
+               /* we know the css_set we want already exists. */
+               struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+               read_lock(&css_set_lock);
+               newcg = find_existing_css_set(oldcg, cgrp, template);
+               BUG_ON(!newcg);
+               get_css_set(newcg);
+               read_unlock(&css_set_lock);
+       } else {
+               might_sleep();
+               /* find_css_set will give us newcg already referenced. */
+               newcg = find_css_set(oldcg, cgrp);
+               if (!newcg) {
+                       put_css_set(oldcg);
+                       return -ENOMEM;
+               }
+       }
+       put_css_set(oldcg);
+
+       /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+       task_lock(tsk);
+       if (tsk->flags & PF_EXITING) {
+               task_unlock(tsk);
+               put_css_set(newcg);
+               return -ESRCH;
+       }
+       rcu_assign_pointer(tsk->cgroups, newcg);
+       task_unlock(tsk);
+
+       /* Update the css_set linked lists if we're using them */
+       write_lock(&css_set_lock);
+       if (!list_empty(&tsk->cg_list))
+               list_move(&tsk->cg_list, &newcg->tasks);
+       write_unlock(&css_set_lock);
+
+       /*
+        * We just gained a reference on oldcg by taking it from the task. As
+        * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+        * it here; it will be freed under RCU.
+        */
+       put_css_set(oldcg);
+
+       set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+       return 0;
+}
+
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
@@ -1758,12 +1822,11 @@ EXPORT_SYMBOL_GPL(cgroup_path);
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-       int retval = 0;
+       int retval;
        struct cgroup_subsys *ss, *failed_ss = NULL;
        struct cgroup *oldcgrp;
-       struct css_set *cg;
-       struct css_set *newcg;
        struct cgroupfs_root *root = cgrp->root;
+       struct css_set *cg;
 
        /* Nothing to do if the task is already in that cgroup */
        oldcgrp = task_cgroup_from_root(tsk, root);
@@ -1772,7 +1835,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
        for_each_subsys(root, ss) {
                if (ss->can_attach) {
-                       retval = ss->can_attach(ss, cgrp, tsk, false);
+                       retval = ss->can_attach(ss, cgrp, tsk);
                        if (retval) {
                                /*
                                 * Remember on which subsystem the can_attach()
@@ -1784,45 +1847,34 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                                goto out;
                        }
                }
+               if (ss->can_attach_task) {
+                       retval = ss->can_attach_task(cgrp, tsk);
+                       if (retval) {
+                               failed_ss = ss;
+                               goto out;
+                       }
+               }
        }
 
        task_lock(tsk);
        cg = tsk->cgroups;
        get_css_set(cg);
        task_unlock(tsk);
-       /*
-        * Locate or allocate a new css_set for this task,
-        * based on its final set of cgroups
-        */
-       newcg = find_css_set(cg, cgrp);
-       put_css_set(cg);
-       if (!newcg) {
-               retval = -ENOMEM;
-               goto out;
-       }
 
-       task_lock(tsk);
-       if (tsk->flags & PF_EXITING) {
-               task_unlock(tsk);
-               put_css_set(newcg);
-               retval = -ESRCH;
+       retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+       if (retval)
                goto out;
-       }
-       rcu_assign_pointer(tsk->cgroups, newcg);
-       task_unlock(tsk);
-
-       /* Update the css_set linked lists if we're using them */
-       write_lock(&css_set_lock);
-       if (!list_empty(&tsk->cg_list))
-               list_move(&tsk->cg_list, &newcg->tasks);
-       write_unlock(&css_set_lock);
 
        for_each_subsys(root, ss) {
+               if (ss->pre_attach)
+                       ss->pre_attach(cgrp);
+               if (ss->attach_task)
+                       ss->attach_task(cgrp, tsk);
                if (ss->attach)
-                       ss->attach(ss, cgrp, oldcgrp, tsk, false);
+                       ss->attach(ss, cgrp, oldcgrp, tsk);
        }
-       set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
-       synchronize_rcu();
+       set_bit(CGRP_RELEASABLE, &cgrp->flags);
+       /* put_css_set will not destroy cg until after an RCU grace period */
        put_css_set(cg);
 
        /*
@@ -1842,7 +1894,7 @@ out:
                                 */
                                break;
                        if (ss->cancel_attach)
-                               ss->cancel_attach(ss, cgrp, tsk, false);
+                               ss->cancel_attach(ss, cgrp, tsk);
                }
        }
        return retval;
@@ -1873,49 +1925,390 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+       struct css_set *cg;
+       struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+                                 struct task_struct *tsk, struct css_set *cg,
+                                 struct list_head *newcg_list)
+{
+       struct css_set *newcg;
+       struct cg_list_entry *cg_entry;
+       struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+       read_lock(&css_set_lock);
+       newcg = find_existing_css_set(cg, cgrp, template);
+       if (newcg)
+               get_css_set(newcg);
+       read_unlock(&css_set_lock);
+
+       /* doesn't exist at all? */
+       if (!newcg)
+               return false;
+       /* see if it's already in the list */
+       list_for_each_entry(cg_entry, newcg_list, links) {
+               if (cg_entry->cg == newcg) {
+                       put_css_set(newcg);
+                       return true;
+               }
+       }
+
+       /* not found */
+       put_css_set(newcg);
+       return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
  */
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+                           struct list_head *newcg_list)
+{
+       struct css_set *newcg;
+       struct cg_list_entry *cg_entry;
+
+       /* ensure a new css_set will exist for this thread */
+       newcg = find_css_set(cg, cgrp);
+       if (!newcg)
+               return -ENOMEM;
+       /* add it to the list */
+       cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+       if (!cg_entry) {
+               put_css_set(newcg);
+               return -ENOMEM;
+       }
+       cg_entry->cg = newcg;
+       list_add(&cg_entry->links, newcg_list);
+       return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+       int retval, i, group_size;
+       struct cgroup_subsys *ss, *failed_ss = NULL;
+       bool cancel_failed_ss = false;
+       /* guaranteed to be initialized later, but the compiler needs this */
+       struct cgroup *oldcgrp = NULL;
+       struct css_set *oldcg;
+       struct cgroupfs_root *root = cgrp->root;
+       /* threadgroup list cursor and array */
+       struct task_struct *tsk;
+       struct flex_array *group;
+       /*
+        * we need to make sure we have css_sets for all the tasks we're
+        * going to move -before- we actually start moving them, so that in
+        * case we get an ENOMEM we can bail out before making any changes.
+        */
+       struct list_head newcg_list;
+       struct cg_list_entry *cg_entry, *temp_nobe;
+
+       /*
+        * step 0: in order to do expensive, possibly blocking operations for
+        * every thread, we cannot iterate the thread group list, since it needs
+        * rcu or tasklist locked. instead, build an array of all threads in the
+        * group - threadgroup_fork_lock prevents new threads from appearing,
+        * and if threads exit, this will just be an over-estimate.
+        */
+       group_size = get_nr_threads(leader);
+       /* flex_array supports very large thread-groups better than kmalloc. */
+       group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+                                GFP_KERNEL);
+       if (!group)
+               return -ENOMEM;
+       /* pre-allocate to guarantee space while iterating in rcu read-side. */
+       retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+       if (retval)
+               goto out_free_group_list;
+
+       /* prevent changes to the threadgroup list while we take a snapshot. */
+       rcu_read_lock();
+       if (!thread_group_leader(leader)) {
+               /*
+                * a race with de_thread from another thread's exec() may strip
+                * us of our leadership, making while_each_thread unsafe to use
+                * on this task. if this happens, there is no choice but to
+                * throw this task away and try again (from cgroup_procs_write);
+                * this is "double-double-toil-and-trouble-check locking".
+                */
+               rcu_read_unlock();
+               retval = -EAGAIN;
+               goto out_free_group_list;
+       }
+       /* take a reference on each task in the group to go in the array. */
+       tsk = leader;
+       i = 0;
+       do {
+               /* as per above, nr_threads may decrease, but not increase. */
+               BUG_ON(i >= group_size);
+               get_task_struct(tsk);
+               /*
+                * saying GFP_ATOMIC has no effect here because we did prealloc
+                * earlier, but it's good form to communicate our expectations.
+                */
+               retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+               BUG_ON(retval != 0);
+               i++;
+       } while_each_thread(leader, tsk);
+       /* remember the number of threads in the array for later. */
+       group_size = i;
+       rcu_read_unlock();
+
+       /*
+        * step 1: check that we can legitimately attach to the cgroup.
+        */
+       for_each_subsys(root, ss) {
+               if (ss->can_attach) {
+                       retval = ss->can_attach(ss, cgrp, leader);
+                       if (retval) {
+                               failed_ss = ss;
+                               goto out_cancel_attach;
+                       }
+               }
+               /* a callback to be run on every thread in the threadgroup. */
+               if (ss->can_attach_task) {
+                       /* run on each task in the threadgroup. */
+                       for (i = 0; i < group_size; i++) {
+                               tsk = flex_array_get_ptr(group, i);
+                               retval = ss->can_attach_task(cgrp, tsk);
+                               if (retval) {
+                                       failed_ss = ss;
+                                       cancel_failed_ss = true;
+                                       goto out_cancel_attach;
+                               }
+                       }
+               }
+       }
+
+       /*
+        * step 2: make sure css_sets exist for all threads to be migrated.
+        * we use find_css_set, which allocates a new one if necessary.
+        */
+       INIT_LIST_HEAD(&newcg_list);
+       for (i = 0; i < group_size; i++) {
+               tsk = flex_array_get_ptr(group, i);
+               /* nothing to do if this task is already in the cgroup */
+               oldcgrp = task_cgroup_from_root(tsk, root);
+               if (cgrp == oldcgrp)
+                       continue;
+               /* get old css_set pointer */
+               task_lock(tsk);
+               oldcg = tsk->cgroups;
+               get_css_set(oldcg);
+               task_unlock(tsk);
+               /* see if the new one for us is already in the list? */
+               if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+                       /* was already there, nothing to do. */
+                       put_css_set(oldcg);
+               } else {
+                       /* we don't already have it. get new one. */
+                       retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+                       put_css_set(oldcg);
+                       if (retval)
+                               goto out_list_teardown;
+               }
+       }
+
+       /*
+        * step 3: now that we're guaranteed success wrt the css_sets, proceed
+        * to move all tasks to the new cgroup, calling ss->attach_task for each
+        * one along the way. there are no failure cases after here, so this is
+        * the commit point.
+        */
+       for_each_subsys(root, ss) {
+               if (ss->pre_attach)
+                       ss->pre_attach(cgrp);
+       }
+       for (i = 0; i < group_size; i++) {
+               tsk = flex_array_get_ptr(group, i);
+               /* leave current thread as it is if it's already there */
+               oldcgrp = task_cgroup_from_root(tsk, root);
+               if (cgrp == oldcgrp)
+                       continue;
+               /* attach each task to each subsystem */
+               for_each_subsys(root, ss) {
+                       if (ss->attach_task)
+                               ss->attach_task(cgrp, tsk);
+               }
+               /* if the thread is PF_EXITING, it can just get skipped. */
+               retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+               BUG_ON(retval != 0 && retval != -ESRCH);
+       }
+       /* nothing is sensitive to fork() after this point. */
+
+       /*
+        * step 4: do expensive, non-thread-specific subsystem callbacks.
+        * TODO: if ever a subsystem needs to know the oldcgrp for each task
+        * being moved, this call will need to be reworked to communicate that.
+        */
+       for_each_subsys(root, ss) {
+               if (ss->attach)
+                       ss->attach(ss, cgrp, oldcgrp, leader);
+       }
+
+       /*
+        * step 5: success! and cleanup
+        */
+       synchronize_rcu();
+       cgroup_wakeup_rmdir_waiter(cgrp);
+       retval = 0;
+out_list_teardown:
+       /* clean up the list of prefetched css_sets. */
+       list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+               list_del(&cg_entry->links);
+               put_css_set(cg_entry->cg);
+               kfree(cg_entry);
+       }
+out_cancel_attach:
+       /* same deal as in cgroup_attach_task */
+       if (retval) {
+               for_each_subsys(root, ss) {
+                       if (ss == failed_ss) {
+                               if (cancel_failed_ss && ss->cancel_attach)
+                                       ss->cancel_attach(ss, cgrp, leader);
+                               break;
+                       }
+                       if (ss->cancel_attach)
+                               ss->cancel_attach(ss, cgrp, leader);
+               }
+       }
+       /* clean up the array of referenced threads in the group. */
+       for (i = 0; i < group_size; i++) {
+               tsk = flex_array_get_ptr(group, i);
+               put_task_struct(tsk);
+       }
+out_free_group_list:
+       flex_array_free(group);
+       return retval;
+}
+
+static int cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
+{
+       struct cgroup_subsys *ss;
+       int ret;
+
+       for_each_subsys(cgrp->root, ss) {
+               if (ss->allow_attach) {
+                       ret = ss->allow_attach(cgrp, tsk);
+                       if (ret)
+                               return ret;
+               } else {
+                       return -EACCES;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
+ */
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
        struct task_struct *tsk;
        const struct cred *cred = current_cred(), *tcred;
        int ret;
 
+       if (!cgroup_lock_live_group(cgrp))
+               return -ENODEV;
+
        if (pid) {
                rcu_read_lock();
                tsk = find_task_by_vpid(pid);
-               if (!tsk || tsk->flags & PF_EXITING) {
+               if (!tsk) {
                        rcu_read_unlock();
+                       cgroup_unlock();
+                       return -ESRCH;
+               }
+               if (threadgroup) {
+                       /*
+                        * RCU protects this access, since tsk was found in the
+                        * tid map. a race with de_thread may cause group_leader
+                        * to stop being the leader, but cgroup_attach_proc will
+                        * detect it later.
+                        */
+                       tsk = tsk->group_leader;
+               } else if (tsk->flags & PF_EXITING) {
+                       /* optimization for the single-task-only case */
+                       rcu_read_unlock();
+                       cgroup_unlock();
                        return -ESRCH;
                }
 
+               /*
+                * even if we're attaching all tasks in the thread group, we
+                * only need to check permissions on one of them.
+                */
                tcred = __task_cred(tsk);
                if (cred->euid &&
                    cred->euid != tcred->uid &&
                    cred->euid != tcred->suid) {
-                       rcu_read_unlock();
-                       return -EACCES;
+                       /*
+                        * if the default permission check fails, give each
+                        * subsystem a chance to extend the permission check
+                        */
+                       ret = cgroup_allow_attach(cgrp, tsk);
+                       if (ret) {
+                               rcu_read_unlock();
+                               cgroup_unlock();
+                               return ret;
+                       }
                }
                get_task_struct(tsk);
                rcu_read_unlock();
        } else {
-               tsk = current;
+               if (threadgroup)
+                       tsk = current->group_leader;
+               else
+                       tsk = current;
                get_task_struct(tsk);
        }
 
-       ret = cgroup_attach_task(cgrp, tsk);
+       if (threadgroup) {
+               threadgroup_fork_write_lock(tsk);
+               ret = cgroup_attach_proc(cgrp, tsk);
+               threadgroup_fork_write_unlock(tsk);
+       } else {
+               ret = cgroup_attach_task(cgrp, tsk);
+       }
        put_task_struct(tsk);
+       cgroup_unlock();
        return ret;
 }
 
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+{
+       return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
 {
        int ret;
-       if (!cgroup_lock_live_group(cgrp))
-               return -ENODEV;
-       ret = attach_task_by_pid(cgrp, pid);
-       cgroup_unlock();
+       do {
+               /*
+                * attach_proc fails with -EAGAIN if threadgroup leadership
+                * changes in the middle of the operation, in which case we need
+                * to find the task_struct for the new leader and start over.
+                */
+               ret = attach_task_by_pid(cgrp, tgid, true);
+       } while (ret == -EAGAIN);
        return ret;
 }
 
@@ -3182,7 +3575,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
        }
 
        /* the process need read permission on control file */
-       ret = file_permission(cfile, MAY_READ);
+       /* AV: shouldn't we check that it's been opened for read instead? */
+       ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
        if (ret < 0)
                goto fail;
 
@@ -3272,9 +3666,9 @@ static struct cftype files[] = {
        {
                .name = CGROUP_FILE_GENERIC_PREFIX "procs",
                .open = cgroup_procs_open,
-               /* .write_u64 = cgroup_procs_write, TODO */
+               .write_u64 = cgroup_procs_write,
                .release = cgroup_pidlist_release,
-               .mode = S_IRUGO,
+               .mode = S_IRUGO | S_IWUSR,
        },
        {
                .name = "notify_on_release",
@@ -3450,6 +3844,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        if (err < 0)
                goto err_remove;
 
+       set_bit(CGRP_RELEASABLE, &parent->flags);
+
        /* The cgroup directory was pre-locked for us */
        BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
@@ -3581,6 +3977,21 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
        return !failed;
 }
 
+/* Returns 1 if no css_set linked to @cgrp has a positive refcount, else 0.
+ * Must be called with css_set_lock held (NOTE(review): confirm callers do) */
+static int cgroup_css_sets_empty(struct cgroup *cgrp)
+{
+       struct cg_cgroup_link *link;
+
+       list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
+               struct css_set *cg = link->cg;
+               if (atomic_read(&cg->refcount) > 0) /* references remain */
+                       return 0;
+       }
+
+       return 1;
+}
+
+
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
        struct cgroup *cgrp = dentry->d_fsdata;
@@ -3593,7 +4004,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        /* the vfs holds both inode->i_mutex already */
 again:
        mutex_lock(&cgroup_mutex);
-       if (atomic_read(&cgrp->count) != 0) {
+       if (!cgroup_css_sets_empty(cgrp)) {
                mutex_unlock(&cgroup_mutex);
                return -EBUSY;
        }
@@ -3626,7 +4037,7 @@ again:
 
        mutex_lock(&cgroup_mutex);
        parent = cgrp->parent;
-       if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+       if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) {
                clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
                mutex_unlock(&cgroup_mutex);
                return -EBUSY;
@@ -3666,7 +4077,6 @@ again:
        cgroup_d_remove_dir(d);
        dput(d);
 
-       set_bit(CGRP_RELEASABLE, &parent->flags);
        check_for_release(parent);
 
        /*
@@ -4266,123 +4676,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        task_unlock(tsk);
 
        if (cg)
-               put_css_set_taskexit(cg);
-}
-
-/**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
-                                                       char *nodename)
-{
-       struct dentry *dentry;
-       int ret = 0;
-       struct cgroup *parent, *child;
-       struct inode *inode;
-       struct css_set *cg;
-       struct cgroupfs_root *root;
-       struct cgroup_subsys *ss;
-
-       /* We shouldn't be called by an unregistered subsystem */
-       BUG_ON(!subsys->active);
-
-       /* First figure out what hierarchy and cgroup we're dealing
-        * with, and pin them so we can drop cgroup_mutex */
-       mutex_lock(&cgroup_mutex);
- again:
-       root = subsys->root;
-       if (root == &rootnode) {
-               mutex_unlock(&cgroup_mutex);
-               return 0;
-       }
-
-       /* Pin the hierarchy */
-       if (!atomic_inc_not_zero(&root->sb->s_active)) {
-               /* We race with the final deactivate_super() */
-               mutex_unlock(&cgroup_mutex);
-               return 0;
-       }
-
-       /* Keep the cgroup alive */
-       task_lock(tsk);
-       parent = task_cgroup(tsk, subsys->subsys_id);
-       cg = tsk->cgroups;
-       get_css_set(cg);
-       task_unlock(tsk);
-
-       mutex_unlock(&cgroup_mutex);
-
-       /* Now do the VFS work to create a cgroup */
-       inode = parent->dentry->d_inode;
-
-       /* Hold the parent directory mutex across this operation to
-        * stop anyone else deleting the new cgroup */
-       mutex_lock(&inode->i_mutex);
-       dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
-       if (IS_ERR(dentry)) {
-               printk(KERN_INFO
-                      "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
-                      PTR_ERR(dentry));
-               ret = PTR_ERR(dentry);
-               goto out_release;
-       }
-
-       /* Create the cgroup directory, which also creates the cgroup */
-       ret = vfs_mkdir(inode, dentry, 0755);
-       child = __d_cgrp(dentry);
-       dput(dentry);
-       if (ret) {
-               printk(KERN_INFO
-                      "Failed to create cgroup %s: %d\n", nodename,
-                      ret);
-               goto out_release;
-       }
-
-       /* The cgroup now exists. Retake cgroup_mutex and check
-        * that we're still in the same state that we thought we
-        * were. */
-       mutex_lock(&cgroup_mutex);
-       if ((root != subsys->root) ||
-           (parent != task_cgroup(tsk, subsys->subsys_id))) {
-               /* Aargh, we raced ... */
-               mutex_unlock(&inode->i_mutex);
                put_css_set(cg);
-
-               deactivate_super(root->sb);
-               /* The cgroup is still accessible in the VFS, but
-                * we're not going to try to rmdir() it at this
-                * point. */
-               printk(KERN_INFO
-                      "Race in cgroup_clone() - leaking cgroup %s\n",
-                      nodename);
-               goto again;
-       }
-
-       /* do any required auto-setup */
-       for_each_subsys(root, ss) {
-               if (ss->post_clone)
-                       ss->post_clone(ss, child);
-       }
-
-       /* All seems fine. Finish by moving the task into the new cgroup */
-       ret = cgroup_attach_task(child, tsk);
-       mutex_unlock(&cgroup_mutex);
-
- out_release:
-       mutex_unlock(&inode->i_mutex);
-
-       mutex_lock(&cgroup_mutex);
-       put_css_set(cg);
-       mutex_unlock(&cgroup_mutex);
-       deactivate_super(root->sb);
-       return ret;
 }
 
 /**
@@ -4435,6 +4729,14 @@ static void check_for_release(struct cgroup *cgrp)
        }
 }
 
+/* Caller must verify that the css is not for the root cgroup */
+void __css_get(struct cgroup_subsys_state *css, int count)
+{
+       atomic_add(count, &css->refcnt);        /* take @count references */
+       set_bit(CGRP_RELEASABLE, &css->cgroup->flags); /* mark its cgroup */
+}
+EXPORT_SYMBOL_GPL(__css_get);
+
 /* Caller must verify that the css is not for root cgroup */
 void __css_put(struct cgroup_subsys_state *css, int count)
 {
@@ -4443,10 +4745,7 @@ void __css_put(struct cgroup_subsys_state *css, int count)
        rcu_read_lock();
        val = atomic_sub_return(count, &css->refcnt);
        if (val == 1) {
-               if (notify_on_release(cgrp)) {
-                       set_bit(CGRP_RELEASABLE, &cgrp->flags);
-                       check_for_release(cgrp);
-               }
+               check_for_release(cgrp);
                cgroup_wakeup_rmdir_waiter(cgrp);
        }
        rcu_read_unlock();
@@ -4569,8 +4868,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
         * on this or this is under rcu_read_lock(). Once css->id is allocated,
         * it's unchanged until freed.
         */
-       cssid = rcu_dereference_check(css->id,
-                       rcu_read_lock_held() || atomic_read(&css->refcnt));
+       cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
 
        if (cssid)
                return cssid->id;
@@ -4582,8 +4880,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 {
        struct css_id *cssid;
 
-       cssid = rcu_dereference_check(css->id,
-                       rcu_read_lock_held() || atomic_read(&css->refcnt));
+       cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
 
        if (cssid)
                return cssid->depth;
@@ -4623,14 +4920,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
        return ret;
 }
 
-static void __free_css_id_cb(struct rcu_head *head)
-{
-       struct css_id *id;
-
-       id = container_of(head, struct css_id, rcu_head);
-       kfree(id);
-}
-
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 {
        struct css_id *id = css->id;
@@ -4645,7 +4934,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
        spin_lock(&ss->id_lock);
        idr_remove(&ss->idr, id->id);
        spin_unlock(&ss->id_lock);
-       call_rcu(&id->rcu_head, __free_css_id_cb);
+       kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);