cgroup: Remove call to synchronize_rcu in cgroup_attach_task
Colin Cross [Mon, 16 Apr 2012 07:01:13 +0000 (12:01 +0530)]
synchronize_rcu can be very expensive, averaging 100 ms in
some cases.  cgroup_attach_task uses it to keep a
task->cgroups pointer that is dereferenced in an RCU
read-side critical section from being invalidated: the call
to put_css_set is delayed until after an RCU grace period.
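
Concretely, the pattern being paid for looks like this
(illustrative sketch, not verbatim code from this tree; tsk,
oldcg and newcg stand in for locals in cgroup_attach_task):

	/* reader: tsk->cgroups is only stable inside an RCU
	 * read-side critical section */
	rcu_read_lock();
	cg = rcu_dereference(tsk->cgroups);
	/* ... use cg ... */
	rcu_read_unlock();

	/* old writer path in cgroup_attach_task: */
	rcu_assign_pointer(tsk->cgroups, newcg);
	synchronize_rcu();	/* can stall ~100 ms per attach */
	put_css_set(oldcg);	/* no reader can still see oldcg */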

To avoid the call to synchronize_rcu, make put_css_set
RCU-safe by moving the deletion of the css_set links into
free_css_set_work, which is scheduled by the RCU callback
free_css_set_rcu.
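
The resulting two-stage teardown has the following generic
shape (a sketch using a hypothetical struct foo and the usual
<linux/rcupdate.h>, <linux/workqueue.h> and <linux/slab.h>
declarations; the patch applies the same idea to struct
css_set):

	struct foo {
		struct rcu_head rcu_head;
		struct work_struct work;
	};

	static void foo_free_work(struct work_struct *work)
	{
		struct foo *f = container_of(work, struct foo, work);

		/* process context: safe to take locks and wake
		 * any waiters before freeing */
		kfree(f);
	}

	static void foo_free_rcu(struct rcu_head *head)
	{
		struct foo *f = container_of(head, struct foo, rcu_head);

		/* RCU callbacks run in softirq context, so punt
		 * the teardown work to a workqueue */
		INIT_WORK(&f->work, foo_free_work);
		schedule_work(&f->work);
	}

	/* on the final put, instead of synchronize_rcu() + free: */
	call_rcu(&f->rcu_head, foo_free_rcu);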

The decrement of the cgroup refcount is no longer
synchronous with the call to put_css_set, which can result
in the cgroup refcount staying positive after the last call
to cgroup_attach_task returns.  To allow the cgroup to be
deleted with cgroup_rmdir synchronously after
cgroup_attach_task, have rmdir check the refcount of all
associated css_sets.  If cgroup_rmdir is called on a cgroup
for which the css_sets all have refcount zero but the
cgroup refcount is nonzero, reuse the rmdir waitqueue to
block the rmdir until free_css_set_work is called.
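
The refcount check used by rmdir is part of this patch but not
shown in the excerpt below; it looks roughly like this (sketch,
details may differ in this backport):

	static int cgroup_css_sets_empty(struct cgroup *cgrp)
	{
		struct cg_cgroup_link *link;

		read_lock(&css_set_lock);
		list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
			struct css_set *cg = link->cg;

			if (atomic_read(&cg->refcount) > 0) {
				read_unlock(&css_set_lock);
				return 0;
			}
		}
		read_unlock(&css_set_lock);
		return 1;
	}

When this reports all css_set refcounts as zero but the cgroup
refcount is still elevated, rmdir sleeps on the existing rmdir
waitqueue; cgroup_wakeup_rmdir_waiter, invoked from
free_css_set_work in the diff below, unblocks it.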

Signed-off-by: Colin Cross <ccross@android.com>

Conflicts:

kernel/cgroup.c

Change-Id: I3b3f245c8f5e2e5d33f1e54178b2bb6ef10a0817

Signed-off-by: Varun Wadekar <vwadekar@nvidia.com>

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 78635ff..6c0e1c1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -373,6 +373,37 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
        return &css_set_table[index];
 }
 
+static void free_css_set_work(struct work_struct *work)
+{
+       struct css_set *cg = container_of(work, struct css_set, work);
+       struct cg_cgroup_link *link;
+       struct cg_cgroup_link *saved_link;
+
+       write_lock(&css_set_lock);
+       list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+                                cg_link_list) {
+               struct cgroup *cgrp = link->cgrp;
+               list_del(&link->cg_link_list);
+               list_del(&link->cgrp_link_list);
+               if (atomic_dec_and_test(&cgrp->count)) {
+                       check_for_release(cgrp);
+                       cgroup_wakeup_rmdir_waiter(cgrp);
+               }
+               kfree(link);
+       }
+       write_unlock(&css_set_lock);
+
+       kfree(cg);
+}
+
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+       struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+
+       INIT_WORK(&cg->work, free_css_set_work);
+       schedule_work(&cg->work);
+}
+
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
@@ -389,22 +420,24 @@ static inline void get_css_set(struct css_set *cg)
 
 static void put_css_set(struct css_set *cg)
 {
-       struct cg_cgroup_link *link;
-       struct cg_cgroup_link *saved_link;
-
+       /*
+        * Ensure that the refcount doesn't hit zero while any readers
+        * can see it. Similar to atomic_dec_and_lock(), but for an
+        * rwlock
+        */
+       if (atomic_add_unless(&cg->refcount, -1, 1))
+               return;
        write_lock(&css_set_lock);
-       list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-                                cg_link_list) {
-               struct cgroup *cgrp = link->cgrp;
-               list_del(&link->cg_link_list);
-               list_del(&link->cgrp_link_list);
-               if (atomic_dec_and_test(&cgrp->count))
-                       check_for_release(cgrp);
-               kfree(link);
+       if (!atomic_dec_and_test(&cg->refcount)) {
+               write_unlock(&css_set_lock);
+               return;
        }
-       write_unlock(&css_set_lock);
 
-       kfree(cg);
+       hlist_del(&cg->hlist);
+       css_set_count--;
+
+       write_unlock(&css_set_lock);
+       call_rcu(&cg->rcu_head, free_css_set_rcu);
 }
 
 static void free_css_set_rcu(struct rcu_head *obj)
@@ -1874,6 +1906,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        struct cgroupfs_root *root = cgrp->root;
        struct cgroup_taskset tset = { };
        struct css_set *newcg;
+       struct css_set *cg;
 
        /* @tsk either already exited or can't exit until the end */
        if (tsk->flags & PF_EXITING)
@@ -1909,15 +1942,20 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                goto out;
        }
 
+       task_lock(tsk);
+       cg = tsk->cgroups;
+       get_css_set(cg);
+       task_unlock(tsk);
+
        cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
 
        for_each_subsys(root, ss) {
                if (ss->attach)
                        ss->attach(cgrp, &tset);
        }
-
        set_bit(CGRP_RELEASABLE, &cgrp->flags);
-       synchronize_rcu();
+       /* put_css_set will not destroy cg until after an RCU grace period */
+       put_css_set(cg);
 
        /*
         * wake up rmdir() waiter. the rmdir should fail since the cgroup