*/
#include <linux/cgroup.h>
+#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/fs.h>
+#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
-#include <linux/smp_lock.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
+/*
+ * cgroup_mutex is the master lock. Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
+ * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
+ * release_agent_path and so on. Modifying requires both cgroup_mutex and
+ * cgroup_root_mutex. Readers can acquire either of the two. This is to
+ * break the following locking order cycle.
+ *
+ * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
+ * B. namespace_sem -> cgroup_mutex
+ *
+ * B happens only through cgroup_show_options() and using cgroup_root_mutex
+ * breaks it.
+ */
static DEFINE_MUTEX(cgroup_mutex);
+static DEFINE_MUTEX(cgroup_root_mutex);
/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* is called after synchronize_rcu(). But for safe use, css_is_removed()
* css_tryget() should be used for avoiding race.
*/
- struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state __rcu *css;
/*
* ID of this css.
*/
};
/*
- * cgroup_event represents events which userspace want to recieve.
+ * cgroup_event represents events which userspace wants to receive.
*/
struct cgroup_event {
/*
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
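+/*
+ * Is the CGRP_CLONE_CHILDREN flag set on @cgrp? New children of such a
+ * cgroup get ss->post_clone() called at creation time (see cgroup_create()).
+ */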
+static int clone_children(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+}
+
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
-static DEFINE_SPINLOCK(release_list_lock);
+static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
+/*
+ * A queue for waiters doing rmdir() on a cgroup. A task will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && some subsys still
+ * holds a reference to css->refcnt. In general, this refcnt is expected to
+ * go down to zero soon.
+ *
+ * The CGRP_WAIT_ON_RMDIR flag is set under the cgroup's inode->i_mutex.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
+{
+ if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+ wake_up_all(&cgroup_rmdir_waitq);
+}
+
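+/*
+ * cgroup_exclude_rmdir() pins a css so its cgroup stays busy across a
+ * blocking subsystem operation; cgroup_release_and_wakeup_rmdir() then
+ * wakes any task sleeping in rmdir() on CGRP_WAIT_ON_RMDIR and drops
+ * the pin.
+ */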
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+ css_get(css);
+}
+
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+ cgroup_wakeup_rmdir_waiter(css->cgroup);
+ css_put(css);
+}
+
/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
/*
return &css_set_table[index];
}
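+/*
+ * Tear down a dead css_set from a workqueue (scheduled by free_css_set_rcu()
+ * below) so it runs in process context: unlink every cg_cgroup_link from
+ * both lists, drop each cgroup's refcount (possibly making it releasable
+ * and waking rmdir() waiters), then free the links and the css_set itself.
+ */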
+static void free_css_set_work(struct work_struct *work)
+{
+ struct css_set *cg = container_of(work, struct css_set, work);
+ struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
+
+ write_lock(&css_set_lock);
+ list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+ cg_link_list) {
+ struct cgroup *cgrp = link->cgrp;
+ list_del(&link->cg_link_list);
+ list_del(&link->cgrp_link_list);
+ if (atomic_dec_and_test(&cgrp->count)) {
+ check_for_release(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ }
+ kfree(link);
+ }
+ write_unlock(&css_set_lock);
+
+ kfree(cg);
+}
+
static void free_css_set_rcu(struct rcu_head *obj)
{
struct css_set *cg = container_of(obj, struct css_set, rcu_head);
- kfree(cg);
+
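+ /* defer the actual teardown to process context, see free_css_set_work() */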
+ INIT_WORK(&cg->work, free_css_set_work);
+ schedule_work(&cg->work);
}
/* We don't maintain the lists running through each css_set to its
* compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;
-static void __put_css_set(struct css_set *cg, int taskexit)
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cg)
+{
+ atomic_inc(&cg->refcount);
+}
+
+static void put_css_set(struct css_set *cg)
{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
return;
}
- /* This css_set is dead. unlink it and release cgroup refcounts */
hlist_del(&cg->hlist);
css_set_count--;
- list_for_each_entry_safe(link, saved_link, &cg->cg_links,
- cg_link_list) {
- struct cgroup *cgrp = link->cgrp;
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- if (atomic_dec_and_test(&cgrp->count) &&
- notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
-
- kfree(link);
- }
-
write_unlock(&css_set_lock);
call_rcu(&cg->rcu_head, free_css_set_rcu);
}
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cg)
-{
- atomic_inc(&cg->refcount);
-}
-
-static inline void put_css_set(struct css_set *cg)
-{
- __put_css_set(cg, 0);
-}
-
-static inline void put_css_set_taskexit(struct css_set *cg)
-{
- __put_css_set(cg, 1);
-}
+/* We don't maintain the lists running through each css_set to its
+ * task until after the first call to cgroup_iter_start(). This
+ * reduces the fork()/exit() overhead for people who have cgroups
+ * compiled into their kernel but not actually in use */
+static int use_task_css_set_links __read_mostly;
/*
* compare_css_sets - helper function for find_existing_css_set().
* cgroup_attach_task(), which overwrites one task's cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
+ * task->cgroups without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroups pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
* -> cgroup_mkdir.
*/
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static const struct inode_operations cgroup_dir_inode_operations;
static int alloc_css_id(struct cgroup_subsys *ss,
struct cgroup *parent, struct cgroup *child);
-static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
+static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
for_each_subsys(cgrp->root, ss)
if (ss->pre_destroy) {
- ret = ss->pre_destroy(ss, cgrp);
+ ret = ss->pre_destroy(cgrp);
if (ret)
break;
}
return ret;
}
-static void free_cgroup_rcu(struct rcu_head *obj)
-{
- struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
-
- kfree(cgrp);
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
* Release the subsystem state objects.
*/
for_each_subsys(cgrp->root, ss)
- ss->destroy(ss, cgrp);
+ ss->destroy(cgrp);
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);
*/
BUG_ON(!list_empty(&cgrp->pidlists));
- call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
+ kfree_rcu(cgrp, rcu_head);
}
iput(inode);
}
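+/*
+ * ->d_delete() always returning 1 means cgroup dentries are dropped as soon
+ * as their last reference goes away instead of lingering in the dcache.
+ */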
+static int cgroup_delete(const struct dentry *d)
+{
+ return 1;
+}
+
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
struct list_head *node;
BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
- spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
node = dentry->d_subdirs.next;
while (node != &dentry->d_subdirs) {
struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
list_del_init(node);
if (d->d_inode) {
/* This should never be called on a cgroup
* directory with child cgroups */
BUG_ON(d->d_inode->i_mode & S_IFDIR);
- d = dget_locked(d);
- spin_unlock(&dcache_lock);
+ dget_dlock(d);
+ spin_unlock(&d->d_lock);
+ spin_unlock(&dentry->d_lock);
d_delete(d);
simple_unlink(dentry->d_inode, d);
dput(d);
- spin_lock(&dcache_lock);
- }
+ spin_lock(&dentry->d_lock);
+ } else
+ spin_unlock(&d->d_lock);
node = dentry->d_subdirs.next;
}
- spin_unlock(&dcache_lock);
+ spin_unlock(&dentry->d_lock);
}
/*
*/
static void cgroup_d_remove_dir(struct dentry *dentry)
{
+ struct dentry *parent;
+
cgroup_clear_directory(dentry);
- spin_lock(&dcache_lock);
+ parent = dentry->d_parent;
+ spin_lock(&parent->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dcache_lock);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&parent->d_lock);
remove_dir(dentry);
}
-/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
int i;
BUG_ON(!mutex_is_locked(&cgroup_mutex));
+ BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
removed_bits = root->actual_subsys_bits & ~final_bits;
added_bits = final_bits & ~root->actual_subsys_bits;
list_move(&ss->sibling, &root->subsys_list);
ss->root = root;
if (ss->bind)
- ss->bind(ss, cgrp);
+ ss->bind(cgrp);
mutex_unlock(&ss->hierarchy_mutex);
/* refcount was already taken, and we're keeping it */
} else if (bit & removed_bits) {
BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
mutex_lock(&ss->hierarchy_mutex);
if (ss->bind)
- ss->bind(ss, dummytop);
+ ss->bind(dummytop);
dummytop->subsys[i]->cgroup = dummytop;
cgrp->subsys[i] = NULL;
subsys[i]->root = &rootnode;
return 0;
}
-static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
{
- struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
+ struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
struct cgroup_subsys *ss;
- mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
for_each_subsys(root, ss)
seq_printf(seq, ",%s", ss->name);
if (test_bit(ROOT_NOPREFIX, &root->flags))
seq_puts(seq, ",noprefix");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ if (clone_children(&root->top_cgroup))
+ seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
- mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_root_mutex);
return 0;
}
unsigned long subsys_bits;
unsigned long flags;
char *release_agent;
+ bool clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
*/
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
- char *token, *o = data ?: "all";
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
unsigned long mask = (unsigned long)-1;
int i;
bool module_pin_failed = false;
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
- if (!strcmp(token, "all")) {
- /* Add all non-disabled subsystems */
- opts->subsys_bits = 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (!ss->disabled)
- opts->subsys_bits |= 1ul << i;
- }
- } else if (!strcmp(token, "none")) {
+ if (!strcmp(token, "none")) {
/* Explicitly have no subsystems */
opts->none = true;
- } else if (!strcmp(token, "noprefix")) {
+ continue;
+ }
+ if (!strcmp(token, "all")) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
- } else if (!strncmp(token, "release_agent=", 14)) {
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->clone_children = true;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
opts->release_agent =
- kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
+ kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
- } else if (!strncmp(token, "name=", 5)) {
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
/* Can't specify an empty name */
if (!strlen(name))
if (opts->name)
return -EINVAL;
opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN,
+ MAX_CGROUP_ROOT_NAMELEN - 1,
GFP_KERNEL);
if (!opts->name)
return -ENOMEM;
- } else {
- struct cgroup_subsys *ss;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- ss = subsys[i];
- if (ss == NULL)
- continue;
- if (!strcmp(token, ss->name)) {
- if (!ss->disabled)
- set_bit(i, &opts->subsys_bits);
- break;
- }
- }
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
+
+ continue;
+ }
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
+ if (strcmp(token, ss->name))
+ continue;
+ if (ss->disabled)
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ set_bit(i, &opts->subsys_bits);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
+ }
+
+ /*
+ * If the 'all' option was specified, select all the subsystems.
+ * Otherwise, if none of 'none', 'name=' or a subsystem name was
+ * specified, default to 'all'.
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name)) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
+ if (ss->disabled)
+ continue;
+ set_bit(i, &opts->subsys_bits);
}
}
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_sb_opts opts;
- lock_kernel();
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
- unlock_kernel();
return ret;
}
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
+ if (opts->clone_children)
+ set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
return root;
}
static int cgroup_get_rootdir(struct super_block *sb)
{
+ static const struct dentry_operations cgroup_dops = {
+ .d_iput = cgroup_diput,
+ .d_delete = cgroup_delete,
+ };
+
struct inode *inode =
cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
- struct dentry *dentry;
if (!inode)
return -ENOMEM;
inode->i_op = &cgroup_dir_inode_operations;
/* directories start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
- dentry = d_alloc_root(inode);
- if (!dentry) {
- iput(inode);
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
return -ENOMEM;
- }
- sb->s_root = dentry;
+ /* for everything else we want ->d_op set */
+ sb->s_d_op = &cgroup_dops;
return 0;
}
-static int cgroup_get_sb(struct file_system_type *fs_type,
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
- void *data, struct vfsmount *mnt)
+ void *data)
{
struct cgroup_sb_opts opts;
struct cgroupfs_root *root;
int ret = 0;
struct super_block *sb;
struct cgroupfs_root *new_root;
+ struct inode *inode;
/* First find the desired set of subsystems */
mutex_lock(&cgroup_mutex);
/* We used the new root structure, so this is a new hierarchy */
struct list_head tmp_cg_links;
struct cgroup *root_cgrp = &root->top_cgroup;
- struct inode *inode;
struct cgroupfs_root *existing_root;
+ const struct cred *cred;
int i;
BUG_ON(sb->s_root != NULL);
mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
- if (strlen(root->name)) {
- /* Check for name clashes with existing mounts */
- for_each_active_root(existing_root) {
- if (!strcmp(existing_root->name, root->name)) {
- ret = -EBUSY;
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto drop_new_super;
- }
- }
- }
+ /* Check for name clashes with existing mounts */
+ ret = -EBUSY;
+ if (strlen(root->name))
+ for_each_active_root(existing_root)
+ if (!strcmp(existing_root->name, root->name))
+ goto unlock_drop;
/*
* We're accessing css_set_count without locking
* have some link structures left over
*/
ret = allocate_cg_links(css_set_count, &tmp_cg_links);
- if (ret) {
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto drop_new_super;
- }
+ if (ret)
+ goto unlock_drop;
ret = rebind_subsystems(root, root->subsys_bits);
if (ret == -EBUSY) {
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
free_cg_links(&tmp_cg_links);
- goto drop_new_super;
+ goto unlock_drop;
}
/*
* There must be no failure case after here, since rebinding
BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
+ cred = override_creds(&init_cred);
cgroup_populate_dir(root_cgrp);
+ revert_creds(cred);
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
} else {
drop_parsed_module_refcounts(opts.subsys_bits);
}
- simple_set_mnt(mnt, sb);
kfree(opts.release_agent);
kfree(opts.name);
- return 0;
+ return dget(sb->s_root);
+ unlock_drop:
+ mutex_unlock(&cgroup_root_mutex);
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
drop_new_super:
deactivate_locked_super(sb);
drop_modules:
out_err:
kfree(opts.release_agent);
kfree(opts.name);
-
- return ret;
+ return ERR_PTR(ret);
}
static void cgroup_kill_sb(struct super_block *sb) {
BUG_ON(!list_empty(&cgrp->sibling));
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
/* Rebind all subsystems back to the default hierarchy */
ret = rebind_subsystems(root, 0);
root_count--;
}
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
kill_litter_super(sb);
static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
- .get_sb = cgroup_get_sb,
+ .mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
};
{
char *start;
struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
- rcu_read_lock_held() ||
cgroup_lock_is_held());
if (!dentry || cgrp == dummytop) {
break;
dentry = rcu_dereference_check(cgrp->dentry,
- rcu_read_lock_held() ||
cgroup_lock_is_held());
if (!cgrp->parent)
continue;
}
EXPORT_SYMBOL_GPL(cgroup_path);
+/*
+ * Control Group taskset
+ */
+struct task_and_cgroup {
+ struct task_struct *task;
+ struct cgroup *cgrp;
+ struct css_set *cg;
+};
+
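+/*
+ * A cgroup_taskset describes the tasks covered by one attach operation:
+ * either a single task (->single) or a flex_array snapshot of a whole
+ * threadgroup (->tc_array). Subsystem callbacks walk it with
+ * cgroup_taskset_first()/cgroup_taskset_next().
+ */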
+struct cgroup_taskset {
+ struct task_and_cgroup single;
+ struct flex_array *tc_array;
+ int tc_array_len;
+ int idx;
+ struct cgroup *cur_cgrp;
+};
+
+/**
+ * cgroup_taskset_first - reset taskset and return the first task
+ * @tset: taskset of interest
+ *
+ * @tset iteration is initialized and the first task is returned.
+ */
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
+{
+ if (tset->tc_array) {
+ tset->idx = 0;
+ return cgroup_taskset_next(tset);
+ } else {
+ tset->cur_cgrp = tset->single.cgrp;
+ return tset->single.task;
+ }
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_first);
+
+/**
+ * cgroup_taskset_next - iterate to the next task in taskset
+ * @tset: taskset of interest
+ *
+ * Return the next task in @tset. Iteration must have been initialized
+ * with cgroup_taskset_first().
+ */
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
+{
+ struct task_and_cgroup *tc;
+
+ if (!tset->tc_array || tset->idx >= tset->tc_array_len)
+ return NULL;
+
+ tc = flex_array_get(tset->tc_array, tset->idx++);
+ tset->cur_cgrp = tc->cgrp;
+ return tc->task;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_next);
+
+/**
+ * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
+ * @tset: taskset of interest
+ *
+ * Return the cgroup for the current (last returned) task of @tset. This
+ * function must be preceded by either cgroup_taskset_first() or
+ * cgroup_taskset_next().
+ */
+struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
+{
+ return tset->cur_cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+
+/**
+ * cgroup_taskset_size - return the number of tasks in taskset
+ * @tset: taskset of interest
+ */
+int cgroup_taskset_size(struct cgroup_taskset *tset)
+{
+ return tset->tc_array ? tset->tc_array_len : 1;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_size);
+
+
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * The caller must have already obtained a new css_set in @newcg (e.g. via
+ * find_css_set()), so this function itself cannot fail. Must be called with
+ * cgroup_mutex and threadgroup locked.
+ */
+static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, struct css_set *newcg)
+{
+ struct css_set *oldcg;
+
+ /*
+ * We are synchronized through threadgroup_lock() against PF_EXITING
+ * setting such that we can't race against cgroup_exit() changing the
+ * css_set to init_css_set and dropping the old one.
+ */
+ WARN_ON_ONCE(tsk->flags & PF_EXITING);
+ oldcg = tsk->cgroups;
+
+ task_lock(tsk);
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
* @tsk: the task to be attached
*
- * Call holding cgroup_mutex. May take task_lock of
- * the task 'tsk' during call.
+ * Call with cgroup_mutex and threadgroup locked. May take task_lock of
+ * @tsk during call.
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
int retval = 0;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
+ struct cgroup_taskset tset = { };
+ struct css_set *newcg;
+ struct css_set *cg;
+
+ /* @tsk either already exited or can't exit until the end */
+ if (tsk->flags & PF_EXITING)
+ return -ESRCH;
/* Nothing to do if the task is already in that cgroup */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
return 0;
+ tset.single.task = tsk;
+ tset.single.cgrp = oldcgrp;
+
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(cgrp, &tset);
+ if (retval) {
+ /*
+ * Remember on which subsystem the can_attach()
+ * failed, so that we only call cancel_attach()
+ * against the subsystems whose can_attach()
+ * succeeded. (See below)
+ */
+ failed_ss = ss;
+ goto out;
+ }
+ }
+ }
+
+ newcg = find_css_set(tsk->cgroups, cgrp);
+ if (!newcg) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ task_lock(tsk);
+ cg = tsk->cgroups;
+ get_css_set(cg);
+ task_unlock(tsk);
+
+ cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
+
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(cgrp, &tset);
+ }
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ /* put_css_set will not destroy cg until after an RCU grace period */
+ put_css_set(cg);
+
+ /*
+ * wake up rmdir() waiter. the rmdir should fail since the cgroup
+ * is no longer empty.
+ */
+ cgroup_wakeup_rmdir_waiter(cgrp);
+out:
+ if (retval) {
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss)
+ /*
+ * This subsystem was the one that failed the
+ * can_attach() check earlier, so we don't need
+ * to call cancel_attach() against it or any
+ * remaining subsystems.
+ */
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(cgrp, &tset);
+ }
+ }
+ return retval;
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+ struct cgroupfs_root *root;
+ int retval = 0;
+
+ cgroup_lock();
+ for_each_active_root(root) {
+ struct cgroup *from_cg = task_cgroup_from_root(from, root);
+
+ retval = cgroup_attach_task(from_cg, tsk);
+ if (retval)
+ break;
+ }
+ cgroup_unlock();
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
+ * task_lock of each thread in leader's threadgroup individually in turn.
+ */
+static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval, i, group_size;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ /* guaranteed to be initialized later, but the compiler needs this */
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor and array */
+ struct task_struct *tsk;
+ struct task_and_cgroup *tc;
+ struct flex_array *group;
+ struct cgroup_taskset tset = { };
+
+ /*
+ * step 0: in order to do expensive, possibly blocking operations for
+ * every thread, we cannot iterate the thread group list, since it needs
+ * rcu or tasklist locked. instead, build an array of all threads in the
+ * group - group_rwsem prevents new threads from appearing, and if
+ * threads exit, this will just be an over-estimate.
+ */
+ group_size = get_nr_threads(leader);
+ /* flex_array supports very large thread-groups better than kmalloc. */
+ group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
+ if (!group)
+ return -ENOMEM;
+ /* pre-allocate to guarantee space while iterating in rcu read-side. */
+ retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ if (retval)
+ goto out_free_group_list;
+
+ tsk = leader;
+ i = 0;
+ /*
+ * Prevent freeing of tasks while we take a snapshot. Tasks that are
+ * already PF_EXITING could be freed from underneath us unless we
+ * take an rcu_read_lock.
+ */
+ rcu_read_lock();
+ do {
+ struct task_and_cgroup ent;
+
+ /* @tsk either already exited or can't exit until the end */
+ if (tsk->flags & PF_EXITING)
+ continue;
+
+ /* as per above, nr_threads may decrease, but not increase. */
+ BUG_ON(i >= group_size);
+ ent.task = tsk;
+ ent.cgrp = task_cgroup_from_root(tsk, root);
+ /* nothing to do if this task is already in the cgroup */
+ if (ent.cgrp == cgrp)
+ continue;
+ /*
+ * saying GFP_ATOMIC has no effect here because we did prealloc
+ * earlier, but it's good form to communicate our expectations.
+ */
+ retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
+ BUG_ON(retval != 0);
+ i++;
+ } while_each_thread(leader, tsk);
+ rcu_read_unlock();
+ /* remember the number of threads in the array for later. */
+ group_size = i;
+ tset.tc_array = group;
+ tset.tc_array_len = group_size;
+
+ /* methods shouldn't be called if no task is actually migrating */
+ retval = 0;
+ if (!group_size)
+ goto out_free_group_list;
+
+ /*
+ * step 1: check that we can legitimately attach to the cgroup.
+ */
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk, false);
+ retval = ss->can_attach(cgrp, &tset);
if (retval) {
- /*
- * Remember on which subsystem the can_attach()
- * failed, so that we only call cancel_attach()
- * against the subsystems whose can_attach()
- * succeeded. (See below)
- */
failed_ss = ss;
- goto out;
+ goto out_cancel_attach;
}
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
/*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
+ * step 2: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
*/
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
- goto out;
+ for (i = 0; i < group_size; i++) {
+ tc = flex_array_get(group, i);
+ tc->cg = find_css_set(tc->task->cgroups, cgrp);
+ if (!tc->cg) {
+ retval = -ENOMEM;
+ goto out_put_css_set_refs;
+ }
}
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
+ /*
+ * step 3: now that we're guaranteed success wrt the css_sets,
+ * proceed to move all tasks to the new cgroup. There are no
+ * failure cases after here, so this is the commit point.
+ */
+ for (i = 0; i < group_size; i++) {
+ tc = flex_array_get(group, i);
+ cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
}
- write_unlock(&css_set_lock);
+ /* nothing is sensitive to fork() after this point. */
+ /*
+ * step 4: do subsystem attach callbacks.
+ */
for_each_subsys(root, ss) {
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ ss->attach(cgrp, &tset);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
- synchronize_rcu();
- put_css_set(cg);
/*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
+ * step 5: success! and cleanup
*/
+ synchronize_rcu();
cgroup_wakeup_rmdir_waiter(cgrp);
-out:
+ retval = 0;
+out_put_css_set_refs:
+ if (retval) {
+ for (i = 0; i < group_size; i++) {
+ tc = flex_array_get(group, i);
+ if (!tc->cg)
+ break;
+ put_css_set(tc->cg);
+ }
+ }
+out_cancel_attach:
if (retval) {
for_each_subsys(root, ss) {
if (ss == failed_ss)
- /*
- * This subsystem was the one that failed the
- * can_attach() check earlier, so we don't need
- * to call cancel_attach() against it or any
- * remaining subsystems.
- */
break;
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, tsk, false);
+ ss->cancel_attach(cgrp, &tset);
}
}
+out_free_group_list:
+ flex_array_free(group);
return retval;
}
-/**
- * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_current_cg(struct task_struct *tsk)
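+/*
+ * cgroup_allow_attach() lets the subsystems attached to @cgrp's hierarchy
+ * override the default permission check in attach_task_by_pid(): every
+ * subsystem must implement ->allow_attach() and approve @tset, otherwise
+ * the attach is rejected with -EACCES.
+ */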
+static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
- struct cgroupfs_root *root;
- struct cgroup *cur_cg;
- int retval = 0;
+ struct cgroup_subsys *ss;
+ int ret;
- cgroup_lock();
- for_each_active_root(root) {
- cur_cg = task_cgroup_from_root(current, root);
- retval = cgroup_attach_task(cur_cg, tsk);
- if (retval)
- break;
+ for_each_subsys(cgrp->root, ss) {
+ if (ss->allow_attach) {
+ ret = ss->allow_attach(cgrp, tset);
+ if (ret)
+ return ret;
+ } else {
+ return -EACCES;
+ }
}
- cgroup_unlock();
- return retval;
+ return 0;
}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will lock
+ * cgroup_mutex and threadgroup; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
+retry_find_task:
+ rcu_read_lock();
if (pid) {
- rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
rcu_read_unlock();
- return -ESRCH;
+ ret = -ESRCH;
+ goto out_unlock_cgroup;
}
-
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
- rcu_read_unlock();
- return -EACCES;
+ /*
+ * if the default permission check fails, give each
+ * cgroup a chance to extend the permission check
+ */
+ struct cgroup_taskset tset = { };
+ tset.single.task = tsk;
+ tset.single.cgrp = cgrp;
+ ret = cgroup_allow_attach(cgrp, &tset);
+ if (ret) {
+ rcu_read_unlock();
+ cgroup_unlock();
+ return ret;
+ }
}
- get_task_struct(tsk);
- rcu_read_unlock();
- } else {
+ } else
tsk = current;
- get_task_struct(tsk);
- }
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup)
+ tsk = tsk->group_leader;
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ threadgroup_lock(tsk);
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * a race with de_thread from another thread's exec()
+ * may strip us of our leadership, if this happens,
+ * there is no choice but to throw this task away and
+ * try again; this is
+ * "double-double-toil-and-trouble-check locking".
+ */
+ threadgroup_unlock(tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ ret = cgroup_attach_proc(cgrp, tsk);
+ } else
+ ret = cgroup_attach_task(cgrp, tsk);
+ threadgroup_unlock(tsk);
+
put_task_struct(tsk);
+out_unlock_cgroup:
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
- int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
- return ret;
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
+ return attach_task_by_pid(cgrp, tgid, true);
}
/**
const char *buffer)
{
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ if (strlen(buffer) >= PATH_MAX)
+ return -EINVAL;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
+ mutex_lock(&cgroup_root_mutex);
strcpy(cgrp->root->release_agent_path, buffer);
+ mutex_unlock(&cgroup_root_mutex);
cgroup_unlock();
return 0;
}
};
static const struct inode_operations cgroup_dir_inode_operations = {
- .lookup = simple_lookup,
+ .lookup = cgroup_lookup,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.rename = cgroup_rename,
};
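+/*
+ * Directory ->lookup(): control files are created and pinned in the dcache
+ * up front, so a name that reaches here doesn't exist - just add a negative
+ * dentry. The dentry_operations come from sb->s_d_op, set up in
+ * cgroup_get_rootdir().
+ */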
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+ d_add(dentry, NULL);
+ return NULL;
+}
+
/*
* Check if a file is a control file
*/
return __d_cft(file->f_dentry);
}
-static int cgroup_create_file(struct dentry *dentry, mode_t mode,
+static int cgroup_create_file(struct dentry *dentry, umode_t mode,
struct super_block *sb)
{
- static const struct dentry_operations cgroup_dops = {
- .d_iput = cgroup_diput,
- };
-
struct inode *inode;
if (!dentry)
inode->i_size = 0;
inode->i_fop = &cgroup_file_operations;
}
- dentry->d_op = &cgroup_dops;
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
return 0;
* @mode: mode to set on new directory.
*/
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- mode_t mode)
+ umode_t mode)
{
struct dentry *parent;
int error = 0;
* returns S_IRUGO if it has only a read handler
* returns S_IWUSR if it has only a write handler
*/
-static mode_t cgroup_file_mode(const struct cftype *cft)
+static umode_t cgroup_file_mode(const struct cftype *cft)
{
- mode_t mode = 0;
+ umode_t mode = 0;
if (cft->mode)
return cft->mode;
struct dentry *dir = cgrp->dentry;
struct dentry *dentry;
int error;
- mode_t mode;
+ umode_t mode;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
* using their cgroups capability, we don't maintain the lists running
* through each css_set to its tasks until we see the list actually
* used - in other words after the first call to cgroup_iter_start().
- *
- * The tasklist_lock is not held here, as do_each_thread() and
- * while_each_thread() are protected by RCU.
*/
static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
write_lock(&css_set_lock);
use_task_css_set_links = 1;
+ /*
+ * We need tasklist_lock because RCU is not safe against
+ * while_each_thread(). Besides, a forking task that has passed
+ * cgroup_post_fork() without seeing use_task_css_set_links = 1
+ * is not guaranteed to have its child immediately visible in the
+ * tasklist if we walk through it with RCU.
+ */
+ read_lock(&tasklist_lock);
do_each_thread(g, p) {
task_lock(p);
/*
list_add(&p->cg_list, &p->cgroups->tasks);
task_unlock(p);
} while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
write_unlock(&css_set_lock);
}
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
+ __acquires(css_set_lock)
{
/*
* The first time anyone tries to iterate across a cgroup,
}
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
+ __releases(css_set_lock)
{
read_unlock(&css_set_lock);
}
*
*/
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* how many files are using the current array */
+ int use_count;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* protects the other fields */
+ struct rw_semaphore mutex;
+};
+
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
}
/* the process need read permission on control file */
- ret = file_permission(cfile, MAY_READ);
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
if (ret < 0)
goto fail;
return ret;
}
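+/* read/write handlers for the "cgroup.clone_children" control file */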
+static u64 cgroup_clone_children_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ return clone_children(cgrp);
+}
+
+static int cgroup_clone_children_write(struct cgroup *cgrp,
+ struct cftype *cft,
+ u64 val)
+{
+ if (val)
+ set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ else
+ clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ return 0;
+}
+
/*
* for the common functions, 'private' gives the type of file
*/
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
};
static struct cftype cft_release_agent = {
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
- mode_t mode)
+ umode_t mode)
{
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ if (clone_children(parent))
+ set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+
for_each_subsys(root, ss) {
- struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+ struct cgroup_subsys_state *css = ss->create(cgrp);
if (IS_ERR(css)) {
err = PTR_ERR(css);
goto err_destroy;
}
/* At error, ->destroy() callback has to free assigned ID. */
+ if (clone_children(parent) && ss->post_clone)
+ ss->post_clone(cgrp);
}
cgroup_lock_hierarchy(root);
if (err < 0)
goto err_remove;
+ set_bit(CGRP_RELEASABLE, &parent->flags);
+
/* The cgroup directory was pre-locked for us */
BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
for_each_subsys(root, ss) {
if (cgrp->subsys[ss->subsys_id])
- ss->destroy(ss, cgrp);
+ ss->destroy(cgrp);
}
mutex_unlock(&cgroup_mutex);
return err;
}
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct cgroup *c_parent = dentry->d_parent->d_fsdata;
return !failed;
}
+/* checks if all of the css_sets attached to a cgroup have a refcount of 0.
+ * Must be called with css_set_lock held */
+static int cgroup_css_sets_empty(struct cgroup *cgrp)
+{
+ struct cg_cgroup_link *link;
+
+ list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
+ struct css_set *cg = link->cg;
+ if (atomic_read(&cg->refcount) > 0)
+ return 0;
+ }
+
+ return 1;
+}
+
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
/* the vfs holds both inode->i_mutex already */
again:
mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
+ if (!cgroup_css_sets_empty(cgrp)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+ if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) {
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
- list_del(&cgrp->release_list);
- spin_unlock(&release_list_lock);
+ list_del_init(&cgrp->release_list);
+ raw_spin_unlock(&release_list_lock);
cgroup_lock_hierarchy(cgrp->root);
/* delete this cgroup from parent->children */
- list_del(&cgrp->sibling);
+ list_del_init(&cgrp->sibling);
cgroup_unlock_hierarchy(cgrp->root);
- spin_lock(&cgrp->dentry->d_lock);
d = dget(cgrp->dentry);
- spin_unlock(&d->d_lock);
cgroup_d_remove_dir(d);
dput(d);
- set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
/*
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &rootnode.subsys_list);
ss->root = &rootnode;
- css = ss->create(ss, dummytop);
+ css = ss->create(dummytop);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_cgroup_css(css, ss, dummytop);
* no ss->create seems to need anything important in the ss struct, so
* this can happen first (i.e. before the rootnode attachment).
*/
- css = ss->create(ss, dummytop);
+ css = ss->create(dummytop);
if (IS_ERR(css)) {
/* failure case - need to deassign the subsys[] slot. */
subsys[i] = NULL;
int ret = cgroup_init_idr(ss, css);
if (ret) {
dummytop->subsys[ss->subsys_id] = NULL;
- ss->destroy(ss, dummytop);
+ ss->destroy(dummytop);
subsys[i] = NULL;
mutex_unlock(&cgroup_mutex);
return ret;
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
- list_del(&ss->sibling);
+ list_del_init(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummytop. as
* pointer to find their state. note that this also takes care of
* freeing the css_id.
*/
- ss->destroy(ss, dummytop);
+ ss->destroy(dummytop);
dummytop->subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
*
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. cgroup_attach_task() might
- * have already changed current->cgroups, allowing the previously
- * referenced cgroup group to be removed and freed.
+ * it was not made under the protection of RCU, cgroup_mutex or
+ * threadgroup_change_begin(), so it might no longer be a valid
+ * cgroup pointer. cgroup_attach_task() might have already changed
+ * current->cgroups, allowing the previously referenced cgroup
+ * group to be removed and freed.
+ *
+ * Outside the pointer validity we also need to process the css_set
+ * inheritance between threadgroup_change_begin() and
+ * threadgroup_change_end(); this way no process-wide migration performed
+ * by cgroup_attach_proc() can miss a thread because it is too early or
+ * too late in the fork stage.
*
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
void cgroup_fork(struct task_struct *child)
{
- task_lock(current);
+ /*
+ * We don't need to task_lock() current because current->cgroups
+ * can't be changed concurrently here. The parent obviously hasn't
+ * exited and called cgroup_exit(), and we are synchronized against
+ * cgroup migration through threadgroup_change_begin().
+ */
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
- task_unlock(current);
INIT_LIST_HEAD(&child->cg_list);
}
for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->fork)
- ss->fork(ss, child);
+ ss->fork(child);
}
}
}
*/
void cgroup_post_fork(struct task_struct *child)
{
+ /*
+ * use_task_css_set_links is set to 1 before we walk the tasklist
+ * under the tasklist_lock and we read it here after we added the child
+ * to the tasklist under the tasklist_lock as well. If the child wasn't
+ * yet in the tasklist when we walked through it from
+ * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
+ * should be visible now due to the paired locking and barriers implied
+ * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
+ * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
+ * lock on fork.
+ */
if (use_task_css_set_links) {
write_lock(&css_set_lock);
- task_lock(child);
- if (list_empty(&child->cg_list))
+ if (list_empty(&child->cg_list)) {
+ /*
+ * It's safe to use child->cgroups without task_lock()
+ * here because we are protected through
+ * threadgroup_change_begin() against concurrent
+ * css_set change in cgroup_task_migrate(). Also
+ * the task can't exit at that point until
+ * wake_up_new_task() is called, so we are protected
+ * against cgroup_exit() setting child->cgroups to
+ * init_css_set.
+ */
list_add(&child->cg_list, &child->cgroups->tasks);
- task_unlock(child);
+ }
write_unlock(&css_set_lock);
}
}
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
- int i;
struct css_set *cg;
-
- if (run_callbacks && need_forkexit_callback) {
- /*
- * modular subsystems can't use callbacks, so no need to lock
- * the subsys array
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->exit)
- ss->exit(ss, tsk);
- }
- }
+ int i;
/*
* Unlink from the css_set task list if necessary.
if (!list_empty(&tsk->cg_list)) {
write_lock(&css_set_lock);
if (!list_empty(&tsk->cg_list))
- list_del(&tsk->cg_list);
+ list_del_init(&tsk->cg_list);
write_unlock(&css_set_lock);
}
task_lock(tsk);
cg = tsk->cgroups;
tsk->cgroups = &init_css_set;
- task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
-}
-
-/**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
- char *nodename)
-{
- struct dentry *dentry;
- int ret = 0;
- struct cgroup *parent, *child;
- struct inode *inode;
- struct css_set *cg;
- struct cgroupfs_root *root;
- struct cgroup_subsys *ss;
- /* We shouldn't be called by an unregistered subsystem */
- BUG_ON(!subsys->active);
-
- /* First figure out what hierarchy and cgroup we're dealing
- * with, and pin them so we can drop cgroup_mutex */
- mutex_lock(&cgroup_mutex);
- again:
- root = subsys->root;
- if (root == &rootnode) {
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /* Pin the hierarchy */
- if (!atomic_inc_not_zero(&root->sb->s_active)) {
- /* We race with the final deactivate_super() */
- mutex_unlock(&cgroup_mutex);
- return 0;
+ if (run_callbacks && need_forkexit_callback) {
+ /*
+ * modular subsystems can't use callbacks, so no need to lock
+ * the subsys array
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->exit) {
+ struct cgroup *old_cgrp =
+ rcu_dereference_raw(cg->subsys[i])->cgroup;
+ struct cgroup *cgrp = task_cgroup(tsk, i);
+ ss->exit(cgrp, old_cgrp, tsk);
+ }
+ }
}
-
- /* Keep the cgroup alive */
- task_lock(tsk);
- parent = task_cgroup(tsk, subsys->subsys_id);
- cg = tsk->cgroups;
- get_css_set(cg);
task_unlock(tsk);
- mutex_unlock(&cgroup_mutex);
-
- /* Now do the VFS work to create a cgroup */
- inode = parent->dentry->d_inode;
-
- /* Hold the parent directory mutex across this operation to
- * stop anyone else deleting the new cgroup */
- mutex_lock(&inode->i_mutex);
- dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
- if (IS_ERR(dentry)) {
- printk(KERN_INFO
- "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
- PTR_ERR(dentry));
- ret = PTR_ERR(dentry);
- goto out_release;
- }
-
- /* Create the cgroup directory, which also creates the cgroup */
- ret = vfs_mkdir(inode, dentry, 0755);
- child = __d_cgrp(dentry);
- dput(dentry);
- if (ret) {
- printk(KERN_INFO
- "Failed to create cgroup %s: %d\n", nodename,
- ret);
- goto out_release;
- }
-
- /* The cgroup now exists. Retake cgroup_mutex and check
- * that we're still in the same state that we thought we
- * were. */
- mutex_lock(&cgroup_mutex);
- if ((root != subsys->root) ||
- (parent != task_cgroup(tsk, subsys->subsys_id))) {
- /* Aargh, we raced ... */
- mutex_unlock(&inode->i_mutex);
+ if (cg)
put_css_set(cg);
-
- deactivate_super(root->sb);
- /* The cgroup is still accessible in the VFS, but
- * we're not going to try to rmdir() it at this
- * point. */
- printk(KERN_INFO
- "Race in cgroup_clone() - leaking cgroup %s\n",
- nodename);
- goto again;
- }
-
- /* do any required auto-setup */
- for_each_subsys(root, ss) {
- if (ss->post_clone)
- ss->post_clone(ss, child);
- }
-
- /* All seems fine. Finish by moving the task into the new cgroup */
- ret = cgroup_attach_task(child, tsk);
- mutex_unlock(&cgroup_mutex);
-
- out_release:
- mutex_unlock(&inode->i_mutex);
-
- mutex_lock(&cgroup_mutex);
- put_css_set(cg);
- mutex_unlock(&cgroup_mutex);
- deactivate_super(root->sb);
- return ret;
}
/**
* already queued for a userspace notification, queue
* it now */
int need_schedule_work = 0;
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
if (!cgroup_is_removed(cgrp) &&
list_empty(&cgrp->release_list)) {
list_add(&cgrp->release_list, &release_list);
need_schedule_work = 1;
}
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
if (need_schedule_work)
schedule_work(&release_agent_work);
}
}
+/* Caller must verify that the css is not for root cgroup */
+void __css_get(struct cgroup_subsys_state *css, int count)
+{
+ atomic_add(count, &css->refcnt);
+ set_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+}
+EXPORT_SYMBOL_GPL(__css_get);
+
/* Caller must verify that the css is not for root cgroup */
void __css_put(struct cgroup_subsys_state *css, int count)
{
rcu_read_lock();
val = atomic_sub_return(count, &css->refcnt);
if (val == 1) {
- if (notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
+ check_for_release(cgrp);
cgroup_wakeup_rmdir_waiter(cgrp);
}
rcu_read_unlock();
{
BUG_ON(work != &release_agent_work);
mutex_lock(&cgroup_mutex);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
continue_free:
kfree(pathbuf);
kfree(agentbuf);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
}
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
mutex_unlock(&cgroup_mutex);
}
* on this or this is under rcu_read_lock(). Once css->id is allocated,
* it's unchanged until freed.
*/
- cssid = rcu_dereference_check(css->id,
- rcu_read_lock_held() || atomic_read(&css->refcnt));
+ cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
if (cssid)
return cssid->id;
{
struct css_id *cssid;
- cssid = rcu_dereference_check(css->id,
- rcu_read_lock_held() || atomic_read(&css->refcnt));
+ cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
if (cssid)
return cssid->depth;
return ret;
}
-static void __free_css_id_cb(struct rcu_head *head)
-{
- struct css_id *id;
-
- id = container_of(head, struct css_id, rcu_head);
- kfree(id);
-}
-
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
struct css_id *id = css->id;
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, id->id);
spin_unlock(&ss->id_lock);
- call_rcu(&id->rcu_head, __free_css_id_cb);
+ kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);
return NULL;
BUG_ON(!ss->use_id);
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
/* fill start point for scan */
tmpid = id;
while (1) {
* scan next entry from bitmap(tree), tmpid is updated after
* idr_get_next().
*/
- spin_lock(&ss->id_lock);
tmp = idr_get_next(&ss->idr, &tmpid);
- spin_unlock(&ss->id_lock);
-
if (!tmp)
break;
if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
return ret;
}
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+ struct cgroup *cgrp;
+ struct inode *inode;
+ struct cgroup_subsys_state *css;
+
+ inode = f->f_dentry->d_inode;
+ /* check in cgroup filesystem dir */
+ if (inode->i_op != &cgroup_dir_inode_operations)
+ return ERR_PTR(-EBADF);
+
+ if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+ return ERR_PTR(-EINVAL);
+
+ /* get cgroup */
+ cgrp = __d_cgrp(f->f_dentry);
+ css = cgrp->subsys[id];
+ return css ? css : ERR_PTR(-ENOENT);
+}
+
#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
- struct cgroup *cont)
+static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
{
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
return css;
}
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void debug_destroy(struct cgroup *cont)
{
kfree(cont->subsys[debug_subsys_id]);
}