#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/sort.h>
#include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
struct cpuset, css);
}
+#ifdef CONFIG_NUMA
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return task->mempolicy;
+}
+#else
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return false;
+}
+#endif
+
+
/* bits in struct cpuset flags field */
typedef enum {
CS_CPU_EXCLUSIVE,
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
-static int cpuset_get_sb(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data, struct vfsmount *mnt)
+static struct dentry *cpuset_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name, void *data)
{
struct file_system_type *cgroup_fs = get_fs_type("cgroup");
- int ret = -ENODEV;
+ struct dentry *ret = ERR_PTR(-ENODEV);
if (cgroup_fs) {
char mountopts[] =
"cpuset,noprefix,"
"release_agent=/sbin/cpuset_release_agent";
- ret = cgroup_fs->get_sb(cgroup_fs, flags,
- unused_dev_name, mountopts, mnt);
+ ret = cgroup_fs->mount(cgroup_fs, flags,
+ unused_dev_name, mountopts);
put_filesystem(cgroup_fs);
}
return ret;
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
- .get_sb = cpuset_get_sb,
+ .mount = cpuset_mount,
};
/*
* are online. If none are online, walk up the cpuset hierarchy
* until we find one that does have some online cpus. If we get
* all the way to the top and still haven't found any online cpus,
- * return cpu_online_map. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
+ * task, return cpu_online_mask.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
*
* Call with callback_mutex held.
*/
int retval;
int is_load_balanced;
- /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
-repeat:
+ bool need_loop;
+
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
return;
task_lock(tsk);
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
-
/*
- * ensure checking ->mems_allowed_change_disable after setting all new
- * allowed nodes.
- *
- * the read-side task can see an nodemask with new allowed nodes and
- * old allowed nodes. and if it allocates page when cpuset clears newly
- * disallowed ones continuous, it can see the new allowed bits.
- *
- * And if setting all new allowed nodes is after the checking, setting
- * all new allowed nodes and clearing newly disallowed ones will be done
- * continuous, and the read-side task may find no node to alloc page.
+ * Determine if a loop is necessary if another thread is doing
+ * get_mems_allowed(). If at least one node remains unchanged and
+ * tsk does not have a mempolicy, then an empty nodemask will not be
+ * possible when mems_allowed is larger than a word.
*/
- smp_mb();
+ need_loop = task_has_mempolicy(tsk) ||
+ !nodes_intersects(*newmems, tsk->mems_allowed);
- /*
- * Allocation of memory is very fast, we needn't sleep when waiting
- * for the read-side.
- */
- while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
- task_unlock(tsk);
- if (!task_curr(tsk))
- yield();
- goto repeat;
- }
+ if (need_loop)
+ write_seqcount_begin(&tsk->mems_allowed_seq);
- /*
- * ensure checking ->mems_allowed_change_disable before clearing all new
- * disallowed nodes.
- *
- * if clearing newly disallowed bits before the checking, the read-side
- * task may find no node to alloc page.
- */
- smp_mb();
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
+
+ if (need_loop)
+ write_seqcount_end(&tsk->mems_allowed_seq);
+
task_unlock(tsk);
}
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
- if (!newmems)
- return;
+ static nodemask_t newmems; /* protected by cgroup_mutex */
cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, newmems);
+ guarantee_online_mems(cs, &newmems);
- cpuset_change_task_nodemask(p, newmems);
-
- NODEMASK_FREE(newmems);
+ cpuset_change_task_nodemask(p, &newmems);
mm = get_task_mm(p);
if (!mm)
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
- if (val < -1 || val >= SD_LV_MAX)
+ if (val < -1 || val >= sched_domain_level_max)
return -EINVAL;
#endif
return val;
}
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk, bool threadgroup)
+ struct task_struct *tsk)
{
- int ret;
struct cpuset *cs = cgroup_cs(cont);
+ if ((current != task) && (!capable(CAP_SYS_ADMIN))) {
+ const struct cred *cred = current_cred(), *tcred;
+
+ if (cred->euid != tcred->uid && cred->euid != tcred->suid)
+ return -EPERM;
+ }
+
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC;
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk, 0, NULL);
- if (ret)
- return ret;
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c, 0, NULL);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
- }
return 0;
}
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
- struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
{
- int err;
- /*
- * can_attach beforehand should guarantee that this doesn't fail.
- * TODO: have a better way to handle failure here
- */
- err = set_cpus_allowed_ptr(tsk, cpus_attach);
- WARN_ON_ONCE(err);
+ return security_task_setscheduler(task);
+}
- cpuset_change_task_nodemask(tsk, to);
- cpuset_update_task_spread_flag(cs, tsk);
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in can_attach, and they must
+ * persist until attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct task_struct *task;
+ int ret;
+
+ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ return -ENOSPC;
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ /*
+ * Kthreads bound to specific cpus cannot be moved to a new
+ * cpuset; we cannot change their cpu affinity and
+ * isolating such threads by their set of allowed nodes is
+ * unnecessary. Thus, cpusets are not applicable for such
+ * threads. This prevents checking for success of
+ * set_cpus_allowed_ptr() on all attached tasks before
+ * cpus_allowed may be changed.
+ */
+ if (task->flags & PF_THREAD_BOUND)
+ return -EINVAL;
+ if ((ret = security_task_setscheduler(task)))
+ return ret;
+ }
+
+ /* prepare for attach */
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+
+ return 0;
}
-static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk,
- bool threadgroup)
+static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
struct mm_struct *mm;
- struct cpuset *cs = cgroup_cs(cont);
- struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
- NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
+ struct task_struct *task;
+ struct task_struct *leader = cgroup_taskset_first(tset);
+ struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *oldcs = cgroup_cs(oldcgrp);
- if (from == NULL || to == NULL)
- goto alloc_fail;
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
- if (cs == &top_cpuset) {
- cpumask_copy(cpus_attach, cpu_possible_mask);
- } else {
- guarantee_online_cpus(cs, cpus_attach);
- }
- guarantee_online_mems(cs, to);
-
- /* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, to, cs);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, to, cs);
- }
- rcu_read_unlock();
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flag(cs, task);
}
- /* change mm; only needs to be done once even if threadgroup */
- *from = oldcs->mems_allowed;
- *to = cs->mems_allowed;
- mm = get_task_mm(tsk);
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_from = oldcs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->mems_allowed;
+ mm = get_task_mm(leader);
if (mm) {
- mpol_rebind_mm(mm, to);
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, from, to);
+ cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+ &cpuset_attach_nodemask_to);
mmput(mm);
}
-
-alloc_fail:
- NODEMASK_FREE(from);
- NODEMASK_FREE(to);
}
/* The various types of files and directories in a cpuset file system */
return -ENODEV;
trialcs = alloc_trial_cpuset(cs);
- if (!trialcs)
- return -ENOMEM;
+ if (!trialcs) {
+ retval = -ENOMEM;
+ goto out;
+ }
switch (cft->private) {
case FILE_CPULIST:
}
free_trial_cpuset(trialcs);
+out:
cgroup_unlock();
return retval;
}
* across a page fault.
*/
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
- int ret;
+ size_t count;
mutex_lock(&callback_mutex);
- ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
+ count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
mutex_unlock(&callback_mutex);
- return ret;
+ return count;
}
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
- NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
- int retval;
-
- if (mask == NULL)
- return -ENOMEM;
+ size_t count;
mutex_lock(&callback_mutex);
- *mask = cs->mems_allowed;
+ count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
mutex_unlock(&callback_mutex);
- retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
- NODEMASK_FREE(mask);
-
- return retval;
+ return count;
}
static ssize_t cpuset_common_file_read(struct cgroup *cont,
}
/*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
+ * post_clone() is called during cgroup_create() when the
+ * clone_children mount argument was specified. The cgroup
+ * can not yet have any tasks.
*
* Currently we refuse to set up the cgroup - thereby
* refusing the task to be entered, and as a result refusing
* (and likewise for mems) to the new cgroup. Called with cgroup_mutex
* held.
*/
-static void cpuset_post_clone(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
+static void cpuset_post_clone(struct cgroup *cgroup)
{
struct cgroup *parent, *child;
struct cpuset *cs, *parent_cs;
cs = cgroup_cs(cgroup);
parent_cs = cgroup_cs(parent);
+ mutex_lock(&callback_mutex);
cs->mems_allowed = parent_cs->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+ mutex_unlock(&callback_mutex);
return;
}
/*
* cpuset_create - create a cpuset
- * ss: cpuset cgroup subsystem
* cont: control group that the new cpuset will be part of
*/
-static struct cgroup_subsys_state *cpuset_create(
- struct cgroup_subsys *ss,
- struct cgroup *cont)
+static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
{
struct cpuset *cs;
struct cpuset *parent;
* will call async_rebuild_sched_domains().
*/
-static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void cpuset_destroy(struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */
list_add_tail((struct list_head *)&root->stack_list, &queue);
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;
- *oldmems = cp->mems_allowed;
+ oldmems = cp->mems_allowed;
/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, oldmems, NULL);
+ update_tasks_nodemask(cp, &oldmems, NULL);
}
}
- NODEMASK_FREE(oldmems);
}
/*
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return NOTIFY_DONE;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */
cgroup_lock();
switch (action) {
case MEM_ONLINE:
- *oldmems = top_cpuset.mems_allowed;
+ oldmems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+ update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
break;
case MEM_OFFLINE:
/*
}
cgroup_unlock();
- NODEMASK_FREE(oldmems);
return NOTIFY_OK;
}
#endif
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_map, even if this means going outside the
+ * subset of cpu_online_mask, even if this means going outside the
* tasks cpuset.
**/
mutex_unlock(&callback_mutex);
}
-int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
const struct cpuset *cs;
- int cpu;
rcu_read_lock();
cs = task_cs(tsk);
if (cs)
- cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, cs->cpus_allowed);
rcu_read_unlock();
/*
* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
* set any mask even if it is not right from task_cs() pov,
* the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things ups and set cpu_possible_mask
+ * if required.
*/
-
- cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
- if (cpu >= nr_cpu_ids) {
- /*
- * Either tsk->cpus_allowed is wrong (see above) or it
- * is actually empty. The latter case is only possible
- * if we are racing with remove_tasks_in_empty_cpuset().
- * Like above we can temporary set any mask and rely on
- * set_cpus_allowed_ptr() as synchronization point.
- */
- cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
- cpu = cpumask_any(cpu_active_mask);
- }
-
- return cpu;
}
void cpuset_init_current_mems_allowed(void)
int cpuset_mem_spread_node(void)
{
+ if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_mem_spread_rotor =
+ node_random(¤t->mems_allowed);
+
return cpuset_spread_node(¤t->cpuset_mem_spread_rotor);
}
int cpuset_slab_spread_node(void)
{
+ if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_slab_spread_rotor =
+ node_random(¤t->mems_allowed);
+
return cpuset_spread_node(¤t->cpuset_slab_spread_rotor);
}