sched: group-scheduler core
Srivatsa Vaddagiri [Mon, 15 Oct 2007 15:00:07 +0000 (17:00 +0200)]
Add interface to control cpu bandwidth allocation to task-groups.

(not yet configurable, due to missing CONFIG_CONTAINERS)

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

init/Kconfig
kernel/sched.c
kernel/sched_fair.c
kernel/sched_idletask.c
kernel/sched_rt.c

index d54d0ca..11c6762 100644 (file)
@@ -281,6 +281,15 @@ config CPUSETS
 
          Say N if unsure.
 
+config FAIR_GROUP_SCHED
+       bool "Fair group scheduler"
+       depends on EXPERIMENTAL && CONTAINERS
+       help
+         This option enables you to group tasks and control CPU resource
+         allocation to such groups.
+
+         Say N if unsure.
+
 config SYSFS_DEPRECATED
        bool "Create deprecated sysfs files"
        default y
index 4ad789d..b2688ce 100644 (file)
@@ -171,6 +171,58 @@ struct rt_prio_array {
        struct list_head queue[MAX_RT_PRIO];
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+#include <linux/container.h>
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_grp {
+       struct container_subsys_state css;
+       /* schedulable entities of this group on each cpu */
+       struct sched_entity **se;
+       /* runqueue "owned" by this group on each cpu */
+       struct cfs_rq **cfs_rq;
+       unsigned long shares;
+};
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS];
+
+/* Default task group.
+ *     Every task in the system belongs to this group at bootup.
+ */
+static struct task_grp init_task_grp =  {
+                                       .se     = init_sched_entity_p,
+                                       .cfs_rq = init_cfs_rq_p,
+                                       };
+
+/* return group to which a task belongs */
+static inline struct task_grp *task_grp(struct task_struct *p)
+{
+       return container_of(task_subsys_state(p, cpu_subsys_id),
+                               struct task_grp, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+       p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)];
+       p->se.parent = task_grp(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
        struct load_weight load;
@@ -197,6 +249,7 @@ struct cfs_rq {
         * list is used during load balance.
         */
        struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+       struct task_grp *tg;    /* group that "owns" this runqueue */
 #endif
 };
 
@@ -419,18 +472,6 @@ unsigned long long cpu_clock(int cpu)
        return now;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Change a task's ->cfs_rq if it moves across CPUs */
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-       p->se.cfs_rq = &task_rq(p)->cfs;
-}
-#else
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-}
-#endif
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)     do { } while (0)
 #endif
@@ -970,8 +1011,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 #ifdef CONFIG_SMP
        task_thread_info(p)->cpu = cpu;
-       set_task_cfs_rq(p);
 #endif
+       set_task_cfs_rq(p);
 }
 
 #ifdef CONFIG_SMP
@@ -3885,8 +3926,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        oldprio = p->prio;
        on_rq = p->se.on_rq;
-       if (on_rq)
+       if (on_rq) {
                dequeue_task(rq, p, 0);
+               if (task_running(rq, p))
+                       p->sched_class->put_prev_task(rq, p);
+       }
 
        if (rt_prio(prio))
                p->sched_class = &rt_sched_class;
@@ -3905,6 +3949,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (task_running(rq, p)) {
                        if (p->prio > oldprio)
                                resched_task(rq->curr);
+                       p->sched_class->set_curr_task(rq);
                } else {
                        check_preempt_curr(rq, p);
                }
@@ -4190,8 +4235,11 @@ recheck:
        }
        update_rq_clock(rq);
        on_rq = p->se.on_rq;
-       if (on_rq)
+       if (on_rq) {
                deactivate_task(rq, p, 0);
+               if (task_running(rq, p))
+                       p->sched_class->put_prev_task(rq, p);
+       }
        oldprio = p->prio;
        __setscheduler(rq, p, policy, param->sched_priority);
        if (on_rq) {
@@ -4204,6 +4252,7 @@ recheck:
                if (task_running(rq, p)) {
                        if (p->prio > oldprio)
                                resched_task(rq->curr);
+                       p->sched_class->set_curr_task(rq);
                } else {
                        check_preempt_curr(rq, p);
                }
@@ -6444,7 +6493,25 @@ void __init sched_init(void)
                init_cfs_rq(&rq->cfs, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-               list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+               {
+                       struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
+                       struct sched_entity *se =
+                                        &per_cpu(init_sched_entity, i);
+
+                       init_cfs_rq_p[i] = cfs_rq;
+                       init_cfs_rq(cfs_rq, rq);
+                       cfs_rq->tg = &init_task_grp;
+                       list_add(&cfs_rq->leaf_cfs_rq_list,
+                                                        &rq->leaf_cfs_rq_list);
+
+                       init_sched_entity_p[i] = se;
+                       se->cfs_rq = &rq->cfs;
+                       se->my_q = cfs_rq;
+                       se->load.weight = NICE_0_LOAD;
+                       se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+                       se->parent = NULL;
+               }
+               init_task_grp.shares = NICE_0_LOAD;
 #endif
 
                for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -6632,3 +6699,250 @@ void set_curr_task(int cpu, struct task_struct *p)
 }
 
 #endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* return corresponding task_grp object of a container */
+static inline struct task_grp *container_tg(struct container *cont)
+{
+       return container_of(container_subsys_state(cont, cpu_subsys_id),
+                                        struct task_grp, css);
+}
+
+/* allocate runqueue etc for a new task group */
+static struct container_subsys_state *
+sched_create_group(struct container_subsys *ss, struct container *cont)
+{
+       struct task_grp *tg;
+       struct cfs_rq *cfs_rq;
+       struct sched_entity *se;
+       int i;
+
+       if (!cont->parent) {
+               /* This is early initialization for the top container */
+               init_task_grp.css.container = cont;
+               return &init_task_grp.css;
+       }
+
+       /* we support only a 1-level deep hierarchical scheduler at the moment */
+       if (cont->parent->parent)
+               return ERR_PTR(-EINVAL);
+
+       tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+       if (!tg)
+               return ERR_PTR(-ENOMEM);
+
+       tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL);
+       if (!tg->cfs_rq)
+               goto err;
+       tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL);
+       if (!tg->se)
+               goto err;
+
+       for_each_possible_cpu(i) {
+               struct rq *rq = cpu_rq(i);
+
+               cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
+                                                        cpu_to_node(i));
+               if (!cfs_rq)
+                       goto err;
+
+               se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
+                                                       cpu_to_node(i));
+               if (!se)
+                       goto err;
+
+               memset(cfs_rq, 0, sizeof(struct cfs_rq));
+               memset(se, 0, sizeof(struct sched_entity));
+
+               tg->cfs_rq[i] = cfs_rq;
+               init_cfs_rq(cfs_rq, rq);
+               cfs_rq->tg = tg;
+               list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+               tg->se[i] = se;
+               se->cfs_rq = &rq->cfs;
+               se->my_q = cfs_rq;
+               se->load.weight = NICE_0_LOAD;
+               se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+               se->parent = NULL;
+       }
+
+       tg->shares = NICE_0_LOAD;
+
+       /* Bind the container to task_grp object we just created */
+       tg->css.container = cont;
+
+       return &tg->css;
+
+err:
+       for_each_possible_cpu(i) {
+               if (tg->cfs_rq && tg->cfs_rq[i])
+                       kfree(tg->cfs_rq[i]);
+               if (tg->se && tg->se[i])
+                       kfree(tg->se[i]);
+       }
+       if (tg->cfs_rq)
+               kfree(tg->cfs_rq);
+       if (tg->se)
+               kfree(tg->se);
+       if (tg)
+               kfree(tg);
+
+       return ERR_PTR(-ENOMEM);
+}
+
+
+/* destroy runqueue etc associated with a task group */
+static void sched_destroy_group(struct container_subsys *ss,
+                                       struct container *cont)
+{
+       struct task_grp *tg = container_tg(cont);
+       struct cfs_rq *cfs_rq;
+       struct sched_entity *se;
+       int i;
+
+       for_each_possible_cpu(i) {
+               cfs_rq = tg->cfs_rq[i];
+               list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+       }
+
+       /* wait for possible concurrent references to cfs_rqs to complete */
+       synchronize_sched();
+
+       /* now it should be safe to free those cfs_rqs */
+       for_each_possible_cpu(i) {
+               cfs_rq = tg->cfs_rq[i];
+               kfree(cfs_rq);
+
+               se = tg->se[i];
+               kfree(se);
+       }
+
+       kfree(tg->cfs_rq);
+       kfree(tg->se);
+       kfree(tg);
+}
+
+static int sched_can_attach(struct container_subsys *ss,
+                            struct container *cont, struct task_struct *tsk)
+{
+       /* We don't support RT-tasks being in separate groups */
+       if (tsk->sched_class != &fair_sched_class)
+               return -EINVAL;
+
+       return 0;
+}
+
+/* change task's runqueue when it moves between groups */
+static void sched_move_task(struct container_subsys *ss, struct container *cont,
+                       struct container *old_cont, struct task_struct *tsk)
+{
+       int on_rq, running;
+       unsigned long flags;
+       struct rq *rq;
+
+       rq = task_rq_lock(tsk, &flags);
+
+       if (tsk->sched_class != &fair_sched_class)
+               goto done;
+
+       update_rq_clock(rq);
+
+       running = task_running(rq, tsk);
+       on_rq = tsk->se.on_rq;
+
+       if (on_rq) {
+               dequeue_task(rq, tsk, 0);
+               if (unlikely(running))
+                       tsk->sched_class->put_prev_task(rq, tsk);
+       }
+
+       set_task_cfs_rq(tsk);
+
+       if (on_rq) {
+               enqueue_task(rq, tsk, 0);
+               if (unlikely(running))
+                       tsk->sched_class->set_curr_task(rq);
+       }
+
+done:
+       task_rq_unlock(rq, &flags);
+}
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+       struct cfs_rq *cfs_rq = se->cfs_rq;
+       struct rq *rq = cfs_rq->rq;
+       int on_rq;
+
+       spin_lock_irq(&rq->lock);
+
+       on_rq = se->on_rq;
+       if (on_rq)
+               dequeue_entity(cfs_rq, se, 0);
+
+       se->load.weight = shares;
+       se->load.inv_weight = div64_64((1ULL<<32), shares);
+
+       if (on_rq)
+               enqueue_entity(cfs_rq, se, 0);
+
+       spin_unlock_irq(&rq->lock);
+}
+
+static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype,
+                               struct file *file, const char __user *userbuf,
+                               size_t nbytes, loff_t *ppos)
+{
+       int i;
+       unsigned long shareval;
+       struct task_grp *tg = container_tg(cont);
+       char buffer[2*sizeof(unsigned long) + 1];
+
+       if (nbytes > 2*sizeof(unsigned long))   /* safety check */
+               return -E2BIG;
+
+       if (copy_from_user(buffer, userbuf, nbytes))
+               return -EFAULT;
+
+       buffer[nbytes] = 0;     /* nul-terminate */
+       shareval = simple_strtoul(buffer, NULL, 10);
+
+       tg->shares = shareval;
+       for_each_possible_cpu(i)
+               set_se_shares(tg->se[i], shareval);
+
+       return nbytes;
+}
+
+static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft)
+{
+       struct task_grp *tg = container_tg(cont);
+
+       return (u64) tg->shares;
+}
+
+struct cftype cpuctl_share = {
+       .name = "shares",
+       .read_uint = cpu_shares_read_uint,
+       .write = cpu_shares_write,
+};
+
+static int sched_populate(struct container_subsys *ss, struct container *cont)
+{
+       return container_add_file(cont, ss, &cpuctl_share);
+}
+
+struct container_subsys cpu_subsys = {
+       .name = "cpu",
+       .create = sched_create_group,
+       .destroy  = sched_destroy_group,
+       .can_attach = sched_can_attach,
+       .attach = sched_move_task,
+       .populate = sched_populate,
+       .subsys_id = cpu_subsys_id,
+       .early_init = 1,
+};
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
index ec445ca..12ab933 100644 (file)
@@ -610,8 +610,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  */
 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 {
-       /* A later patch will take group into account */
-       return &cpu_rq(this_cpu)->cfs;
+       return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
index 3503fb2..5ebf829 100644 (file)
@@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
 {
 }
 
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
@@ -66,6 +70,7 @@ static struct sched_class idle_sched_class __read_mostly = {
 
        .load_balance           = load_balance_idle,
 
+       .set_curr_task          = set_curr_task_idle,
        .task_tick              = task_tick_idle,
        /* no .task_new for idle tasks */
 };
index 4b87476..45b339f 100644 (file)
@@ -218,6 +218,10 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
        }
 }
 
+static void set_curr_task_rt(struct rq *rq)
+{
+}
+
 static struct sched_class rt_sched_class __read_mostly = {
        .enqueue_task           = enqueue_task_rt,
        .dequeue_task           = dequeue_task_rt,
@@ -230,5 +234,6 @@ static struct sched_class rt_sched_class __read_mostly = {
 
        .load_balance           = load_balance_rt,
 
+       .set_curr_task          = set_curr_task_rt,
        .task_tick              = task_tick_rt,
 };