Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-2.6.git] / kernel / sched.c
index 22c532a..fff1c4a 100644 (file)
  */
 #define RUNTIME_INF    ((u64)~0ULL)
 
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
 #ifdef CONFIG_SMP
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -260,6 +266,10 @@ struct task_group {
        struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+       uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
@@ -285,6 +295,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+       user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  *     Every UID task group (including init_task_group aka UID-0) will
@@ -344,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p)
        struct task_group *tg;
 
 #ifdef CONFIG_USER_SCHED
-       tg = p->user->tg;
+       rcu_read_lock();
+       tg = __task_cred(p)->user->tg;
+       rcu_read_unlock();
 #elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
@@ -585,6 +603,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
        /* latency stats */
        struct sched_info rq_sched_info;
+       unsigned long long rq_cpu_time;
+       /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
        /* sys_sched_yield() stats */
        unsigned int yld_exp_empty;
@@ -702,45 +722,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-       filp->private_data = inode->i_private;
-       return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-               size_t cnt, loff_t *ppos)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-       char *buf;
-       int r = 0;
-       int len = 0;
        int i;
 
        for (i = 0; sched_feat_names[i]; i++) {
-               len += strlen(sched_feat_names[i]);
-               len += 4;
-       }
-
-       buf = kmalloc(len + 2, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-
-       for (i = 0; sched_feat_names[i]; i++) {
-               if (sysctl_sched_features & (1UL << i))
-                       r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-               else
-                       r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
        }
+       seq_puts(m, "\n");
 
-       r += sprintf(buf + r, "\n");
-       WARN_ON(r >= len + 2);
-
-       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-       kfree(buf);
-
-       return r;
+       return 0;
 }
 
 static ssize_t
@@ -785,10 +778,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
-       .open   = sched_feat_open,
-       .read   = sched_feat_read,
-       .write  = sched_feat_write,
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
 };
 
 static __init int sched_init_debug(void)
@@ -1472,27 +1472,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-       int boost = 0;
        unsigned long shares;
        unsigned long rq_weight;
 
        if (!tg->se[cpu])
                return;
 
-       rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-       /*
-        * If there are currently no tasks on the cpu pretend there is one of
-        * average load so that when a new task gets to run here it will not
-        * get delayed by group starvation.
-        */
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-
-       if (unlikely(rq_weight > sd_rq_weight))
-               rq_weight = sd_rq_weight;
+       rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
        /*
         *           \Sum shares * rq_weight
@@ -1500,7 +1486,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
         *               \Sum rq_weight
         *
         */
-       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
        if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1509,11 +1495,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                unsigned long flags;
 
                spin_lock_irqsave(&rq->lock, flags);
-               /*
-                * record the actual number of shares, not the boosted amount.
-                */
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               tg->cfs_rq[cpu]->rq_weight = rq_weight;
+               tg->cfs_rq[cpu]->shares = shares;
 
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
@@ -1527,13 +1509,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-       unsigned long rq_weight = 0;
+       unsigned long weight, rq_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
 
        for_each_cpu_mask(i, sd->span) {
-               rq_weight += tg->cfs_rq[i]->load.weight;
+               /*
+                * If there are currently no tasks on the cpu pretend there
+                * is one of average load so that when a new task gets to
+                * run here it will not get delayed by group starvation.
+                */
+               weight = tg->cfs_rq[i]->load.weight;
+               if (!weight)
+                       weight = NICE_0_LOAD;
+
+               tg->cfs_rq[i]->rq_weight = weight;
+               rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
 
@@ -1543,9 +1535,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
 
-       if (!rq_weight)
-               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
        for_each_cpu_mask(i, sd->span)
                update_group_shares_cpu(tg, i, shares, rq_weight);
 
@@ -1610,6 +1599,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       int ret = 0;
+
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work good under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       if (unlikely(!spin_trylock(&busiest->lock))) {
+               if (busiest < this_rq) {
+                       spin_unlock(&this_rq->lock);
+                       spin_lock(&busiest->lock);
+                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                       ret = 1;
+               } else
+                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+       }
+       return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(busiest->lock)
+{
+       spin_unlock(&busiest->lock);
+       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1843,6 +1865,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
        clock_offset = old_rq->clock - new_rq->clock;
 
+       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
 #ifdef CONFIG_SCHEDSTATS
        if (p->se.wait_start)
                p->se.wait_start -= clock_offset;
@@ -2252,6 +2276,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
        smp_wmb();
        rq = task_rq_lock(p, &flags);
+       update_rq_clock(rq);
        old_state = p->state;
        if (!(old_state & state))
                goto out;
@@ -2309,12 +2334,11 @@ out_activate:
                schedstat_inc(p, se.nr_wakeups_local);
        else
                schedstat_inc(p, se.nr_wakeups_remote);
-       update_rq_clock(rq);
        activate_task(rq, p, 1);
        success = 1;
 
 out_running:
-       trace_sched_wakeup(rq, p);
+       trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
 
        p->state = TASK_RUNNING;
@@ -2447,7 +2471,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
-       trace_sched_wakeup_new(rq, p);
+       trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@ -2810,40 +2834,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 }
 
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
-{
-       int ret = 0;
-
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
-       if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
-                       spin_unlock(&this_rq->lock);
-                       spin_lock(&busiest->lock);
-                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-                       ret = 1;
-               } else
-                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-       }
-       return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(busiest->lock)
-{
-       spin_unlock(&busiest->lock);
-       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2860,7 +2850,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
 
-       trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@ -3705,7 +3694,7 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
        struct sched_domain *sd;
-       int pulled_task = -1;
+       int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
        cpumask_t tmpmask;
 
@@ -4201,7 +4190,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
-               account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
@@ -4337,7 +4325,7 @@ void __kprobes sub_preempt_count(int val)
        /*
         * Underflow?
         */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
                return;
        /*
         * Is the spinlock portion underflowing?
@@ -5132,6 +5120,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
        set_load_weight(p);
 }
 
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+       const struct cred *cred = current_cred(), *pcred;
+       bool match;
+
+       rcu_read_lock();
+       pcred = __task_cred(p);
+       match = (cred->euid == pcred->euid ||
+                cred->euid == pcred->uid);
+       rcu_read_unlock();
+       return match;
+}
+
 static int __sched_setscheduler(struct task_struct *p, int policy,
                                struct sched_param *param, bool user)
 {
@@ -5191,8 +5195,7 @@ recheck:
                        return -EPERM;
 
                /* can't change other user's priorities */
-               if ((current->euid != p->euid) &&
-                   (current->euid != p->uid))
+               if (!check_same_owner(p))
                        return -EPERM;
        }
 
@@ -5424,8 +5427,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
        read_unlock(&tasklist_lock);
 
        retval = -EPERM;
-       if ((current->euid != p->euid) && (current->euid != p->uid) &&
-                       !capable(CAP_SYS_NICE))
+       if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
 
        retval = security_task_setscheduler(p, 0, NULL);
@@ -5894,6 +5896,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
+       ftrace_graph_init_task(idle);
 }
 
 /*
@@ -6124,7 +6127,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6636,28 +6638,6 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-       switch (lvl) {
-       case SD_LV_NONE:
-                       return "NONE";
-       case SD_LV_SIBLING:
-                       return "SIBLING";
-       case SD_LV_MC:
-                       return "MC";
-       case SD_LV_CPU:
-                       return "CPU";
-       case SD_LV_NODE:
-                       return "NODE";
-       case SD_LV_ALLNODES:
-                       return "ALLNODES";
-       case SD_LV_MAX:
-                       return "MAX";
-
-       }
-       return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  cpumask_t *groupmask)
 {
@@ -6677,8 +6657,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                return -1;
        }
 
-       printk(KERN_CONT "span %s level %s\n",
-               str, sd_level_to_string(sd->level));
+       printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
        if (!cpu_isset(cpu, sd->span)) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6814,6 +6793,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_BALANCE_EXEC |
                                SD_SHARE_CPUPOWER |
                                SD_SHARE_PKG_RESOURCES);
+               if (nr_node_ids == 1)
+                       pflags &= ~SD_SERIALIZE;
        }
        if (~cflags & pflags)
                return 0;
@@ -7334,13 +7315,21 @@ struct allmasks {
 };
 
 #if    NR_CPUS > 128
-#define        SCHED_CPUMASK_ALLOC             1
-#define        SCHED_CPUMASK_FREE(v)           kfree(v)
-#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v)       struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+       *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+       kfree(masks);
+}
 #else
-#define        SCHED_CPUMASK_ALLOC             0
-#define        SCHED_CPUMASK_FREE(v)
-#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v)       struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 
 #define        SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
@@ -7416,9 +7405,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                return -ENOMEM;
        }
 
-#if SCHED_CPUMASK_ALLOC
        /* get space for all scratch cpumask variables */
-       allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+       sched_cpumask_alloc(&allmasks);
        if (!allmasks) {
                printk(KERN_WARNING "Cannot alloc cpumask array\n");
                kfree(rd);
@@ -7427,7 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
                return -ENOMEM;
        }
-#endif
+
        tmpmask = (cpumask_t *)allmasks;
 
 
@@ -7681,13 +7669,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                cpu_attach_domain(sd, rd, i);
        }
 
-       SCHED_CPUMASK_FREE((void *)allmasks);
+       sched_cpumask_free(allmasks);
        return 0;
 
 #ifdef CONFIG_NUMA
 error:
        free_sched_groups(cpu_map, tmpmask);
-       SCHED_CPUMASK_FREE((void *)allmasks);
+       sched_cpumask_free(allmasks);
        kfree(rd);
        return -ENOMEM;
 #endif
@@ -7710,8 +7698,14 @@ static struct sched_domain_attr *dattr_cur;
  */
 static cpumask_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+       return 0;
 }
 
 /*
@@ -7751,8 +7745,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
        cpumask_t tmpmask;
        int i;
 
-       unregister_sched_domain_sysctl();
-
        for_each_cpu_mask_nr(i, *cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
@@ -7805,17 +7797,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
                             struct sched_domain_attr *dattr_new)
 {
        int i, j, n;
+       int new_topology;
 
        mutex_lock(&sched_domains_mutex);
 
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
 
+       /* Let architecture update cpu core mappings. */
+       new_topology = arch_update_cpu_topology();
+
        n = doms_new ? ndoms_new : 0;
 
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
-               for (j = 0; j < n; j++) {
+               for (j = 0; j < n && !new_topology; j++) {
                        if (cpus_equal(doms_cur[i], doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
@@ -7830,12 +7826,12 @@ match1:
                ndoms_cur = 0;
                doms_new = &fallback_doms;
                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-               dattr_new = NULL;
+               WARN_ON_ONCE(dattr_new);
        }
 
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
-               for (j = 0; j < ndoms_cur; j++) {
+               for (j = 0; j < ndoms_cur && !new_topology; j++) {
                        if (cpus_equal(doms_new[i], doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
@@ -8490,7 +8486,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct cfs_rq *cfs_rq;
-       struct sched_entity *se, *parent_se;
+       struct sched_entity *se;
        struct rq *rq;
        int i;
 
@@ -8506,18 +8502,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
 
-               cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+                                     GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
                        goto err;
 
-               se = kmalloc_node(sizeof(struct sched_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               se = kzalloc_node(sizeof(struct sched_entity),
+                                 GFP_KERNEL, cpu_to_node(i));
                if (!se)
                        goto err;
 
-               parent_se = parent ? parent->se[i] : NULL;
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
 
        return 1;
@@ -8578,7 +8573,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct rt_rq *rt_rq;
-       struct sched_rt_entity *rt_se, *parent_se;
+       struct sched_rt_entity *rt_se;
        struct rq *rq;
        int i;
 
@@ -8595,18 +8590,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
 
-               rt_rq = kmalloc_node(sizeof(struct rt_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_rq = kzalloc_node(sizeof(struct rt_rq),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
                        goto err;
 
-               rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
                        goto err;
 
-               parent_se = parent ? parent->rt_se[i] : NULL;
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
 
        return 1;
@@ -9249,11 +9243,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
+       struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9287,6 +9282,9 @@ static struct cgroup_subsys_state *cpuacct_create(
                return ERR_PTR(-ENOMEM);
        }
 
+       if (cgrp->parent)
+               ca->parent = cgroup_ca(cgrp->parent);
+
        return &ca->css;
 }
 
@@ -9300,6 +9298,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
        kfree(ca);
 }
 
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 data;
+
+#ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       data = *cpuusage;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+       data = *cpuusage;
+#endif
+
+       return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       *cpuusage = val;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+       *cpuusage = val;
+#endif
+}
+
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
@@ -9307,17 +9340,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
        u64 totalcpuusage = 0;
        int i;
 
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-               /*
-                * Take rq->lock to make 64-bit addition safe on 32-bit
-                * platforms.
-                */
-               spin_lock_irq(&cpu_rq(i)->lock);
-               totalcpuusage += *cpuusage;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
+       for_each_present_cpu(i)
+               totalcpuusage += cpuacct_cpuusage_read(ca, i);
 
        return totalcpuusage;
 }
@@ -9334,23 +9358,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
                goto out;
        }
 
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+       for_each_present_cpu(i)
+               cpuacct_cpuusage_write(ca, i, 0);
 
-               spin_lock_irq(&cpu_rq(i)->lock);
-               *cpuusage = 0;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
 out:
        return err;
 }
 
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+                                  struct seq_file *m)
+{
+       struct cpuacct *ca = cgroup_ca(cgroup);
+       u64 percpu;
+       int i;
+
+       for_each_present_cpu(i) {
+               percpu = cpuacct_cpuusage_read(ca, i);
+               seq_printf(m, "%llu ", (unsigned long long) percpu);
+       }
+       seq_printf(m, "\n");
+       return 0;
+}
+
 static struct cftype files[] = {
        {
                .name = "usage",
                .read_u64 = cpuusage_read,
                .write_u64 = cpuusage_write,
        },
+       {
+               .name = "usage_percpu",
+               .read_seq_string = cpuacct_percpu_seq_read,
+       },
+
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9366,14 +9406,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
        struct cpuacct *ca;
+       int cpu;
 
        if (!cpuacct_subsys.active)
                return;
 
+       cpu = task_cpu(tsk);
        ca = task_ca(tsk);
-       if (ca) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+       for (; ca; ca = ca->parent) {
+               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
 }