Merge commit 'v2.6.29-rc7' into perfcounters/core
[linux-2.6.git] / kernel / sched.c
index b309027..78f4424 100644 (file)
@@ -125,6 +125,9 @@ DEFINE_TRACE(sched_switch);
 DEFINE_TRACE(sched_migrate_task);
 
 #ifdef CONFIG_SMP
+
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
  * Since cpu_power is a 'constant', we can use a reciprocal divide.
@@ -209,7 +212,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
-       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
 static inline int rt_bandwidth_enabled(void)
@@ -221,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
        ktime_t now;
 
-       if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+       if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;
 
        if (hrtimer_active(&rt_b->rt_period_timer))
@@ -361,7 +363,9 @@ static inline struct task_group *task_group(struct task_struct *p)
        struct task_group *tg;
 
 #ifdef CONFIG_USER_SCHED
-       tg = p->user->tg;
+       rcu_read_lock();
+       tg = __task_cred(p)->user->tg;
+       rcu_read_unlock();
 #elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
@@ -509,6 +513,14 @@ struct root_domain {
 #ifdef CONFIG_SMP
        struct cpupri cpupri;
 #endif
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       /*
+        * Preferred wake up cpu nominated by sched_mc balance that will be
+        * used when most cpus are idle in the system indicating overall very
+        * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
+        */
+       unsigned int sched_mc_preferred_wakeup_cpu;
+#endif
 };
 
 /*
@@ -546,6 +558,7 @@ struct rq {
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;
+       u64 nr_migrations_in;
 
        struct cfs_rq cfs;
        struct rt_rq rt;
@@ -602,6 +615,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
        /* latency stats */
        struct sched_info rq_sched_info;
+       unsigned long long rq_cpu_time;
+       /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
        /* sys_sched_yield() stats */
        unsigned int yld_exp_empty;
@@ -654,7 +669,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
        rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -965,6 +980,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        }
 }
 
+void curr_rq_lock_irq_save(unsigned long *flags)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       local_irq_save(*flags);
+       rq = cpu_rq(smp_processor_id());
+       spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+       __releases(rq->lock)
+{
+       struct rq *rq;
+
+       rq = cpu_rq(smp_processor_id());
+       spin_unlock(&rq->lock);
+       local_irq_restore(*flags);
+}
+
 void task_rq_unlock_wait(struct task_struct *p)
 {
        struct rq *rq = task_rq(p);
@@ -1135,7 +1170,6 @@ static void init_rq_hrtick(struct rq *rq)
 
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rq->hrtick_timer.function = hrtick;
-       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
 #else  /* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
@@ -1310,8 +1344,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  * slice expiry etc.
  */
 
-#define WEIGHT_IDLEPRIO                2
-#define WMULT_IDLEPRIO         (1 << 31)
+#define WEIGHT_IDLEPRIO                3
+#define WMULT_IDLEPRIO         1431655765
 
 /*
  * Nice levels are multiplicative, with a gentle 10% change for every
@@ -1863,6 +1897,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
        clock_offset = old_rq->clock - new_rq->clock;
 
+       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
 #ifdef CONFIG_SCHEDSTATS
        if (p->se.wait_start)
                p->se.wait_start -= clock_offset;
@@ -1870,12 +1906,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                p->se.sleep_start -= clock_offset;
        if (p->se.block_start)
                p->se.block_start -= clock_offset;
+#endif
        if (old_cpu != new_cpu) {
-               schedstat_inc(p, se.nr_migrations);
+               p->se.nr_migrations++;
+               new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
                if (task_hot(p, old_rq->clock, NULL))
                        schedstat_inc(p, se.nr_forced2_migrations);
-       }
 #endif
+       }
        p->se.vruntime -= old_cfsrq->min_vruntime -
                                         new_cfsrq->min_vruntime;
 
@@ -2227,6 +2266,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:         the task to evaluate
+ * @func:      the function to be called
+ * @info:      the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+                             void (*func) (void *info), void *info)
+{
+       int cpu;
+
+       preempt_disable();
+       cpu = task_cpu(p);
+       if (task_curr(p))
+               smp_call_function_single(cpu, func, info, 1);
+       preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2269,6 +2329,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
        smp_wmb();
        rq = task_rq_lock(p, &flags);
+       update_rq_clock(rq);
        old_state = p->state;
        if (!(old_state & state))
                goto out;
@@ -2326,12 +2387,11 @@ out_activate:
                schedstat_inc(p, se.nr_wakeups_local);
        else
                schedstat_inc(p, se.nr_wakeups_remote);
-       update_rq_clock(rq);
        activate_task(rq, p, 1);
        success = 1;
 
 out_running:
-       trace_sched_wakeup(rq, p);
+       trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
 
        p->state = TASK_RUNNING;
@@ -2369,6 +2429,7 @@ static void __sched_fork(struct task_struct *p)
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
+       p->se.nr_migrations             = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
 
@@ -2464,7 +2525,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
-       trace_sched_wakeup_new(rq, p);
+       trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@ -2589,6 +2650,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
+       perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
        if (current->sched_class->post_schedule)
@@ -2751,6 +2813,21 @@ unsigned long nr_active(void)
 }
 
 /*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_switches(cpu) - number of context switches on that cpu
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_switches(int cpu)
+{
+       return cpu_rq(cpu)->nr_switches;
+}
+
+u64 cpu_nr_migrations(int cpu)
+{
+       return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
@@ -2843,7 +2920,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
 
-       trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@ -3241,7 +3317,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
-                    cpumask_first(sched_group_cpus(group)) <
+                    cpumask_first(sched_group_cpus(group)) >
                     cpumask_first(sched_group_cpus(group_min)))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
@@ -3257,7 +3333,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
-                            cpumask_first(sched_group_cpus(group)) >
+                            cpumask_first(sched_group_cpus(group)) <
                             cpumask_first(sched_group_cpus(group_leader)))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
@@ -3384,6 +3460,10 @@ out_balanced:
 
        if (this == group_leader && group_leader != group_min) {
                *imbalance = min_load_per_task;
+               if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+                       cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                               cpumask_first(sched_group_cpus(group_leader));
+               }
                return group_min;
        }
 #endif
@@ -3658,10 +3738,69 @@ redo:
        }
 
        if (!ld_moved) {
+               int active_balance = 0;
+
                schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                        return -1;
+
+               if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                       return -1;
+
+               if (sd->nr_balance_failed++ < 2)
+                       return -1;
+
+               /*
+                * The only task running in a non-idle cpu can be moved to this
+                * cpu in an attempt to completely freeup the other CPU
+                * package. The same method used to move task in load_balance()
+                * have been extended for load_balance_newidle() to speedup
+                * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
+                *
+                * The package power saving logic comes from
+                * find_busiest_group().  If there are no imbalance, then
+                * f_b_g() will return NULL.  However when sched_mc={1,2} then
+                * f_b_g() will select a group from which a running task may be
+                * pulled to this cpu in order to make the other package idle.
+                * If there is no opportunity to make a package idle and if
+                * there are no imbalance, then f_b_g() will return NULL and no
+                * action will be taken in load_balance_newidle().
+                *
+                * Under normal task pull operation due to imbalance, there
+                * will be more than one task in the source run queue and
+                * move_tasks() will succeed.  ld_moved will be true and this
+                * active balance code will not be triggered.
+                */
+
+               /* Lock busiest in correct order while this_rq is held */
+               double_lock_balance(this_rq, busiest);
+
+               /*
+                * don't kick the migration_thread, if the curr
+                * task on busiest cpu can't be moved to this_cpu
+                */
+               if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+                       double_unlock_balance(this_rq, busiest);
+                       all_pinned = 1;
+                       return ld_moved;
+               }
+
+               if (!busiest->active_balance) {
+                       busiest->active_balance = 1;
+                       busiest->push_cpu = this_cpu;
+                       active_balance = 1;
+               }
+
+               double_unlock_balance(this_rq, busiest);
+               /*
+                * Should not call ttwu while holding a rq->lock
+                */
+               spin_unlock(&this_rq->lock);
+               if (active_balance)
+                       wake_up_process(busiest->migration_thread);
+               spin_lock(&this_rq->lock);
+
        } else
                sd->nr_balance_failed = 0;
 
@@ -3803,19 +3942,24 @@ int select_nohz_load_balancer(int stop_tick)
        int cpu = smp_processor_id();
 
        if (stop_tick) {
-               cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
 
-               /*
-                * If we are going offline and still the leader, give up!
-                */
-               if (!cpu_active(cpu) &&
-                   atomic_read(&nohz.load_balancer) == cpu) {
+               if (!cpu_active(cpu)) {
+                       if (atomic_read(&nohz.load_balancer) != cpu)
+                               return 0;
+
+                       /*
+                        * If we are going offline and still the leader,
+                        * give up!
+                        */
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                BUG();
+
                        return 0;
                }
 
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
+
                /* time for ilb owner also to sleep */
                if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
@@ -4055,6 +4199,29 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
  */
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+       s64 delta_exec;
+       struct rq *rq;
+
+       rq = task_rq(p);
+       WARN_ON_ONCE(!runqueue_is_locked());
+       WARN_ON_ONCE(!task_current(rq, p));
+
+       if (update)
+               update_rq_clock(rq);
+
+       delta_exec = rq->clock - p->se.exec_start;
+
+       WARN_ON_ONCE(delta_exec < 0);
+
+       return delta_exec;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
 unsigned long long task_delta_exec(struct task_struct *p)
 {
        unsigned long flags;
@@ -4081,13 +4248,17 @@ unsigned long long task_delta_exec(struct task_struct *p)
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
  */
-void account_user_time(struct task_struct *p, cputime_t cputime)
+void account_user_time(struct task_struct *p, cputime_t cputime,
+                      cputime_t cputime_scaled)
 {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
        cputime64_t tmp;
 
+       /* Add user time to process. */
        p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
 
        /* Add user time to cpustat. */
@@ -4104,51 +4275,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
  * Account guest cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
  */
-static void account_guest_time(struct task_struct *p, cputime_t cputime)
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+                              cputime_t cputime_scaled)
 {
        cputime64_t tmp;
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
        tmp = cputime_to_cputime64(cputime);
 
+       /* Add guest time to process. */
        p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
 
+       /* Add guest time to cpustat. */
        cpustat->user = cputime64_add(cpustat->user, tmp);
        cpustat->guest = cputime64_add(cpustat->guest, tmp);
 }
 
 /*
- * Account scaled user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- */
-void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
-{
-       p->utimescaled = cputime_add(p->utimescaled, cputime);
-}
-
-/*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
  * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
  */
 void account_system_time(struct task_struct *p, int hardirq_offset,
-                        cputime_t cputime)
+                        cputime_t cputime, cputime_t cputime_scaled)
 {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       struct rq *rq = this_rq();
        cputime64_t tmp;
 
        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-               account_guest_time(p, cputime);
+               account_guest_time(p, cputime, cputime_scaled);
                return;
        }
 
+       /* Add system time to process. */
        p->stime = cputime_add(p->stime, cputime);
+       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
        account_group_system_time(p, cputime);
 
        /* Add system time to cpustat. */
@@ -4157,49 +4325,85 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
        else if (softirq_count())
                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-       else if (p != rq->idle)
-               cpustat->system = cputime64_add(cpustat->system, tmp);
-       else if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
        else
-               cpustat->idle = cputime64_add(cpustat->idle, tmp);
+               cpustat->system = cputime64_add(cpustat->system, tmp);
+
        /* Account for system time used */
        acct_update_integrals(p);
 }
 
 /*
- * Account scaled system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
+ * Account for involuntary wait time.
+ * @steal: the cpu time spent in involuntary wait
  */
-void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
+void account_steal_time(cputime_t cputime)
 {
-       p->stimescaled = cputime_add(p->stimescaled, cputime);
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
+
+       cpustat->steal = cputime64_add(cpustat->steal, cputime64);
 }
 
 /*
- * Account for involuntary wait time.
- * @p: the process from which the cpu time has been stolen
- * @steal: the cpu time spent in involuntary wait
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
  */
-void account_steal_time(struct task_struct *p, cputime_t steal)
+void account_idle_time(cputime_t cputime)
 {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp = cputime_to_cputime64(steal);
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
        struct rq *rq = this_rq();
 
-       if (p == rq->idle) {
-               p->stime = cputime_add(p->stime, steal);
-               if (atomic_read(&rq->nr_iowait) > 0)
-                       cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-               else
-                       cpustat->idle = cputime64_add(cpustat->idle, tmp);
-       } else
-               cpustat->steal = cputime64_add(cpustat->steal, tmp);
+       if (atomic_read(&rq->nr_iowait) > 0)
+               cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+       else
+               cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+       cputime_t one_jiffy = jiffies_to_cputime(1);
+       cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
+       struct rq *rq = this_rq();
+
+       if (user_tick)
+               account_user_time(p, one_jiffy, one_jiffy_scaled);
+       else if (p != rq->idle)
+               account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+                                   one_jiffy_scaled);
+       else
+               account_idle_time(one_jiffy);
 }
 
 /*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+       account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+       account_idle_time(jiffies_to_cputime(ticks));
+}
+
+#endif
+
+/*
  * Use precise platform statistics if available:
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
@@ -4277,6 +4481,7 @@ void scheduler_tick(void)
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
+       perf_counter_task_tick(curr, cpu);
        spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -4326,7 +4531,7 @@ void __kprobes sub_preempt_count(int val)
        /*
         * Underflow?
         */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
+       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
                return;
        /*
         * Is the spinlock portion underflowing?
@@ -4472,6 +4677,7 @@ need_resched_nonpreemptible:
 
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
+               perf_counter_task_sched_out(prev, cpu);
 
                rq->nr_switches++;
                rq->curr = next;
@@ -4573,8 +4779,8 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-                            int nr_exclusive, int sync, void *key)
+void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+                       int nr_exclusive, int sync, void *key)
 {
        wait_queue_t *curr, *next;
 
@@ -5012,7 +5218,7 @@ int can_nice(const struct task_struct *p, const int nice)
  * sys_setpriority is a more generic, but much slower function that
  * does similar things.
  */
-asmlinkage long sys_nice(int increment)
+SYSCALL_DEFINE1(nice, int, increment)
 {
        long nice, retval;
 
@@ -5121,6 +5327,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
        set_load_weight(p);
 }
 
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+       const struct cred *cred = current_cred(), *pcred;
+       bool match;
+
+       rcu_read_lock();
+       pcred = __task_cred(p);
+       match = (cred->euid == pcred->euid ||
+                cred->euid == pcred->uid);
+       rcu_read_unlock();
+       return match;
+}
+
 static int __sched_setscheduler(struct task_struct *p, int policy,
                                struct sched_param *param, bool user)
 {
@@ -5180,8 +5402,7 @@ recheck:
                        return -EPERM;
 
                /* can't change other user's priorities */
-               if ((current->euid != p->euid) &&
-                   (current->euid != p->uid))
+               if (!check_same_owner(p))
                        return -EPERM;
        }
 
@@ -5304,8 +5525,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long
-sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+               struct sched_param __user *, param)
 {
        /* negative values for policy are not valid */
        if (policy < 0)
@@ -5319,7 +5540,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @pid: the pid in question.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
        return do_sched_setscheduler(pid, -1, param);
 }
@@ -5328,7 +5549,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  */
-asmlinkage long sys_sched_getscheduler(pid_t pid)
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 {
        struct task_struct *p;
        int retval;
@@ -5353,7 +5574,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
-asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
        struct sched_param lp;
        struct task_struct *p;
@@ -5420,8 +5641,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
                goto out_free_cpus_allowed;
        }
        retval = -EPERM;
-       if ((current->euid != p->euid) && (current->euid != p->uid) &&
-                       !capable(CAP_SYS_NICE))
+       if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
 
        retval = security_task_setscheduler(p, 0, NULL);
@@ -5472,8 +5692,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to the new cpu mask
  */
-asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
-                                     unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+               unsigned long __user *, user_mask_ptr)
 {
        cpumask_var_t new_mask;
        int retval;
@@ -5520,8 +5740,8 @@ out_unlock:
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
  */
-asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
-                                     unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+               unsigned long __user *, user_mask_ptr)
 {
        int ret;
        cpumask_var_t mask;
@@ -5550,7 +5770,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
  * This function yields the current CPU to other tasks. If there are no
  * other threads running on this CPU then this function will return.
  */
-asmlinkage long sys_sched_yield(void)
+SYSCALL_DEFINE0(sched_yield)
 {
        struct rq *rq = this_rq_lock();
 
@@ -5691,7 +5911,7 @@ long __sched io_schedule_timeout(long timeout)
  * this syscall returns the maximum rt_priority that can be used
  * by a given scheduling class.
  */
-asmlinkage long sys_sched_get_priority_max(int policy)
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 {
        int ret = -EINVAL;
 
@@ -5716,7 +5936,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
  * this syscall returns the minimum rt_priority that can be used
  * by a given scheduling class.
  */
-asmlinkage long sys_sched_get_priority_min(int policy)
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 {
        int ret = -EINVAL;
 
@@ -5741,8 +5961,8 @@ asmlinkage long sys_sched_get_priority_min(int policy)
  * this syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity.
  */
-asmlinkage
-long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+               struct timespec __user *, interval)
 {
        struct task_struct *p;
        unsigned int time_slice;
@@ -5811,12 +6031,7 @@ void sched_show_task(struct task_struct *p)
                printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
-       {
-               unsigned long *n = end_of_stack(p);
-               while (!*n)
-                       n++;
-               free = (unsigned long)n - (unsigned long)end_of_stack(p);
-       }
+       free = stack_not_used(p);
 #endif
        printk(KERN_CONT "%5lu %5d %6d\n", free,
                task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -6137,9 +6352,7 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
        int dest_cpu;
-       /* FIXME: Use cpumask_of_node here. */
-       cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
-       const struct cpumask *nodemask = &_nodemask;
+       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
 
 again:
        /* Look for allowed, online CPU in same node. */
@@ -6813,20 +7026,26 @@ static void free_rootdomain(struct root_domain *rd)
 
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
+       struct root_domain *old_rd = NULL;
        unsigned long flags;
 
        spin_lock_irqsave(&rq->lock, flags);
 
        if (rq->rd) {
-               struct root_domain *old_rd = rq->rd;
+               old_rd = rq->rd;
 
                if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
 
                cpumask_clear_cpu(rq->cpu, old_rd->span);
 
-               if (atomic_dec_and_test(&old_rd->refcount))
-                       free_rootdomain(old_rd);
+               /*
+                * If we dont want to free the old_rt yet then
+                * set old_rd to NULL to skip the freeing later
+                * in this function:
+                */
+               if (!atomic_dec_and_test(&old_rd->refcount))
+                       old_rd = NULL;
        }
 
        atomic_inc(&rd->refcount);
@@ -6837,9 +7056,12 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                set_rq_online(rq);
 
        spin_unlock_irqrestore(&rq->lock, flags);
+
+       if (old_rd)
+               free_rootdomain(old_rd);
 }
 
-static int init_rootdomain(struct root_domain *rd, bool bootmem)
+static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 {
        memset(rd, 0, sizeof(*rd));
 
@@ -6852,7 +7074,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem)
        }
 
        if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
-               goto free_rd;
+               goto out;
        if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
                goto free_span;
        if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
@@ -6868,8 +7090,7 @@ free_online:
        free_cpumask_var(rd->online);
 free_span:
        free_cpumask_var(rd->span);
-free_rd:
-       kfree(rd);
+out:
        return -ENOMEM;
 }
 
@@ -7050,21 +7271,18 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
 static void sched_domain_node_span(int node, struct cpumask *span)
 {
        nodemask_t used_nodes;
-       /* FIXME: use cpumask_of_node() */
-       node_to_cpumask_ptr(nodemask, node);
        int i;
 
-       cpus_clear(*span);
+       cpumask_clear(span);
        nodes_clear(used_nodes);
 
-       cpus_or(*span, *span, *nodemask);
+       cpumask_or(span, span, cpumask_of_node(node));
        node_set(node, used_nodes);
 
        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, &used_nodes);
 
-               node_to_cpumask_ptr_next(nodemask, next_node);
-               cpus_or(*span, *span, *nodemask);
+               cpumask_or(span, span, cpumask_of_node(next_node));
        }
 }
 #endif /* CONFIG_NUMA */
@@ -7144,9 +7362,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 {
        int group;
 #ifdef CONFIG_SCHED_MC
-       /* FIXME: Use cpu_coregroup_mask. */
-       *mask = cpu_coregroup_map(cpu);
-       cpus_and(*mask, *mask, *cpu_map);
+       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
        group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
        cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
@@ -7165,10 +7381,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
  * groups, so roll our own. Now each node has its own list of groups which
  * gets dynamically allocated.
  */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
 static struct sched_group ***sched_group_nodes_bycpu;
 
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
 
 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
@@ -7176,10 +7392,8 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
                                 struct cpumask *nodemask)
 {
        int group;
-       /* FIXME: use cpumask_of_node */
-       node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
 
-       cpumask_and(nodemask, pnodemask, cpu_map);
+       cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
        group = cpumask_first(nodemask);
 
        if (sg)
@@ -7230,10 +7444,8 @@ static void free_sched_groups(const struct cpumask *cpu_map,
 
                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
-                       /* FIXME: Use cpumask_of_node */
-                       node_to_cpumask_ptr(pnodemask, i);
 
-                       cpus_and(*nodemask, *pnodemask, *cpu_map);
+                       cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                        if (cpumask_empty(nodemask))
                                continue;
 
@@ -7442,14 +7654,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
        for_each_cpu(i, cpu_map) {
                struct sched_domain *sd = NULL, *p;
 
-               /* FIXME: use cpumask_of_node */
-               *nodemask = node_to_cpumask(cpu_to_node(i));
-               cpus_and(*nodemask, *nodemask, *cpu_map);
+               cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
 
 #ifdef CONFIG_NUMA
                if (cpumask_weight(cpu_map) >
                                SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
-                       sd = &per_cpu(allnodes_domains, i);
+                       sd = &per_cpu(allnodes_domains, i).sd;
                        SD_INIT(sd, ALLNODES);
                        set_domain_attribute(sd, attr);
                        cpumask_copy(sched_domain_span(sd), cpu_map);
@@ -7459,7 +7669,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                } else
                        p = NULL;
 
-               sd = &per_cpu(node_domains, i);
+               sd = &per_cpu(node_domains, i).sd;
                SD_INIT(sd, NODE);
                set_domain_attribute(sd, attr);
                sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
@@ -7485,9 +7695,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                sd = &per_cpu(core_domains, i).sd;
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
-               *sched_domain_span(sd) = cpu_coregroup_map(i);
-               cpumask_and(sched_domain_span(sd),
-                           sched_domain_span(sd), cpu_map);
+               cpumask_and(sched_domain_span(sd), cpu_map,
+                                                  cpu_coregroup_mask(i));
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7523,9 +7732,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
        for_each_cpu(i, cpu_map) {
-               /* FIXME: Use cpu_coregroup_mask */
-               *this_core_map = cpu_coregroup_map(i);
-               cpus_and(*this_core_map, *this_core_map, *cpu_map);
+               cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
                if (i != cpumask_first(this_core_map))
                        continue;
 
@@ -7537,9 +7744,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++) {
-               /* FIXME: Use cpumask_of_node */
-               *nodemask = node_to_cpumask(i);
-               cpus_and(*nodemask, *nodemask, *cpu_map);
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                if (cpumask_empty(nodemask))
                        continue;
 
@@ -7561,11 +7766,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                struct sched_group *sg, *prev;
                int j;
 
-               /* FIXME: Use cpumask_of_node */
-               *nodemask = node_to_cpumask(i);
                cpumask_clear(covered);
-
-               cpus_and(*nodemask, *nodemask, *cpu_map);
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                if (cpumask_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
@@ -7585,7 +7787,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                for_each_cpu(j, nodemask) {
                        struct sched_domain *sd;
 
-                       sd = &per_cpu(node_domains, j);
+                       sd = &per_cpu(node_domains, j).sd;
                        sd->groups = sg;
                }
                sg->__cpu_power = 0;
@@ -7596,8 +7798,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
                for (j = 0; j < nr_node_ids; j++) {
                        int n = (i + j) % nr_node_ids;
-                       /* FIXME: Use cpumask_of_node */
-                       node_to_cpumask_ptr(pnodemask, n);
 
                        cpumask_complement(notcovered, covered);
                        cpumask_and(tmpmask, notcovered, cpu_map);
@@ -7605,7 +7805,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                        if (cpumask_empty(tmpmask))
                                break;
 
-                       cpumask_and(tmpmask, tmpmask, pnodemask);
+                       cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
                        if (cpumask_empty(tmpmask))
                                continue;
 
@@ -7890,7 +8090,7 @@ match2:
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-int arch_reinit_sched_domains(void)
+static void arch_reinit_sched_domains(void)
 {
        get_online_cpus();
 
@@ -7899,25 +8099,33 @@ int arch_reinit_sched_domains(void)
 
        rebuild_sched_domains();
        put_online_cpus();
-
-       return 0;
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 {
-       int ret;
+       unsigned int level = 0;
+
+       if (sscanf(buf, "%u", &level) != 1)
+               return -EINVAL;
+
+       /*
+        * level is always be positive so don't check for
+        * level < POWERSAVINGS_BALANCE_NONE which is 0
+        * What happens on 0 or 1 byte write,
+        * need to check for count as well?
+        */
 
-       if (buf[0] != '0' && buf[0] != '1')
+       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
                return -EINVAL;
 
        if (smt)
-               sched_smt_power_savings = (buf[0] == '1');
+               sched_smt_power_savings = level;
        else
-               sched_mc_power_savings = (buf[0] == '1');
+               sched_mc_power_savings = level;
 
-       ret = arch_reinit_sched_domains();
+       arch_reinit_sched_domains();
 
-       return ret ? ret : count;
+       return count;
 }
 
 #ifdef CONFIG_SCHED_MC
@@ -7952,7 +8160,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
                   sched_smt_power_savings_store);
 #endif
 
-int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 {
        int err = 0;
 
@@ -8938,6 +9146,13 @@ static int tg_schedulable(struct task_group *tg, void *data)
                runtime = d->rt_runtime;
        }
 
+#ifdef CONFIG_USER_SCHED
+       if (tg == &root_task_group) {
+               period = global_rt_period();
+               runtime = global_rt_runtime();
+       }
+#endif
+
        /*
         * Cannot have more runtime than the period.
         */
@@ -9091,6 +9306,16 @@ static int sched_rt_global_constraints(void)
 
        return ret;
 }
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+       /* Don't accept realtime tasks when there is no way for them to run */
+       if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+               return 0;
+
+       return 1;
+}
+
 #else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
@@ -9184,8 +9409,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                      struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
-       /* Don't accept realtime tasks when there is no way for them to run */
-       if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+       if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
                return -EINVAL;
 #else
        /* We don't support RT-tasks being in separate groups */
@@ -9346,6 +9570,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
        kfree(ca);
 }
 
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 data;
+
+#ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       data = *cpuusage;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+       data = *cpuusage;
+#endif
+
+       return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       *cpuusage = val;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+       *cpuusage = val;
+#endif
+}
+
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
@@ -9353,17 +9612,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
        u64 totalcpuusage = 0;
        int i;
 
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-               /*
-                * Take rq->lock to make 64-bit addition safe on 32-bit
-                * platforms.
-                */
-               spin_lock_irq(&cpu_rq(i)->lock);
-               totalcpuusage += *cpuusage;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
+       for_each_present_cpu(i)
+               totalcpuusage += cpuacct_cpuusage_read(ca, i);
 
        return totalcpuusage;
 }
@@ -9380,23 +9630,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
                goto out;
        }
 
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+       for_each_present_cpu(i)
+               cpuacct_cpuusage_write(ca, i, 0);
 
-               spin_lock_irq(&cpu_rq(i)->lock);
-               *cpuusage = 0;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
 out:
        return err;
 }
 
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+                                  struct seq_file *m)
+{
+       struct cpuacct *ca = cgroup_ca(cgroup);
+       u64 percpu;
+       int i;
+
+       for_each_present_cpu(i) {
+               percpu = cpuacct_cpuusage_read(ca, i);
+               seq_printf(m, "%llu ", (unsigned long long) percpu);
+       }
+       seq_printf(m, "\n");
+       return 0;
+}
+
 static struct cftype files[] = {
        {
                .name = "usage",
                .read_u64 = cpuusage_read,
                .write_u64 = cpuusage_write,
        },
+       {
+               .name = "usage_percpu",
+               .read_seq_string = cpuacct_percpu_seq_read,
+       },
+
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)