sched: Enable might_sleep before initializing drivers.
diff --git a/kernel/sched.c b/kernel/sched.c
index e309dba..d0f600c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
 
 static inline int rt_policy(int policy)
 {
-       if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+       if (policy == SCHED_FIFO || policy == SCHED_RR)
                return 1;
        return 0;
 }
@@ -231,7 +234,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 #endif
 
 /*
- * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * sched_domains_mutex serializes calls to init_sched_domains,
  * detach_destroy_domains and partition_sched_domains.
  */
 static DEFINE_MUTEX(sched_domains_mutex);
@@ -292,7 +295,7 @@ static DEFINE_SPINLOCK(task_group_lock);
  * (The default weight is 1024 - so there's no practical
  *  limitation from this.)
  */
-#define MIN_SHARES     2
+#define MIN_SHARES     (1UL <<  1)
 #define MAX_SHARES     (1UL << 18)
 
 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
@@ -328,7 +331,9 @@ struct cfs_rq {
         */
        struct sched_entity *curr, *next, *last, *skip;
 
+#ifdef CONFIG_SCHED_DEBUG
        unsigned int nr_spread_over;
+#endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
@@ -420,6 +425,8 @@ struct rt_rq {
  */
 struct root_domain {
        atomic_t refcount;
+       atomic_t rto_count;
+       struct rcu_head rcu;
        cpumask_var_t span;
        cpumask_var_t online;
 
@@ -428,7 +435,6 @@ struct root_domain {
         * one runnable RT task.
         */
        cpumask_var_t rto_mask;
-       atomic_t rto_count;
        struct cpupri cpupri;
 };
 
@@ -463,7 +469,7 @@ struct rq {
        u64 nohz_stamp;
        unsigned char nohz_balance_kick;
 #endif
-       unsigned int skip_clock_update;
+       int skip_clock_update;
 
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
@@ -525,6 +531,12 @@ struct rq {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
        u64 prev_irq_time;
 #endif
+#ifdef CONFIG_PARAVIRT
+       u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       u64 prev_steal_time_rq;
+#endif
 
        /* calc_load related fields */
        unsigned long calc_load_update;
@@ -556,6 +568,10 @@ struct rq {
        unsigned int ttwu_count;
        unsigned int ttwu_local;
 #endif
+
+#ifdef CONFIG_SMP
+       struct task_struct *wake_list;
+#endif
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -574,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
 
 #define rcu_dereference_check_sched_domain(p) \
        rcu_dereference_check((p), \
-                             rcu_read_lock_sched_held() || \
                              lockdep_is_held(&sched_domains_mutex))
 
 /*
@@ -598,10 +613,10 @@ static inline int cpu_of(struct rq *rq)
 /*
  * Return the group to which this task belongs.
  *
- * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
- * holds that lock for each task it moves into the cgroup. Therefore
- * by holding that lock, we pin the task to the current cgroup.
+ * We use task_subsys_state_check() and extend the RCU verification with
+ * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * task it moves into the cgroup. Therefore by holding either of those locks,
+ * we pin the task to the current cgroup.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
@@ -609,7 +624,8 @@ static inline struct task_group *task_group(struct task_struct *p)
        struct cgroup_subsys_state *css;
 
        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-                       lockdep_is_held(&p->pi_lock));
+                       lockdep_is_held(&p->pi_lock) ||
+                       lockdep_is_held(&task_rq(p)->lock));
        tg = container_of(css, struct task_group, css);
 
        return autogroup_task_group(p, tg);
@@ -645,7 +661,7 @@ static void update_rq_clock(struct rq *rq)
 {
        s64 delta;
 
-       if (rq->skip_clock_update)
+       if (rq->skip_clock_update > 0)
                return;
 
        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -1203,11 +1219,17 @@ int get_nohz_timer_target(void)
        int i;
        struct sched_domain *sd;
 
+       rcu_read_lock();
        for_each_domain(cpu, sd) {
-               for_each_cpu(i, sched_domain_span(sd))
-                       if (!idle_cpu(i))
-                               return i;
+               for_each_cpu(i, sched_domain_span(sd)) {
+                       if (!idle_cpu(i)) {
+                               cpu = i;
+                               goto unlock;
+                       }
+               }
        }
+unlock:
+       rcu_read_unlock();
        return cpu;
 }
 /*
@@ -1317,15 +1339,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 {
        u64 tmp;
 
+       /*
+        * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+        * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+        * 2^SCHED_LOAD_RESOLUTION.
+        */
+       if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+               tmp = (u64)delta_exec * scale_load_down(weight);
+       else
+               tmp = (u64)delta_exec;
+
        if (!lw->inv_weight) {
-               if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+               unsigned long w = scale_load_down(lw->weight);
+
+               if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
                        lw->inv_weight = 1;
+               else if (unlikely(!w))
+                       lw->inv_weight = WMULT_CONST;
                else
-                       lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
-                               / (lw->weight+1);
+                       lw->inv_weight = WMULT_CONST / w;
        }
 
-       tmp = (u64)delta_exec * weight;
        /*
         * Check whether we'd overflow the 64-bit multiplication:
         */
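
The rewritten calc_delta_mine() above still computes delta_exec * weight / lw->weight, but now on scale_load_down()'d weights and with a simpler inverse weight, inv_weight = WMULT_CONST / w. The stand-alone C sketch below shows the underlying fixed-point trick in isolation; it ignores the SCHED_LOAD_RESOLUTION scaling and the 64-bit overflow check (SRR) for clarity, and the WMULT values and helper name are assumptions, not kernel code.

/*
 * User-space sketch of the inverse-weight trick: delta * weight / lw_weight
 * is approximated as (delta * weight * inv_weight) >> 32, where
 * inv_weight = WMULT_CONST / lw_weight.
 */
#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST   (~0U)          /* ~2^32, as assumed here */
#define WMULT_SHIFT   32

static uint64_t calc_delta(uint64_t delta_exec, unsigned long weight,
                           unsigned long lw_weight)
{
        uint32_t inv_weight = WMULT_CONST / lw_weight;

        /* the kernel additionally guards this multiply against overflow */
        return (delta_exec * weight * (uint64_t)inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
        /* a nice-0 task (weight 1024) on a runqueue with total weight 3072 */
        printf("%llu\n",
               (unsigned long long)calc_delta(6000000, 1024, 3072));
        /* prints roughly 2000000: the task earned 1/3 of the 6ms slice */
        return 0;
}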
@@ -1542,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
        return rq->avg_load_per_task;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-       unsigned long load;
-       long cpu = (long)data;
-
-       if (!tg->parent) {
-               load = cpu_rq(cpu)->load.weight;
-       } else {
-               load = tg->parent->cfs_rq[cpu]->h_load;
-               load *= tg->se[cpu]->load.weight;
-               load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
-       }
-
-       tg->cfs_rq[cpu]->h_load = load;
-
-       return 0;
-}
-
-static void update_h_load(long cpu)
-{
-       walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
 #ifdef CONFIG_PREEMPT
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1765,17 +1767,20 @@ static void dec_nr_running(struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
+       int prio = p->static_prio - MAX_RT_PRIO;
+       struct load_weight *load = &p->se.load;
+
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
        if (p->policy == SCHED_IDLE) {
-               p->se.load.weight = WEIGHT_IDLEPRIO;
-               p->se.load.inv_weight = WMULT_IDLEPRIO;
+               load->weight = scale_load(WEIGHT_IDLEPRIO);
+               load->inv_weight = WMULT_IDLEPRIO;
                return;
        }
 
-       p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
-       p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+       load->weight = scale_load(prio_to_weight[prio]);
+       load->inv_weight = prio_to_wmult[prio];
 }
 
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1924,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
 {
-       s64 irq_delta;
+       if (unlikely(steal > NSEC_PER_SEC))
+               return div_u64(steal, TICK_NSEC);
+
+       return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
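
steal_ticks() above just converts a nanosecond steal delta into whole scheduler ticks; the common sub-second case uses __iter_div_u64_rem(), which divides by repeated subtraction and is cheaper than a full 64-bit divide there. A minimal user-space sketch of the same computation, assuming HZ == 1000 (TICK_NSEC and the helper below are stand-ins, not the kernel's):

/*
 * How many whole ticks fit into a nanosecond steal delta.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC  1000000000ULL
#define TICK_NSEC     (NSEC_PER_SEC / 1000)   /* assuming HZ == 1000 */

static uint64_t steal_ticks(uint64_t steal)
{
        if (steal > NSEC_PER_SEC)             /* rare: fall back to a divide */
                return steal / TICK_NSEC;

        uint64_t ticks = 0;
        while (steal >= TICK_NSEC) {          /* what __iter_div_u64_rem does */
                steal -= TICK_NSEC;
                ticks++;
        }
        return ticks;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)steal_ticks(3500000));  /* 3 */
        return 0;
}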
 
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
        /*
@@ -1950,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
        rq->prev_irq_time += irq_delta;
        delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       if (static_branch((&paravirt_steal_rq_enabled))) {
+               u64 st;
+
+               steal = paravirt_steal_clock(cpu_of(rq));
+               steal -= rq->prev_steal_time_rq;
+
+               if (unlikely(steal > delta))
+                       steal = delta;
+
+               st = steal_ticks(steal);
+               steal = st * TICK_NSEC;
+
+               rq->prev_steal_time_rq += steal;
+
+               delta -= steal;
+       }
+#endif
+
        rq->clock_task += delta;
 
-       if (irq_delta && sched_feat(NONIRQ_POWER))
-               sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+               sched_rt_avg_update(rq, irq_delta + steal);
+#endif
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -1990,12 +2036,7 @@ static int irqtime_account_si_update(void)
 
 #define sched_clock_irqtime    (0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-       rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2172,6 +2213,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                        !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
 
 #ifdef CONFIG_LOCKDEP
+       /*
+        * The caller should hold either p->pi_lock or rq->lock, when changing
+        * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+        *
+        * sched_move_task() holds both and thus holding either pins the cgroup,
+        * see set_task_rq().
+        *
+        * Furthermore, all task_rq users should acquire both locks, see
+        * task_rq_lock().
+        */
        WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
                                      lockdep_is_held(&task_rq(p)->lock)));
 #endif
@@ -2181,7 +2232,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
        if (task_cpu(p) != new_cpu) {
                p->se.nr_migrations++;
-               perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+               perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
        }
 
        __set_task_cpu(p, new_cpu);
@@ -2195,21 +2246,6 @@ struct migration_arg {
 static int migration_cpu_stop(void *data);
 
 /*
- * The task's runqueue lock must be held.
- * Returns true if you have to wait for migration thread.
- */
-static bool need_migrate_task(struct task_struct *p)
-{
-       /*
-        * If the task is not on a runqueue (and not running), then
-        * the next wake-up will properly place the task.
-        */
-       bool running = p->on_rq || p->on_cpu;
-       smp_rmb(); /* finish_lock_switch() */
-       return running;
-}
-
-/*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2425,13 +2461,19 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
                struct sched_domain *sd;
 
                schedstat_inc(p, se.statistics.nr_wakeups_remote);
+               rcu_read_lock();
                for_each_domain(this_cpu, sd) {
                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                schedstat_inc(sd, ttwu_wake_remote);
                                break;
                        }
                }
+               rcu_read_unlock();
        }
+
+       if (wake_flags & WF_MIGRATED)
+               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
 #endif /* CONFIG_SMP */
 
        schedstat_inc(rq, ttwu_count);
@@ -2440,9 +2482,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
        if (wake_flags & WF_SYNC)
                schedstat_inc(p, se.statistics.nr_wakeups_sync);
 
-       if (cpu != task_cpu(p))
-               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
 #endif /* CONFIG_SCHEDSTATS */
 }
 
@@ -2470,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
        if (p->sched_class->task_woken)
                p->sched_class->task_woken(rq, p);
 
-       if (unlikely(rq->idle_stamp)) {
+       if (rq->idle_stamp) {
                u64 delta = rq->clock - rq->idle_stamp;
                u64 max = 2*sysctl_sched_migration_cost;
 
@@ -2483,6 +2522,151 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 #endif
 }
 
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+       if (p->sched_contributes_to_load)
+               rq->nr_uninterruptible--;
+#endif
+
+       ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+       ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue;
+ * in this case we must do a remote wakeup. It's a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING: the task
+ * is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+       struct rq *rq;
+       int ret = 0;
+
+       rq = __task_rq_lock(p);
+       if (p->on_rq) {
+               ttwu_do_wakeup(rq, p, wake_flags);
+               ret = 1;
+       }
+       __task_rq_unlock(rq);
+
+       return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_do_pending(struct task_struct *list)
+{
+       struct rq *rq = this_rq();
+
+       raw_spin_lock(&rq->lock);
+
+       while (list) {
+               struct task_struct *p = list;
+               list = list->wake_entry;
+               ttwu_do_activate(rq, p, 0);
+       }
+
+       raw_spin_unlock(&rq->lock);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void sched_ttwu_pending(void)
+{
+       struct rq *rq = this_rq();
+       struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+       if (!list)
+               return;
+
+       sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void scheduler_ipi(void)
+{
+       struct rq *rq = this_rq();
+       struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+       if (!list)
+               return;
+
+       /*
+        * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+        * traditionally all their work was done from the interrupt return
+        * path. Now that we actually do some work, we need to make sure
+        * we do call them.
+        *
+        * Some archs already do call them, luckily irq_enter/exit nest
+        * properly.
+        *
+        * Arguably we should visit all archs and update all handlers,
+        * however a fair share of IPIs are still resched only so this would
+        * somewhat pessimize the simple resched case.
+        */
+       irq_enter();
+       sched_ttwu_do_pending(list);
+       irq_exit();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct task_struct *next = rq->wake_list;
+
+       for (;;) {
+               struct task_struct *old = next;
+
+               p->wake_entry = next;
+               next = cmpxchg(&rq->wake_list, old, p);
+               if (next == old)
+                       break;
+       }
+
+       if (!next)
+               smp_send_reschedule(cpu);
+}
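
ttwu_queue_remote() and scheduler_ipi()/sched_ttwu_do_pending() above form a producer/consumer pair on rq->wake_list: producers push with a cmpxchg loop and only the empty-to-non-empty transition sends the IPI, while the handler detaches the whole list at once with xchg(). Below is a user-space sketch of that protocol using C11 atomics; all names are illustrative, and it runs single-threaded, so it demonstrates the list logic rather than the concurrency.

/*
 * Sketch of the wake_list protocol: lock-free push, IPI only on the
 * 0 -> 1 transition, and a take-everything drain on the consumer side.
 */
#include <stdatomic.h>
#include <stdio.h>

struct node {
        int id;
        struct node *next;
};

static _Atomic(struct node *) wake_list;

static void send_ipi(void)      /* stands in for smp_send_reschedule() */
{
        printf("IPI sent\n");
}

static void queue_remote(struct node *n)
{
        struct node *old = atomic_load(&wake_list);

        do {
                n->next = old;
        } while (!atomic_compare_exchange_weak(&wake_list, &old, n));

        if (!old)               /* list was empty: we own sending the IPI */
                send_ipi();
}

static void drain_pending(void)
{
        /* grab the whole list in one go, like xchg(&rq->wake_list, NULL) */
        struct node *list = atomic_exchange(&wake_list, NULL);

        while (list) {
                struct node *n = list;
                list = list->next;
                printf("activating %d\n", n->id);   /* cf. ttwu_do_activate() */
        }
}

int main(void)
{
        struct node a = { .id = 1 }, b = { .id = 2 };

        queue_remote(&a);       /* prints "IPI sent" */
        queue_remote(&b);       /* list already non-empty: no second IPI */
        drain_pending();        /* activates 2 then 1 */
        return 0;
}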
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+       struct rq *rq;
+       int ret = 0;
+
+       rq = __task_rq_lock(p);
+       if (p->on_cpu) {
+               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+               ttwu_do_wakeup(rq, p, wake_flags);
+               ret = 1;
+       }
+       __task_rq_unlock(rq);
+
+       return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+       if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+               sched_clock_cpu(cpu); /* sync clocks x-cpu */
+               ttwu_queue_remote(p, cpu);
+               return;
+       }
+#endif
+
+       raw_spin_lock(&rq->lock);
+       ttwu_do_activate(rq, p, 0);
+       raw_spin_unlock(&rq->lock);
+}
+
 /**
  * try_to_wake_up - wake up a thread
  * @p: the thread to be awakened
@@ -2501,39 +2685,39 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 static int
 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
-       int cpu, this_cpu, success = 0;
        unsigned long flags;
-       struct rq *rq;
-
-       this_cpu = get_cpu();
+       int cpu, success = 0;
 
        smp_wmb();
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        if (!(p->state & state))
                goto out;
 
+       success = 1; /* we're going to change ->state */
        cpu = task_cpu(p);
 
-       if (p->on_rq) {
-               rq = __task_rq_lock(p);
-               if (p->on_rq)
-                       goto out_running;
-               __task_rq_unlock(rq);
-       }
+       if (p->on_rq && ttwu_remote(p, wake_flags))
+               goto stat;
 
 #ifdef CONFIG_SMP
+       /*
+        * If the owning (remote) cpu is still in the middle of schedule() with
+        * this task as prev, wait until it's done referencing the task.
+        */
        while (p->on_cpu) {
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
                /*
-                * If called from interrupt context we could have landed in the
-                * middle of schedule(), in this case we should take care not
-                * to spin on ->on_cpu if p is current, since that would
-                * deadlock.
+                * In case the architecture enables interrupts in
+                * context_switch(), we cannot busy wait, since that
+                * would lead to deadlocks when an interrupt hits and
+                * tries to wake up @prev. So bail and do a complete
+                * remote wakeup.
                 */
-               if (p == current)
-                       goto out_activate;
-#endif
+               if (ttwu_activate_remote(p, wake_flags))
+                       goto stat;
+#else
                cpu_relax();
+#endif
        }
        /*
         * Pairs with the smp_wmb() in finish_lock_switch().
@@ -2547,32 +2731,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
                p->sched_class->task_waking(p);
 
        cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-out_activate:
-#endif
-#endif /* CONFIG_SMP */
-
-       rq = cpu_rq(cpu);
-       raw_spin_lock(&rq->lock);
-
-#ifdef CONFIG_SMP
-       if (cpu != task_cpu(p))
+       if (task_cpu(p) != cpu) {
+               wake_flags |= WF_MIGRATED;
                set_task_cpu(p, cpu);
+       }
+#endif /* CONFIG_SMP */
 
-       if (p->sched_contributes_to_load)
-               rq->nr_uninterruptible--;
-#endif
-
-       ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
-out_running:
-       ttwu_do_wakeup(rq, p, wake_flags);
-       success = 1;
-       __task_rq_unlock(rq);
-
+       ttwu_queue(p, cpu);
+stat:
        ttwu_stat(p, cpu, wake_flags);
 out:
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-       put_cpu();
 
        return success;
 }
@@ -2665,7 +2834,7 @@ static void __sched_fork(struct task_struct *p)
 /*
  * fork()/clone()-time setup:
  */
-void sched_fork(struct task_struct *p, int clone_flags)
+void sched_fork(struct task_struct *p)
 {
        unsigned long flags;
        int cpu = get_cpu();
@@ -2729,7 +2898,7 @@ void sched_fork(struct task_struct *p, int clone_flags)
 #if defined(CONFIG_SMP)
        p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
 #endif
@@ -2747,7 +2916,7 @@ void sched_fork(struct task_struct *p, int clone_flags)
  * that must be done for every newly created context, then puts the task
  * on the runqueue and wakes it.
  */
-void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+void wake_up_new_task(struct task_struct *p)
 {
        unsigned long flags;
        struct rq *rq;
@@ -2896,7 +3065,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_disable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-       perf_event_task_sched_in(current);
+       perf_event_task_sched_in(prev, current);
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_enable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -3556,30 +3725,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 }
 
 /*
- * Return sum_exec_runtime for the thread group.
- * In case the task is currently running, return the sum plus current's
- * pending runtime that have not been accounted yet.
- *
- * Note that the thread group might have other running tasks as well,
- * so the return value not includes other pending runtime that other
- * running tasks might have.
- */
-unsigned long long thread_group_sched_runtime(struct task_struct *p)
-{
-       struct task_cputime totals;
-       unsigned long flags;
-       struct rq *rq;
-       u64 ns;
-
-       rq = task_rq_lock(p, &flags);
-       thread_group_cputime(p, &totals);
-       ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, p, &flags);
-
-       return ns;
-}
-
-/*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in user space since the last update
@@ -3720,6 +3865,25 @@ void account_idle_time(cputime_t cputime)
                cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 }
 
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+       if (static_branch(&paravirt_steal_enabled)) {
+               u64 steal, st = 0;
+
+               steal = paravirt_steal_clock(smp_processor_id());
+               steal -= this_rq()->prev_steal_time;
+
+               st = steal_ticks(steal);
+               this_rq()->prev_steal_time += st * TICK_NSEC;
+
+               account_steal_time(st);
+               return st;
+       }
+#endif
+       return false;
+}
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3751,6 +3915,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
        cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
+       if (steal_account_process_tick())
+               return;
+
        if (irqtime_account_hi_update()) {
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
        } else if (irqtime_account_si_update()) {
@@ -3804,6 +3971,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;
        }
 
+       if (steal_account_process_tick())
+               return;
+
        if (user_tick)
                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3928,9 +4098,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
  */
 void scheduler_tick(void)
 {
@@ -4050,17 +4217,11 @@ static inline void schedule_debug(struct task_struct *prev)
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
        schedstat_inc(this_rq(), sched_count);
-#ifdef CONFIG_SCHEDSTATS
-       if (unlikely(prev->lock_depth >= 0)) {
-               schedstat_inc(this_rq(), rq_sched_info.bkl_count);
-               schedstat_inc(prev, sched_info.bkl_count);
-       }
-#endif
 }
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-       if (prev->on_rq)
+       if (prev->on_rq || rq->skip_clock_update < 0)
                update_rq_clock(rq);
        prev->sched_class->put_prev_task(rq, prev);
 }
@@ -4094,9 +4255,9 @@ pick_next_task(struct rq *rq)
 }
 
 /*
- * schedule() is the main scheduler function.
+ * __schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+static void __sched __schedule(void)
 {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
@@ -4137,16 +4298,6 @@ need_resched:
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
-
-                       /*
-                        * If we are going to sleep and we have plugged IO
-                        * queued, make sure to submit it to avoid deadlocks.
-                        */
-                       if (blk_needs_flush_plug(prev)) {
-                               raw_spin_unlock(&rq->lock);
-                               blk_flush_plug(prev);
-                               raw_spin_lock(&rq->lock);
-                       }
                }
                switch_count = &prev->nvcsw;
        }
@@ -4184,17 +4335,34 @@ need_resched:
        if (need_resched())
                goto need_resched;
 }
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+       if (!tsk->state)
+               return;
+       /*
+        * If we are going to sleep and we have plugged IO queued,
+        * make sure to submit it to avoid deadlocks.
+        */
+       if (blk_needs_flush_plug(tsk))
+               blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage void __sched schedule(void)
+{
+       struct task_struct *tsk = current;
+
+       sched_submit_work(tsk);
+       __schedule();
+}
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 {
-       bool ret = false;
-
-       rcu_read_lock();
        if (lock->owner != owner)
-               goto fail;
+               return false;
 
        /*
         * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4204,11 +4372,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
         */
        barrier();
 
-       ret = owner->on_cpu;
-fail:
-       rcu_read_unlock();
-
-       return ret;
+       return owner->on_cpu;
 }
 
 /*
@@ -4220,21 +4384,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
        if (!sched_feat(OWNER_SPIN))
                return 0;
 
+       rcu_read_lock();
        while (owner_running(lock, owner)) {
                if (need_resched())
-                       return 0;
+                       break;
 
                arch_mutex_cpu_relax();
        }
+       rcu_read_unlock();
 
        /*
-        * If the owner changed to another task there is likely
-        * heavy contention, stop spinning.
+        * We break out of the loop above on need_resched() and when the
+        * owner changes, which is a sign of heavy contention. Return
+        * success only when lock->owner is NULL.
         */
-       if (lock->owner)
-               return 0;
-
-       return 1;
+       return lock->owner == NULL;
 }
 #endif
 
@@ -4257,7 +4421,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
 
        do {
                add_preempt_count_notrace(PREEMPT_ACTIVE);
-               schedule();
+               __schedule();
                sub_preempt_count_notrace(PREEMPT_ACTIVE);
 
                /*
@@ -4285,7 +4449,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
        do {
                add_preempt_count(PREEMPT_ACTIVE);
                local_irq_enable();
-               schedule();
+               __schedule();
                local_irq_disable();
                sub_preempt_count(PREEMPT_ACTIVE);
 
@@ -5410,7 +5574,7 @@ static inline int should_resched(void)
 static void __cond_resched(void)
 {
        add_preempt_count(PREEMPT_ACTIVE);
-       schedule();
+       __schedule();
        sub_preempt_count(PREEMPT_ACTIVE);
 }
 
@@ -5759,7 +5923,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
 
-       cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+       do_set_cpus_allowed(idle, cpumask_of(cpu));
        /*
         * We're having a chicken and egg problem, even though we are
         * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5781,11 +5945,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 
        /* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT)
-       task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
-#else
        task_thread_info(idle)->preempt_count = 0;
-#endif
+
        /*
         * The idle tasks have their own, simple scheduling class:
         */
@@ -5850,6 +6011,16 @@ static inline void sched_init_granularity(void)
 }
 
 #ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+       if (p->sched_class && p->sched_class->set_cpus_allowed)
+               p->sched_class->set_cpus_allowed(p, new_mask);
+       else {
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+       }
+}
+
 /*
  * This is how migration works:
  *
@@ -5882,30 +6053,27 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
        rq = task_rq_lock(p, &flags);
 
+       if (cpumask_equal(&p->cpus_allowed, new_mask))
+               goto out;
+
        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
                goto out;
        }
 
-       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-                    !cpumask_equal(&p->cpus_allowed, new_mask))) {
+       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
                ret = -EINVAL;
                goto out;
        }
 
-       if (p->sched_class->set_cpus_allowed)
-               p->sched_class->set_cpus_allowed(p, new_mask);
-       else {
-               cpumask_copy(&p->cpus_allowed, new_mask);
-               p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
-       }
+       do_set_cpus_allowed(p, new_mask);
 
        /* Can the task run on the task's current CPU? If so, we're done */
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
 
        dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (need_migrate_task(p)) {
+       if (p->on_rq) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, p, &flags);
@@ -6306,6 +6474,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_DYING:
+               sched_ttwu_pending();
                /* Update our root-domain */
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
@@ -6384,6 +6553,8 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SMP
 
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
 #ifdef CONFIG_SCHED_DEBUG
 
 static __read_mostly int sched_domain_debug_enabled;
@@ -6434,7 +6605,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                        break;
                }
 
-               if (!group->cpu_power) {
+               if (!group->sgp->power) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: domain->cpu_power not "
                                        "set\n");
@@ -6458,9 +6629,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
                printk(KERN_CONT " %s", str);
-               if (group->cpu_power != SCHED_LOAD_SCALE) {
+               if (group->sgp->power != SCHED_POWER_SCALE) {
                        printk(KERN_CONT " (cpu_power = %d)",
-                               group->cpu_power);
+                               group->sgp->power);
                }
 
                group = group->next;
@@ -6479,7 +6650,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
-       cpumask_var_t groupmask;
        int level = 0;
 
        if (!sched_domain_debug_enabled)
@@ -6492,20 +6662,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 
-       if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
-               printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
-               return;
-       }
-
        for (;;) {
-               if (sched_domain_debug_one(sd, cpu, level, groupmask))
+               if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
                        break;
                level++;
                sd = sd->parent;
                if (!sd)
                        break;
        }
-       free_cpumask_var(groupmask);
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6562,12 +6726,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
        return 1;
 }
 
-static void free_rootdomain(struct root_domain *rd)
+static void free_rootdomain(struct rcu_head *rcu)
 {
-       synchronize_sched();
+       struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
 
        cpupri_cleanup(&rd->cpupri);
-
        free_cpumask_var(rd->rto_mask);
        free_cpumask_var(rd->online);
        free_cpumask_var(rd->span);
@@ -6608,7 +6771,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 
        if (old_rd)
-               free_rootdomain(old_rd);
+               call_rcu_sched(&old_rd->rcu, free_rootdomain);
 }
 
 static int init_rootdomain(struct root_domain *rd)
@@ -6659,6 +6822,53 @@ static struct root_domain *alloc_rootdomain(void)
        return rd;
 }
 
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+       struct sched_group *tmp, *first;
+
+       if (!sg)
+               return;
+
+       first = sg;
+       do {
+               tmp = sg->next;
+
+               if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+                       kfree(sg->sgp);
+
+               kfree(sg);
+               sg = tmp;
+       } while (sg != first);
+}
+
+static void free_sched_domain(struct rcu_head *rcu)
+{
+       struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+       /*
+        * If it's an overlapping domain it has private groups, iterate and
+        * nuke them all.
+        */
+       if (sd->flags & SD_OVERLAP) {
+               free_sched_groups(sd->groups, 1);
+       } else if (atomic_dec_and_test(&sd->groups->ref)) {
+               kfree(sd->groups->sgp);
+               kfree(sd->groups);
+       }
+       kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+       call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+       for (; sd; sd = sd->parent)
+               destroy_sched_domain(sd, cpu);
+}
+
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
@@ -6669,9 +6879,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        struct rq *rq = cpu_rq(cpu);
        struct sched_domain *tmp;
 
-       for (tmp = sd; tmp; tmp = tmp->parent)
-               tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
-
        /* Remove the sched domains which do not contribute to scheduling. */
        for (tmp = sd; tmp; ) {
                struct sched_domain *parent = tmp->parent;
@@ -6682,12 +6889,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
                        tmp->parent = parent->parent;
                        if (parent->parent)
                                parent->parent->child = tmp;
+                       destroy_sched_domain(parent, cpu);
                } else
                        tmp = tmp->parent;
        }
 
        if (sd && sd_degenerate(sd)) {
+               tmp = sd;
                sd = sd->parent;
+               destroy_sched_domain(tmp, cpu);
                if (sd)
                        sd->child = NULL;
        }
@@ -6695,7 +6905,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        sched_domain_debug(sd, cpu);
 
        rq_attach_root(rq, rd);
+       tmp = rq->sd;
        rcu_assign_pointer(rq->sd, sd);
+       destroy_sched_domains(tmp, cpu);
 }
 
 /* cpus with isolated domains */
@@ -6711,56 +6923,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-/*
- * init_sched_build_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
- * init_sched_build_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
- */
-static void
-init_sched_build_groups(const struct cpumask *span,
-                       const struct cpumask *cpu_map,
-                       int (*group_fn)(int cpu, const struct cpumask *cpu_map,
-                                       struct sched_group **sg,
-                                       struct cpumask *tmpmask),
-                       struct cpumask *covered, struct cpumask *tmpmask)
-{
-       struct sched_group *first = NULL, *last = NULL;
-       int i;
-
-       cpumask_clear(covered);
-
-       for_each_cpu(i, span) {
-               struct sched_group *sg;
-               int group = group_fn(i, cpu_map, &sg, tmpmask);
-               int j;
-
-               if (cpumask_test_cpu(i, covered))
-                       continue;
-
-               cpumask_clear(sched_group_cpus(sg));
-               sg->cpu_power = 0;
-
-               for_each_cpu(j, span) {
-                       if (group_fn(j, cpu_map, NULL, tmpmask) != group)
-                               continue;
-
-                       cpumask_set_cpu(j, covered);
-                       cpumask_set_cpu(j, sched_group_cpus(sg));
-               }
-               if (!first)
-                       first = sg;
-               if (last)
-                       last->next = sg;
-               last = sg;
-       }
-       last->next = first;
-}
-
 #define SD_NODES_PER_DOMAIN 16
 
 #ifdef CONFIG_NUMA
@@ -6777,7 +6939,7 @@ init_sched_build_groups(const struct cpumask *span,
  */
 static int find_next_best_node(int node, nodemask_t *used_nodes)
 {
-       int i, n, val, min_val, best_node = 0;
+       int i, n, val, min_val, best_node = -1;
 
        min_val = INT_MAX;
 
@@ -6801,7 +6963,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
                }
        }
 
-       node_set(best_node, *used_nodes);
+       if (best_node != -1)
+               node_set(best_node, *used_nodes);
        return best_node;
 }
 
@@ -6827,315 +6990,197 @@ static void sched_domain_node_span(int node, struct cpumask *span)
 
        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, &used_nodes);
-
+               if (next_node < 0)
+                       break;
                cpumask_or(span, span, cpumask_of_node(next_node));
        }
 }
+
+static const struct cpumask *cpu_node_mask(int cpu)
+{
+       lockdep_assert_held(&sched_domains_mutex);
+
+       sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+
+       return sched_domains_tmpmask;
+}
+
+static const struct cpumask *cpu_allnodes_mask(int cpu)
+{
+       return cpu_possible_mask;
+}
 #endif /* CONFIG_NUMA */
 
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+static const struct cpumask *cpu_cpu_mask(int cpu)
+{
+       return cpumask_of_node(cpu_to_node(cpu));
+}
 
-/*
- * The cpus mask in sched_group and sched_domain hangs off the end.
- *
- * ( See the the comments in include/linux/sched.h:struct sched_group
- *   and struct sched_domain. )
- */
-struct static_sched_group {
-       struct sched_group sg;
-       DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
-};
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
-struct static_sched_domain {
-       struct sched_domain sd;
-       DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+struct sd_data {
+       struct sched_domain **__percpu sd;
+       struct sched_group **__percpu sg;
+       struct sched_group_power **__percpu sgp;
 };
 
 struct s_data {
-#ifdef CONFIG_NUMA
-       int                     sd_allnodes;
-       cpumask_var_t           domainspan;
-       cpumask_var_t           covered;
-       cpumask_var_t           notcovered;
-#endif
-       cpumask_var_t           nodemask;
-       cpumask_var_t           this_sibling_map;
-       cpumask_var_t           this_core_map;
-       cpumask_var_t           this_book_map;
-       cpumask_var_t           send_covered;
-       cpumask_var_t           tmpmask;
-       struct sched_group      **sched_group_nodes;
+       struct sched_domain ** __percpu sd;
        struct root_domain      *rd;
 };
 
 enum s_alloc {
-       sa_sched_groups = 0,
        sa_rootdomain,
-       sa_tmpmask,
-       sa_send_covered,
-       sa_this_book_map,
-       sa_this_core_map,
-       sa_this_sibling_map,
-       sa_nodemask,
-       sa_sched_group_nodes,
-#ifdef CONFIG_NUMA
-       sa_notcovered,
-       sa_covered,
-       sa_domainspan,
-#endif
+       sa_sd,
+       sa_sd_storage,
        sa_none,
 };
 
-/*
- * SMT sched-domains:
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
+struct sched_domain_topology_level;
 
-static int
-cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
-                struct sched_group **sg, struct cpumask *unused)
-{
-       if (sg)
-               *sg = &per_cpu(sched_groups, cpu).sg;
-       return cpu;
-}
-#endif /* CONFIG_SCHED_SMT */
+typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 
-/*
- * multi-core sched-domains:
- */
-#ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
+#define SDTL_OVERLAP   0x01
+
+struct sched_domain_topology_level {
+       sched_domain_init_f init;
+       sched_domain_mask_f mask;
+       int                 flags;
+       struct sd_data      data;
+};
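
Each sched_domain_topology_level bundles an init function and a mask function, and the domain builder walks a NULL-terminated table of these levels from the innermost (e.g. SMT) outwards. The table itself is defined elsewhere in this series, so the following is only a user-space analogue of the table-driven idea, with made-up level names:

/*
 * Sketch of a NULL-terminated topology table walked innermost-first.
 */
#include <stdio.h>

struct level {
        const char *(*init)(int cpu);
        const char *(*mask)(int cpu);
};

static const char *init_smt(int cpu)  { return "SMT"; }
static const char *init_cpu(int cpu)  { return "CPU"; }
static const char *mask_smt(int cpu)  { return "thread siblings of cpu"; }
static const char *mask_cpu(int cpu)  { return "all cpus in cpu's node"; }

static struct level topology[] = {
        { init_smt, mask_smt },
        { init_cpu, mask_cpu },
        { NULL, NULL },                 /* terminator, like { NULL, } */
};

int main(void)
{
        for (struct level *tl = topology; tl->init; tl++)
                printf("%s domain spans: %s\n", tl->init(0), tl->mask(0));
        return 0;
}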
 
 static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
-                 struct sched_group **sg, struct cpumask *mask)
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
-       int group;
-#ifdef CONFIG_SCHED_SMT
-       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
-       group = cpumask_first(mask);
-#else
-       group = cpu;
-#endif
-       if (sg)
-               *sg = &per_cpu(sched_group_core, group).sg;
-       return group;
-}
-#endif /* CONFIG_SCHED_MC */
+       struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+       const struct cpumask *span = sched_domain_span(sd);
+       struct cpumask *covered = sched_domains_tmpmask;
+       struct sd_data *sdd = sd->private;
+       struct sched_domain *child;
+       int i;
 
-/*
- * book sched-domains:
- */
-#ifdef CONFIG_SCHED_BOOK
-static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+       cpumask_clear(covered);
 
-static int
-cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
-                 struct sched_group **sg, struct cpumask *mask)
-{
-       int group = cpu;
-#ifdef CONFIG_SCHED_MC
-       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
-       group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
-       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
-       group = cpumask_first(mask);
-#endif
-       if (sg)
-               *sg = &per_cpu(sched_group_book, group).sg;
-       return group;
-}
-#endif /* CONFIG_SCHED_BOOK */
+       for_each_cpu(i, span) {
+               struct cpumask *sg_span;
 
-static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
+               if (cpumask_test_cpu(i, covered))
+                       continue;
 
-static int
-cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
-                 struct sched_group **sg, struct cpumask *mask)
-{
-       int group;
-#ifdef CONFIG_SCHED_BOOK
-       cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
-       group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_MC)
-       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
-       group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
-       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
-       group = cpumask_first(mask);
-#else
-       group = cpu;
-#endif
-       if (sg)
-               *sg = &per_cpu(sched_group_phys, group).sg;
-       return group;
-}
+               sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                               GFP_KERNEL, cpu_to_node(i));
 
-#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
-static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
-static struct sched_group ***sched_group_nodes_bycpu;
+               if (!sg)
+                       goto fail;
 
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
+               sg_span = sched_group_cpus(sg);
 
-static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
-                                struct sched_group **sg,
-                                struct cpumask *nodemask)
-{
-       int group;
+               child = *per_cpu_ptr(sdd->sd, i);
+               if (child->child) {
+                       child = child->child;
+                       cpumask_copy(sg_span, sched_domain_span(child));
+               } else
+                       cpumask_set_cpu(i, sg_span);
 
-       cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
-       group = cpumask_first(nodemask);
+               cpumask_or(covered, covered, sg_span);
 
-       if (sg)
-               *sg = &per_cpu(sched_group_allnodes, group).sg;
-       return group;
-}
+               sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+               atomic_inc(&sg->sgp->ref);
 
-static void init_numa_sched_groups_power(struct sched_group *group_head)
-{
-       struct sched_group *sg = group_head;
-       int j;
+               if (cpumask_test_cpu(cpu, sg_span))
+                       groups = sg;
 
-       if (!sg)
-               return;
-       do {
-               for_each_cpu(j, sched_group_cpus(sg)) {
-                       struct sched_domain *sd;
+               if (!first)
+                       first = sg;
+               if (last)
+                       last->next = sg;
+               last = sg;
+               last->next = first;
+       }
+       sd->groups = groups;
 
-                       sd = &per_cpu(phys_domains, j).sd;
-                       if (j != group_first_cpu(sd->groups)) {
-                               /*
-                                * Only add "power" once for each
-                                * physical package.
-                                */
-                               continue;
-                       }
+       return 0;
 
-                       sg->cpu_power += sd->groups->cpu_power;
-               }
-               sg = sg->next;
-       } while (sg != group_head);
+fail:
+       free_sched_groups(first, 0);
+
+       return -ENOMEM;
 }
 
-static int build_numa_sched_groups(struct s_data *d,
-                                  const struct cpumask *cpu_map, int num)
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
-       struct sched_domain *sd;
-       struct sched_group *sg, *prev;
-       int n, j;
+       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+       struct sched_domain *child = sd->child;
 
-       cpumask_clear(d->covered);
-       cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
-       if (cpumask_empty(d->nodemask)) {
-               d->sched_group_nodes[num] = NULL;
-               goto out;
+       if (child)
+               cpu = cpumask_first(sched_domain_span(child));
+
+       if (sg) {
+               *sg = *per_cpu_ptr(sdd->sg, cpu);
+               (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+               atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
        }
 
-       sched_domain_node_span(num, d->domainspan);
-       cpumask_and(d->domainspan, d->domainspan, cpu_map);
+       return cpu;
+}
 
-       sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
-                         GFP_KERNEL, num);
-       if (!sg) {
-               printk(KERN_WARNING "Can not alloc domain group for node %d\n",
-                      num);
-               return -ENOMEM;
-       }
-       d->sched_group_nodes[num] = sg;
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+       struct sched_group *first = NULL, *last = NULL;
+       struct sd_data *sdd = sd->private;
+       const struct cpumask *span = sched_domain_span(sd);
+       struct cpumask *covered;
+       int i;
 
-       for_each_cpu(j, d->nodemask) {
-               sd = &per_cpu(node_domains, j).sd;
-               sd->groups = sg;
-       }
+       get_group(cpu, sdd, &sd->groups);
+       atomic_inc(&sd->groups->ref);
 
-       sg->cpu_power = 0;
-       cpumask_copy(sched_group_cpus(sg), d->nodemask);
-       sg->next = sg;
-       cpumask_or(d->covered, d->covered, d->nodemask);
+       if (cpu != cpumask_first(sched_domain_span(sd)))
+               return 0;
 
-       prev = sg;
-       for (j = 0; j < nr_node_ids; j++) {
-               n = (num + j) % nr_node_ids;
-               cpumask_complement(d->notcovered, d->covered);
-               cpumask_and(d->tmpmask, d->notcovered, cpu_map);
-               cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
-               if (cpumask_empty(d->tmpmask))
-                       break;
-               cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
-               if (cpumask_empty(d->tmpmask))
-                       continue;
-               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
-                                 GFP_KERNEL, num);
-               if (!sg) {
-                       printk(KERN_WARNING
-                              "Can not alloc domain group for node %d\n", j);
-                       return -ENOMEM;
-               }
-               sg->cpu_power = 0;
-               cpumask_copy(sched_group_cpus(sg), d->tmpmask);
-               sg->next = prev->next;
-               cpumask_or(d->covered, d->covered, d->tmpmask);
-               prev->next = sg;
-               prev = sg;
-       }
-out:
-       return 0;
-}
-#endif /* CONFIG_NUMA */
+       lockdep_assert_held(&sched_domains_mutex);
+       covered = sched_domains_tmpmask;
 
-#ifdef CONFIG_NUMA
-/* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const struct cpumask *cpu_map,
-                             struct cpumask *nodemask)
-{
-       int cpu, i;
+       cpumask_clear(covered);
 
-       for_each_cpu(cpu, cpu_map) {
-               struct sched_group **sched_group_nodes
-                       = sched_group_nodes_bycpu[cpu];
+       for_each_cpu(i, span) {
+               struct sched_group *sg;
+               int group = get_group(i, sdd, &sg);
+               int j;
 
-               if (!sched_group_nodes)
+               if (cpumask_test_cpu(i, covered))
                        continue;
 
-               for (i = 0; i < nr_node_ids; i++) {
-                       struct sched_group *oldsg, *sg = sched_group_nodes[i];
+               cpumask_clear(sched_group_cpus(sg));
+               sg->sgp->power = 0;
 
-                       cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-                       if (cpumask_empty(nodemask))
+               for_each_cpu(j, span) {
+                       if (get_group(j, sdd, NULL) != group)
                                continue;
 
-                       if (sg == NULL)
-                               continue;
-                       sg = sg->next;
-next_sg:
-                       oldsg = sg;
-                       sg = sg->next;
-                       kfree(oldsg);
-                       if (oldsg != sched_group_nodes[i])
-                               goto next_sg;
+                       cpumask_set_cpu(j, covered);
+                       cpumask_set_cpu(j, sched_group_cpus(sg));
                }
-               kfree(sched_group_nodes);
-               sched_group_nodes_bycpu[cpu] = NULL;
+
+               if (!first)
+                       first = sg;
+               if (last)
+                       last->next = sg;
+               last = sg;
        }
+       last->next = first;
+
+       return 0;
 }
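
build_sched_groups() above uses the usual first/last idiom to chain one sched_group per distinct get_group() result into a circular singly linked list, closing the ring with last->next = first. A tiny user-space sketch of that idiom, with illustrative names:

/*
 * Build a circular singly linked list with the first/last idiom, then walk
 * the ring exactly once, the way the group list is iterated.
 */
#include <stdio.h>
#include <stdlib.h>

struct group {
        int id;
        struct group *next;
};

int main(void)
{
        struct group *first = NULL, *last = NULL;

        for (int i = 0; i < 4; i++) {
                struct group *g = calloc(1, sizeof(*g));

                g->id = i;
                if (!first)
                        first = g;
                if (last)
                        last->next = g;
                last = g;
        }
        last->next = first;     /* close the ring */

        struct group *g = first;
        do {
                printf("group %d\n", g->id);
                g = g->next;
        } while (g != first);
        return 0;
}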
-#else /* !CONFIG_NUMA */
-static void free_sched_groups(const struct cpumask *cpu_map,
-                             struct cpumask *nodemask)
-{
-}
-#endif /* CONFIG_NUMA */
 
 /*
  * Initialize sched groups cpu_power.
@@ -7149,48 +7194,19 @@ static void free_sched_groups(const struct cpumask *cpu_map,
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
-       struct sched_domain *child;
-       struct sched_group *group;
-       long power;
-       int weight;
+       struct sched_group *sg = sd->groups;
 
-       WARN_ON(!sd || !sd->groups);
+       WARN_ON(!sd || !sg);
 
-       if (cpu != group_first_cpu(sd->groups))
-               return;
-
-       sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
-
-       child = sd->child;
-
-       sd->groups->cpu_power = 0;
+       do {
+               sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+               sg = sg->next;
+       } while (sg != sd->groups);
 
-       if (!child) {
-               power = SCHED_LOAD_SCALE;
-               weight = cpumask_weight(sched_domain_span(sd));
-               /*
-                * SMT siblings share the power of a single core.
-                * Usually multiple threads get a better yield out of
-                * that one core than a single thread would have,
-                * reflect that in sd->smt_gain.
-                */
-               if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
-                       power *= sd->smt_gain;
-                       power /= weight;
-                       power >>= SCHED_LOAD_SHIFT;
-               }
-               sd->groups->cpu_power += power;
+       if (cpu != group_first_cpu(sg))
                return;
-       }
 
-       /*
-        * Add cpu_power of each child group to this groups cpu_power.
-        */
-       group = child->groups;
-       do {
-               sd->groups->cpu_power += group->cpu_power;
-               group = group->next;
-       } while (group != child->groups);
+       update_group_power(sd, cpu);
 }
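Because the group list is circular, init_sched_groups_power() has to use a do/while: the exit condition (being back at sd->groups) is already true before the first pass, so a plain while loop would visit nothing. The traversal idiom in isolation, assuming a ring with at least one node:

        struct ring_node {
                struct ring_node *next;
                unsigned int weight;
        };

        /* Visit every node of a circular list exactly once. */
        static unsigned int ring_total_weight(struct ring_node *head)
        {
                struct ring_node *n = head;
                unsigned int total = 0;

                do {
                        total += n->weight;
                        n = n->next;
                } while (n != head);

                return total;
        }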
 
 /*
@@ -7204,15 +7220,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 # define SD_INIT_NAME(sd, type)                do { } while (0)
 #endif
 
-#define        SD_INIT(sd, type)       sd_init_##type(sd)
-
-#define SD_INIT_FUNC(type)     \
-static noinline void sd_init_##type(struct sched_domain *sd)   \
-{                                                              \
-       memset(sd, 0, sizeof(*sd));                             \
-       *sd = SD_##type##_INIT;                                 \
-       sd->level = SD_LV_##type;                               \
-       SD_INIT_NAME(sd, type);                                 \
+#define SD_INIT_FUNC(type)                                             \
+static noinline struct sched_domain *                                  \
+sd_init_##type(struct sched_domain_topology_level *tl, int cpu)        \
+{                                                                      \
+       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
+       *sd = SD_##type##_INIT;                                         \
+       SD_INIT_NAME(sd, type);                                         \
+       sd->private = &tl->data;                                        \
+       return sd;                                                      \
 }
 
 SD_INIT_FUNC(CPU)
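For reference, SD_INIT_FUNC(CPU) just above now expands to roughly the following: the domain is drawn from the per-cpu storage that __sdt_alloc() sets up further below, and sd->private records which sd_data it came from so claim_allocations() can later detach it from the teardown path.

        static noinline struct sched_domain *
        sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
        {
                struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

                *sd = SD_CPU_INIT;      /* per-type template from the topology headers */
                SD_INIT_NAME(sd, CPU);  /* expands to nothing in non-debug builds,
                                           see the stub definition above */
                sd->private = &tl->data;
                return sd;
        }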
@@ -7231,13 +7247,14 @@ SD_INIT_FUNC(CPU)
 #endif
 
 static int default_relax_domain_level = -1;
+int sched_domain_level_max;
 
 static int __init setup_relax_domain_level(char *str)
 {
        unsigned long val;
 
        val = simple_strtoul(str, NULL, 0);
-       if (val < SD_LV_MAX)
+       if (val < sched_domain_level_max)
                default_relax_domain_level = val;
 
        return 1;
@@ -7265,37 +7282,20 @@ static void set_domain_attribute(struct sched_domain *sd,
        }
 }
 
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
                                 const struct cpumask *cpu_map)
 {
        switch (what) {
-       case sa_sched_groups:
-               free_sched_groups(cpu_map, d->tmpmask); /* fall through */
-               d->sched_group_nodes = NULL;
        case sa_rootdomain:
-               free_rootdomain(d->rd); /* fall through */
-       case sa_tmpmask:
-               free_cpumask_var(d->tmpmask); /* fall through */
-       case sa_send_covered:
-               free_cpumask_var(d->send_covered); /* fall through */
-       case sa_this_book_map:
-               free_cpumask_var(d->this_book_map); /* fall through */
-       case sa_this_core_map:
-               free_cpumask_var(d->this_core_map); /* fall through */
-       case sa_this_sibling_map:
-               free_cpumask_var(d->this_sibling_map); /* fall through */
-       case sa_nodemask:
-               free_cpumask_var(d->nodemask); /* fall through */
-       case sa_sched_group_nodes:
-#ifdef CONFIG_NUMA
-               kfree(d->sched_group_nodes); /* fall through */
-       case sa_notcovered:
-               free_cpumask_var(d->notcovered); /* fall through */
-       case sa_covered:
-               free_cpumask_var(d->covered); /* fall through */
-       case sa_domainspan:
-               free_cpumask_var(d->domainspan); /* fall through */
-#endif
+               if (!atomic_read(&d->rd->refcount))
+                       free_rootdomain(&d->rd->rcu); /* fall through */
+       case sa_sd:
+               free_percpu(d->sd); /* fall through */
+       case sa_sd_storage:
+               __sdt_free(cpu_map); /* fall through */
        case sa_none:
                break;
        }
@@ -7304,308 +7304,234 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
                                                   const struct cpumask *cpu_map)
 {
-#ifdef CONFIG_NUMA
-       if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
-               return sa_none;
-       if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
-               return sa_domainspan;
-       if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
-               return sa_covered;
-       /* Allocate the per-node list of sched groups */
-       d->sched_group_nodes = kcalloc(nr_node_ids,
-                                     sizeof(struct sched_group *), GFP_KERNEL);
-       if (!d->sched_group_nodes) {
-               printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return sa_notcovered;
-       }
-       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
-#endif
-       if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
-               return sa_sched_group_nodes;
-       if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
-               return sa_nodemask;
-       if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
-               return sa_this_sibling_map;
-       if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
-               return sa_this_core_map;
-       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
-               return sa_this_book_map;
-       if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
-               return sa_send_covered;
+       memset(d, 0, sizeof(*d));
+
+       if (__sdt_alloc(cpu_map))
+               return sa_sd_storage;
+       d->sd = alloc_percpu(struct sched_domain *);
+       if (!d->sd)
+               return sa_sd_storage;
        d->rd = alloc_rootdomain();
-       if (!d->rd) {
-               printk(KERN_WARNING "Cannot alloc root domain\n");
-               return sa_tmpmask;
-       }
+       if (!d->rd)
+               return sa_sd;
        return sa_rootdomain;
 }
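The allocation/teardown pair keeps the staged-rollback idiom, but with only three stages left: __visit_domain_allocation_hell() returns the deepest stage it completed, and __free_domain_allocs() falls through the switch from that stage downward. A condensed standalone sketch of the pattern (the stage and structure names here are hypothetical):

        #include <stdlib.h>

        enum toy_alloc { toy_none, toy_buf, toy_all };  /* rollback stages */

        struct toy { void *buf; void *obj; };

        static void toy_free(struct toy *t, enum toy_alloc what)
        {
                switch (what) {
                case toy_all:
                        free(t->obj);           /* fall through */
                case toy_buf:
                        free(t->buf);           /* fall through */
                case toy_none:
                        break;
                }
        }

        static enum toy_alloc toy_alloc_all(struct toy *t)
        {
                t->buf = malloc(64);
                if (!t->buf)
                        return toy_none;        /* nothing to undo */
                t->obj = malloc(128);
                if (!t->obj)
                        return toy_buf;         /* undo only the buffer */
                return toy_all;
        }

On failure the caller simply hands the returned stage back to toy_free(); build_sched_domains() below does the same thing with alloc_state at its error label.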
 
-static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
-       const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
 {
-       struct sched_domain *sd = NULL;
-#ifdef CONFIG_NUMA
-       struct sched_domain *parent;
-
-       d->sd_allnodes = 0;
-       if (cpumask_weight(cpu_map) >
-           SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
-               sd = &per_cpu(allnodes_domains, i).sd;
-               SD_INIT(sd, ALLNODES);
-               set_domain_attribute(sd, attr);
-               cpumask_copy(sched_domain_span(sd), cpu_map);
-               cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
-               d->sd_allnodes = 1;
-       }
-       parent = sd;
-
-       sd = &per_cpu(node_domains, i).sd;
-       SD_INIT(sd, NODE);
-       set_domain_attribute(sd, attr);
-       sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
-       sd->parent = parent;
-       if (parent)
-               parent->child = sd;
-       cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
-#endif
-       return sd;
-}
+       struct sd_data *sdd = sd->private;
 
-static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
-       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
-       struct sched_domain *parent, int i)
-{
-       struct sched_domain *sd;
-       sd = &per_cpu(phys_domains, i).sd;
-       SD_INIT(sd, CPU);
-       set_domain_attribute(sd, attr);
-       cpumask_copy(sched_domain_span(sd), d->nodemask);
-       sd->parent = parent;
-       if (parent)
-               parent->child = sd;
-       cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
-       return sd;
-}
+       WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+       *per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-static struct sched_domain *__build_book_sched_domain(struct s_data *d,
-       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
-       struct sched_domain *parent, int i)
-{
-       struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_BOOK
-       sd = &per_cpu(book_domains, i).sd;
-       SD_INIT(sd, BOOK);
-       set_domain_attribute(sd, attr);
-       cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
-       sd->parent = parent;
-       parent->child = sd;
-       cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
-       return sd;
-}
+       if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+               *per_cpu_ptr(sdd->sg, cpu) = NULL;
 
-static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
-       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
-       struct sched_domain *parent, int i)
-{
-       struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_MC
-       sd = &per_cpu(core_domains, i).sd;
-       SD_INIT(sd, MC);
-       set_domain_attribute(sd, attr);
-       cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
-       sd->parent = parent;
-       parent->child = sd;
-       cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
-       return sd;
+       if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+               *per_cpu_ptr(sdd->sgp, cpu) = NULL;
 }
 
-static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
-       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
-       struct sched_domain *parent, int i)
-{
-       struct sched_domain *sd = parent;
 #ifdef CONFIG_SCHED_SMT
-       sd = &per_cpu(cpu_domains, i).sd;
-       SD_INIT(sd, SIBLING);
-       set_domain_attribute(sd, attr);
-       cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
-       sd->parent = parent;
-       parent->child = sd;
-       cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
-       return sd;
+static const struct cpumask *cpu_smt_mask(int cpu)
+{
+       return topology_thread_cpumask(cpu);
 }
+#endif
 
-static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
-                              const struct cpumask *cpu_map, int cpu)
-{
-       switch (l) {
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-       case SD_LV_SIBLING: /* set up CPU (sibling) groups */
-               cpumask_and(d->this_sibling_map, cpu_map,
-                           topology_thread_cpumask(cpu));
-               if (cpu == cpumask_first(d->this_sibling_map))
-                       init_sched_build_groups(d->this_sibling_map, cpu_map,
-                                               &cpu_to_cpu_group,
-                                               d->send_covered, d->tmpmask);
-               break;
+       { sd_init_SIBLING, cpu_smt_mask, },
 #endif
 #ifdef CONFIG_SCHED_MC
-       case SD_LV_MC: /* set up multi-core groups */
-               cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
-               if (cpu == cpumask_first(d->this_core_map))
-                       init_sched_build_groups(d->this_core_map, cpu_map,
-                                               &cpu_to_core_group,
-                                               d->send_covered, d->tmpmask);
-               break;
+       { sd_init_MC, cpu_coregroup_mask, },
 #endif
 #ifdef CONFIG_SCHED_BOOK
-       case SD_LV_BOOK: /* set up book groups */
-               cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
-               if (cpu == cpumask_first(d->this_book_map))
-                       init_sched_build_groups(d->this_book_map, cpu_map,
-                                               &cpu_to_book_group,
-                                               d->send_covered, d->tmpmask);
-               break;
+       { sd_init_BOOK, cpu_book_mask, },
 #endif
-       case SD_LV_CPU: /* set up physical groups */
-               cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
-               if (!cpumask_empty(d->nodemask))
-                       init_sched_build_groups(d->nodemask, cpu_map,
-                                               &cpu_to_phys_group,
-                                               d->send_covered, d->tmpmask);
-               break;
+       { sd_init_CPU, cpu_cpu_mask, },
 #ifdef CONFIG_NUMA
-       case SD_LV_ALLNODES:
-               init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
-                                       d->send_covered, d->tmpmask);
-               break;
+       { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
+       { sd_init_ALLNODES, cpu_allnodes_mask, },
 #endif
-       default:
-               break;
+       { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
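default_topology[] is the heart of the rewrite: instead of one hand-written __build_*_sched_domain() helper per level, there is a bottom-up, NULL-terminated table of levels, each naming its sd_init_*() constructor and the cpumask function that defines its span; an architecture that wants a different stack only has to point sched_domain_topology at its own table. build_sched_domains() further below walks the table per CPU, essentially as follows (the SD_OVERLAP handling is omitted here):

        struct sched_domain_topology_level *tl;
        struct sched_domain *sd = NULL;

        for (tl = sched_domain_topology; tl->init; tl++) {
                sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
                if (cpumask_equal(cpu_map, sched_domain_span(sd)))
                        break;  /* this level already spans every requested CPU */
        }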
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+       struct sched_domain_topology_level *tl;
+       int j;
+
+       for (tl = sched_domain_topology; tl->init; tl++) {
+               struct sd_data *sdd = &tl->data;
+
+               sdd->sd = alloc_percpu(struct sched_domain *);
+               if (!sdd->sd)
+                       return -ENOMEM;
+
+               sdd->sg = alloc_percpu(struct sched_group *);
+               if (!sdd->sg)
+                       return -ENOMEM;
+
+               sdd->sgp = alloc_percpu(struct sched_group_power *);
+               if (!sdd->sgp)
+                       return -ENOMEM;
+
+               for_each_cpu(j, cpu_map) {
+                       struct sched_domain *sd;
+                       struct sched_group *sg;
+                       struct sched_group_power *sgp;
+
+                       sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+                                       GFP_KERNEL, cpu_to_node(j));
+                       if (!sd)
+                               return -ENOMEM;
+
+                       *per_cpu_ptr(sdd->sd, j) = sd;
+
+                       sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                       GFP_KERNEL, cpu_to_node(j));
+                       if (!sg)
+                               return -ENOMEM;
+
+                       *per_cpu_ptr(sdd->sg, j) = sg;
+
+                       sgp = kzalloc_node(sizeof(struct sched_group_power),
+                                       GFP_KERNEL, cpu_to_node(j));
+                       if (!sgp)
+                               return -ENOMEM;
+
+                       *per_cpu_ptr(sdd->sgp, j) = sgp;
+               }
+       }
+
+       return 0;
+}
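If any allocation above fails, __sdt_alloc() returns -ENOMEM immediately and leaves the partially filled tables for __sdt_free() (reached via __free_domain_allocs(..., sa_sd_storage, ...)) to unwind; alloc_percpu() hands back zero-filled pointer slots and kfree(NULL) is a no-op, so per-CPU slots that were never reached can be freed blindly. The per-cpu pointer-table shape, reduced to its essentials:

        /* Illustrative fragment only; error handling and the sg/sgp tables elided. */
        struct sched_domain **__percpu table;
        int cpu;

        table = alloc_percpu(struct sched_domain *);    /* zero-filled slots */
        for_each_cpu(cpu, cpu_map)
                *per_cpu_ptr(table, cpu) =
                        kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
                                     GFP_KERNEL, cpu_to_node(cpu));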
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+       struct sched_domain_topology_level *tl;
+       int j;
+
+       for (tl = sched_domain_topology; tl->init; tl++) {
+               struct sd_data *sdd = &tl->data;
+
+               for_each_cpu(j, cpu_map) {
+                       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+                       if (sd && (sd->flags & SD_OVERLAP))
+                               free_sched_groups(sd->groups, 0);
+                       kfree(*per_cpu_ptr(sdd->sd, j));
+                       kfree(*per_cpu_ptr(sdd->sg, j));
+                       kfree(*per_cpu_ptr(sdd->sgp, j));
+               }
+               free_percpu(sdd->sd);
+               free_percpu(sdd->sg);
+               free_percpu(sdd->sgp);
        }
 }
 
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+               struct s_data *d, const struct cpumask *cpu_map,
+               struct sched_domain_attr *attr, struct sched_domain *child,
+               int cpu)
+{
+       struct sched_domain *sd = tl->init(tl, cpu);
+       if (!sd)
+               return child;
+
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+       if (child) {
+               sd->level = child->level + 1;
+               sched_domain_level_max = max(sched_domain_level_max, sd->level);
+               child->parent = sd;
+       }
+       sd->child = child;
+
+       return sd;
+}
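build_sched_domain() stacks each level on top of the one below it and, as a side effect, keeps sched_domain_level_max up to date; that global is what setup_relax_domain_level() now validates against, replacing the removed compile-time SD_LV_MAX. A toy illustration of the level numbering (not scheduler code):

        struct dom { struct dom *parent, *child; int level; };

        static int level_max;

        static struct dom *stack_on(struct dom *sd, struct dom *child)
        {
                if (child) {
                        sd->level = child->level + 1;
                        if (sd->level > level_max)
                                level_max = sd->level;
                        child->parent = sd;
                }
                sd->child = child;
                return sd;      /* becomes the child of the next level up */
        }

Calling stack_on() three times, starting with a NULL child, yields levels 0, 1 and 2 and leaves level_max at 2, mirroring an SMT/MC/CPU stack on one CPU.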
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static int __build_sched_domains(const struct cpumask *cpu_map,
-                                struct sched_domain_attr *attr)
+static int build_sched_domains(const struct cpumask *cpu_map,
+                              struct sched_domain_attr *attr)
 {
        enum s_alloc alloc_state = sa_none;
-       struct s_data d;
        struct sched_domain *sd;
-       int i;
-#ifdef CONFIG_NUMA
-       d.sd_allnodes = 0;
-#endif
+       struct s_data d;
+       int i, ret = -ENOMEM;
 
        alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
        if (alloc_state != sa_rootdomain)
                goto error;
-       alloc_state = sa_sched_groups;
 
-       /*
-        * Set up domains for cpus specified by the cpu_map.
-        */
+       /* Set up domains for cpus specified by the cpu_map. */
        for_each_cpu(i, cpu_map) {
-               cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
-                           cpu_map);
+               struct sched_domain_topology_level *tl;
+
+               sd = NULL;
+               for (tl = sched_domain_topology; tl->init; tl++) {
+                       sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+                       if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+                               sd->flags |= SD_OVERLAP;
+                       if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+                               break;
+               }
 
-               sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
-               sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
-               sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
-               sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
-               sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
-       }
+               while (sd->child)
+                       sd = sd->child;
 
-       for_each_cpu(i, cpu_map) {
-               build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
-               build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
-               build_sched_groups(&d, SD_LV_MC, cpu_map, i);
+               *per_cpu_ptr(d.sd, i) = sd;
        }
 
-       /* Set up physical groups */
-       for (i = 0; i < nr_node_ids; i++)
-               build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
-
-#ifdef CONFIG_NUMA
-       /* Set up node groups */
-       if (d.sd_allnodes)
-               build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
-
-       for (i = 0; i < nr_node_ids; i++)
-               if (build_numa_sched_groups(&d, cpu_map, i))
-                       goto error;
-#endif
-
-       /* Calculate CPU power for physical packages and nodes */
-#ifdef CONFIG_SCHED_SMT
-       for_each_cpu(i, cpu_map) {
-               sd = &per_cpu(cpu_domains, i).sd;
-               init_sched_groups_power(i, sd);
-       }
-#endif
-#ifdef CONFIG_SCHED_MC
+       /* Build the groups for the domains */
        for_each_cpu(i, cpu_map) {
-               sd = &per_cpu(core_domains, i).sd;
-               init_sched_groups_power(i, sd);
-       }
-#endif
-#ifdef CONFIG_SCHED_BOOK
-       for_each_cpu(i, cpu_map) {
-               sd = &per_cpu(book_domains, i).sd;
-               init_sched_groups_power(i, sd);
-       }
-#endif
-
-       for_each_cpu(i, cpu_map) {
-               sd = &per_cpu(phys_domains, i).sd;
-               init_sched_groups_power(i, sd);
+               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+                       sd->span_weight = cpumask_weight(sched_domain_span(sd));
+                       if (sd->flags & SD_OVERLAP) {
+                               if (build_overlap_sched_groups(sd, i))
+                                       goto error;
+                       } else {
+                               if (build_sched_groups(sd, i))
+                                       goto error;
+                       }
+               }
        }
 
-#ifdef CONFIG_NUMA
-       for (i = 0; i < nr_node_ids; i++)
-               init_numa_sched_groups_power(d.sched_group_nodes[i]);
-
-       if (d.sd_allnodes) {
-               struct sched_group *sg;
+       /* Calculate CPU power for physical packages and nodes */
+       for (i = nr_cpumask_bits-1; i >= 0; i--) {
+               if (!cpumask_test_cpu(i, cpu_map))
+                       continue;
 
-               cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
-                                                               d.tmpmask);
-               init_numa_sched_groups_power(sg);
+               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+                       claim_allocations(i, sd);
+                       init_sched_groups_power(i, sd);
+               }
        }
-#endif
 
        /* Attach the domains */
+       rcu_read_lock();
        for_each_cpu(i, cpu_map) {
-#ifdef CONFIG_SCHED_SMT
-               sd = &per_cpu(cpu_domains, i).sd;
-#elif defined(CONFIG_SCHED_MC)
-               sd = &per_cpu(core_domains, i).sd;
-#elif defined(CONFIG_SCHED_BOOK)
-               sd = &per_cpu(book_domains, i).sd;
-#else
-               sd = &per_cpu(phys_domains, i).sd;
-#endif
+               sd = *per_cpu_ptr(d.sd, i);
                cpu_attach_domain(sd, d.rd, i);
        }
+       rcu_read_unlock();
 
-       d.sched_group_nodes = NULL; /* don't free this we still need it */
-       __free_domain_allocs(&d, sa_tmpmask, cpu_map);
-       return 0;
-
+       ret = 0;
 error:
        __free_domain_allocs(&d, alloc_state, cpu_map);
-       return -ENOMEM;
-}
-
-static int build_sched_domains(const struct cpumask *cpu_map)
-{
-       return __build_sched_domains(cpu_map, NULL);
+       return ret;
 }
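Note that the success path and the error path now converge: ret is set to 0 and execution falls through the error: label, so __free_domain_allocs() runs either way, with alloc_state == sa_rootdomain on success. The refcount test in the sa_rootdomain case keeps the now-attached root domain alive, and the per-cpu storage is safe to sweep only because claim_allocations() has already NULLed every slot whose domain, group or group-power object is reachable from a live sched_domain (see the comment above claim_allocations()). Restating the relevant kfree() calls in __sdt_free() with that in mind:

        kfree(*per_cpu_ptr(sdd->sd, j));   /* NULL here if the domain was claimed  */
        kfree(*per_cpu_ptr(sdd->sg, j));   /* likewise for claimed groups ...      */
        kfree(*per_cpu_ptr(sdd->sgp, j));  /* ... and their power structures       */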
 
 static cpumask_var_t *doms_cur;        /* current sched domains */
@@ -7660,7 +7586,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
  */
-static int arch_init_sched_domains(const struct cpumask *cpu_map)
+static int init_sched_domains(const struct cpumask *cpu_map)
 {
        int err;
 
@@ -7671,32 +7597,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
                doms_cur = &fallback_doms;
        cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
        dattr_cur = NULL;
-       err = build_sched_domains(doms_cur[0]);
+       err = build_sched_domains(doms_cur[0], NULL);
        register_sched_domain_sysctl();
 
        return err;
 }
 
-static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
-                                      struct cpumask *tmpmask)
-{
-       free_sched_groups(cpu_map, tmpmask);
-}
-
 /*
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
 static void detach_destroy_domains(const struct cpumask *cpu_map)
 {
-       /* Save because hotplug lock held. */
-       static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
        int i;
 
+       rcu_read_lock();
        for_each_cpu(i, cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
-       synchronize_sched();
-       arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
+       rcu_read_unlock();
 }
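detach_destroy_domains() no longer needs its static bitmap or synchronize_sched(): attaching the NULL domain happens under rcu_read_lock(), and the displaced domain and root-domain data is reclaimed through RCU callbacks instead (hence the free_rootdomain(&d->rd->rcu) call earlier in this patch). A sketch of the deferred-free side; the callback body is outside this hunk, but the &d->rd->rcu argument implies this signature:

        static void free_rootdomain(struct rcu_head *rcu)
        {
                struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

                /* ... release rd's cpumasks and cpupri state, then kfree(rd) ... */
        }

        /* elsewhere, once the last reference is dropped: */
        call_rcu(&rd->rcu, free_rootdomain);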
 
 /* handle null as "default" */
@@ -7785,8 +7703,7 @@ match1:
                                goto match2;
                }
                /* no match - add a new doms_new */
-               __build_sched_domains(doms_new[i],
-                                       dattr_new ? dattr_new + i : NULL);
+               build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
 match2:
                ;
        }
@@ -7805,7 +7722,7 @@ match2:
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void arch_reinit_sched_domains(void)
+static void reinit_sched_domains(void)
 {
        get_online_cpus();
 
@@ -7838,7 +7755,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
        else
                sched_mc_power_savings = level;
 
-       arch_reinit_sched_domains();
+       reinit_sched_domains();
 
        return count;
 }
@@ -7957,14 +7874,9 @@ void __init sched_init_smp(void)
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
-#if defined(CONFIG_NUMA)
-       sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
-                                                               GFP_KERNEL);
-       BUG_ON(sched_group_nodes_bycpu == NULL);
-#endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-       arch_init_sched_domains(cpu_active_mask);
+       init_sched_domains(cpu_active_mask);
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
        if (cpumask_empty(non_isolated_cpus))
                cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8003,18 +7915,14 @@ int in_sched_functions(unsigned long addr)
                && addr < (unsigned long)__sched_text_end);
 }
 
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
        cfs_rq->tasks_timeline = RB_ROOT;
        INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       cfs_rq->rq = rq;
-       /* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
-       cfs_rq->load_stamp = 1;
-#endif
-#endif
        cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+       cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
 }
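min_vruntime_copy only exists on 32-bit builds, where a raw 64-bit load can tear. Keeping a shadow copy lets a lockless reader detect a torn or racing read by comparing the two; the scheduler's actual reader and writer live in sched_fair.c, outside this hunk, but the idiom is roughly:

        /* Generic sketch of the two-copy idiom, not lifted from sched_fair.c. */
        struct u64_pair {
                u64 val;
                u64 copy;
        };

        static void pair_update(struct u64_pair *p, u64 newval)
        {
                p->val = newval;
                smp_wmb();              /* publish val before its copy */
                p->copy = newval;
        }

        static u64 pair_read(struct u64_pair *p)
        {
                u64 v, c;

                do {
                        c = p->copy;
                        smp_rmb();      /* read the copy before the value */
                        v = p->val;
                } while (v != c);       /* mismatch means we raced a writer: retry */

                return v;
        }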
 
 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -8030,27 +7938,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
        /* delimiter for bitsearch: */
        __set_bit(MAX_RT_PRIO, array->bitmap);
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
        rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
        rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
-       plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
+       plist_head_init(&rt_rq->pushable_tasks);
 #endif
 
        rt_rq->rt_time = 0;
        rt_rq->rt_throttled = 0;
        rt_rq->rt_runtime = 0;
        raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       rt_rq->rt_nr_boosted = 0;
-       rt_rq->rq = rq;
-#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8059,11 +7958,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                                struct sched_entity *parent)
 {
        struct rq *rq = cpu_rq(cpu);
-       tg->cfs_rq[cpu] = cfs_rq;
-       init_cfs_rq(cfs_rq, rq);
+
        cfs_rq->tg = tg;
+       cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+       /* allow initial update_cfs_load() to truncate */
+       cfs_rq->load_stamp = 1;
+#endif
 
+       tg->cfs_rq[cpu] = cfs_rq;
        tg->se[cpu] = se;
+
        /* se could be NULL for root_task_group */
        if (!se)
                return;
@@ -8086,12 +7991,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 {
        struct rq *rq = cpu_rq(cpu);
 
-       tg->rt_rq[cpu] = rt_rq;
-       init_rt_rq(rt_rq, rq);
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+       rt_rq->rt_nr_boosted = 0;
+       rt_rq->rq = rq;
        rt_rq->tg = tg;
-       rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 
+       tg->rt_rq[cpu] = rt_rq;
        tg->rt_se[cpu] = rt_se;
+
        if (!rt_se)
                return;
 
@@ -8173,7 +8080,7 @@ void __init sched_init(void)
                rq->nr_running = 0;
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
-               init_cfs_rq(&rq->cfs, rq);
+               init_cfs_rq(&rq->cfs);
                init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.shares = root_task_group_load;
@@ -8214,7 +8121,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
-               rq->cpu_power = SCHED_LOAD_SCALE;
+               rq->cpu_power = SCHED_POWER_SCALE;
                rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
@@ -8244,7 +8151,7 @@ void __init sched_init(void)
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
-       plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
+       plist_head_init(&init_task.pi_waiters);
 #endif
 
        /*
@@ -8271,6 +8178,7 @@ void __init sched_init(void)
        /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
        zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
+       zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
 #ifdef CONFIG_NO_HZ
        zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
        alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8286,7 +8194,7 @@ void __init sched_init(void)
        scheduler_running = 1;
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
        int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8294,13 +8202,23 @@ static inline int preempt_count_equals(int preempt_offset)
        return (nested == preempt_offset);
 }
 
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+       __might_sleep_init_called = 1;
+       return 0;
+}
+early_initcall(__might_sleep_init);
+
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
-#ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
 
        if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
-           system_state != SYSTEM_RUNNING || oops_in_progress)
+           oops_in_progress)
+               return;
+       if (system_state != SYSTEM_RUNNING &&
+           (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
                return;
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
@@ -8318,7 +8236,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
        if (irqs_disabled())
                print_irqtrace_events(current);
        dump_stack();
-#endif
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
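This hunk is the point of the patch named in the subject line: previously __might_sleep() bailed out for any system_state other than SYSTEM_RUNNING, so sleeping-in-atomic bugs during driver and subsystem initialization went unreported, and the whole body was compiled away unless in_atomic was defined. Now, once the early initcall has set __might_sleep_init_called, the check is also live during SYSTEM_BOOTING. The warning fires exactly when the following condition holds (a hypothetical helper that merely restates the two early returns above):

        static int might_sleep_should_warn(int preempt_offset)
        {
                if (preempt_count_equals(preempt_offset) && !irqs_disabled())
                        return 0;       /* not actually atomic: nothing to report */
                if (oops_in_progress)
                        return 0;       /* already dying; don't pile on */
                if (system_state != SYSTEM_RUNNING &&
                    (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
                        return 0;       /* booting before the initcall, or in
                                           another non-RUNNING state */
                return 1;               /* warn, subject to the jiffies ratelimit */
        }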
@@ -8477,6 +8394,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                if (!se)
                        goto err_free_rq;
 
+               init_cfs_rq(cfs_rq);
                init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
        }
 
@@ -8504,7 +8422,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
        list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
@@ -8525,7 +8443,8 @@ static void free_rt_sched_group(struct task_group *tg)
 {
        int i;
 
-       destroy_rt_bandwidth(&tg->rt_bandwidth);
+       if (tg->rt_se)
+               destroy_rt_bandwidth(&tg->rt_bandwidth);
 
        for_each_possible_cpu(i) {
                if (tg->rt_rq)
@@ -8543,7 +8462,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct rt_rq *rt_rq;
        struct sched_rt_entity *rt_se;
-       struct rq *rq;
        int i;
 
        tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8557,8 +8475,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                        ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 
        for_each_possible_cpu(i) {
-               rq = cpu_rq(i);
-
                rt_rq = kzalloc_node(sizeof(struct rt_rq),
                                     GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
@@ -8569,6 +8485,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                if (!rt_se)
                        goto err_free_rq;
 
+               init_rt_rq(rt_rq, cpu_rq(i));
+               rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
        }
 
@@ -8710,10 +8628,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
        if (!tg->se[0])
                return -EINVAL;
 
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       else if (shares > MAX_SHARES)
-               shares = MAX_SHARES;
+       shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
 
        mutex_lock(&shares_mutex);
        if (tg->shares == shares)
@@ -9063,42 +8978,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        return 0;
 }
 
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                     struct task_struct *tsk, bool threadgroup)
-{
-       int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
-       if (retval)
-               return retval;
-       if (threadgroup) {
-               struct task_struct *c;
-               rcu_read_lock();
-               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       retval = cpu_cgroup_can_attach_task(cgrp, c);
-                       if (retval) {
-                               rcu_read_unlock();
-                               return retval;
-                       }
-               }
-               rcu_read_unlock();
-       }
-       return 0;
-}
-
 static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                 struct cgroup *old_cont, struct task_struct *tsk,
-                 bool threadgroup)
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
        sched_move_task(tsk);
-       if (threadgroup) {
-               struct task_struct *c;
-               rcu_read_lock();
-               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       sched_move_task(c);
-               }
-               rcu_read_unlock();
-       }
 }
 
 static void
@@ -9120,14 +9003,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
                                u64 shareval)
 {
-       return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+       return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
 }
 
 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 {
        struct task_group *tg = cgroup_tg(cgrp);
 
-       return (u64) tg->shares;
+       return (u64) scale_load_down(tg->shares);
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
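With the switch to scale_load()/scale_load_down(), the cgroup interface keeps exporting shares in the traditional 1024-based units while the group scheduler can store them at a higher internal resolution; the clamp against scale_load(MIN_SHARES)/scale_load(MAX_SHARES) in sched_group_set_shares() above works in the scaled units as well. Assuming scale_load() is a simple shift by the extra resolution bits (its definition is outside this hunk), the round trip looks like:

        #define MY_LOAD_RESOLUTION 10                           /* hypothetical value    */
        #define my_scale_load(w)        ((w) << MY_LOAD_RESOLUTION)
        #define my_scale_load_down(w)   ((w) >> MY_LOAD_RESOLUTION)

        unsigned long user_shares = 1024;                       /* written to cpu.shares */
        unsigned long stored   = my_scale_load(user_shares);    /* what tg->shares holds */
        unsigned long reported = my_scale_load_down(stored);    /* read back: 1024 again */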
 
@@ -9186,8 +9069,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
        .name           = "cpu",
        .create         = cpu_cgroup_create,
        .destroy        = cpu_cgroup_destroy,
-       .can_attach     = cpu_cgroup_can_attach,
-       .attach         = cpu_cgroup_attach,
+       .can_attach_task = cpu_cgroup_can_attach_task,
+       .attach_task    = cpu_cgroup_attach_task,
        .exit           = cpu_cgroup_exit,
        .populate       = cpu_cgroup_populate,
        .subsys_id      = cpu_cgroup_subsys_id,