Merge Christoph's freeze cleanup patch
[linux-2.6.git] / kernel / sched.c
index 6fa9ea4..a07cff9 100644 (file)
 #define SCALE_PRIO(x, prio) \
        max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
 
-static inline unsigned int task_timeslice(task_t *p)
+static unsigned int task_timeslice(task_t *p)
 {
        if (p->static_prio < NICE_TO_PRIO(0))
                return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -206,7 +206,7 @@ struct runqueue {
         */
        unsigned long nr_running;
 #ifdef CONFIG_SMP
-       unsigned long cpu_load;
+       unsigned long cpu_load[3];
 #endif
        unsigned long long nr_switches;
 
@@ -260,22 +260,86 @@ struct runqueue {
 
 static DEFINE_PER_CPU(struct runqueue, runqueues);
 
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
 #define for_each_domain(cpu, domain) \
-       for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
 
 #define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
 #define this_rq()              (&__get_cpu_var(runqueues))
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 
-/*
- * Default context-switch locking:
- */
 #ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next) do { } while (0)
-# define finish_arch_switch(rq, next)  spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)           ((rq)->curr == (p))
+# define prepare_arch_switch(next)     do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)      do { } while (0)
+#endif
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+       return rq->curr == p;
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+       spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+       return p->oncpu;
+#else
+       return rq->curr == p;
+#endif
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+       /*
+        * We can optimise this out completely for !SMP, because the
+        * SMP rebalancing from interrupt is the only thing that cares
+        * here.
+        */
+       next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       spin_unlock_irq(&rq->lock);
+#else
+       spin_unlock(&rq->lock);
+#endif
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+       /*
+        * After ->oncpu is cleared, the task can be moved to a different CPU.
+        * We must ensure this doesn't happen until the switch is completely
+        * finished.
+        */
+       smp_wmb();
+       prev->oncpu = 0;
 #endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 11
+#define SCHEDSTAT_VERSION 12
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 
 #ifdef CONFIG_SMP
                /* domain-specific stats */
+               preempt_disable();
                for_each_domain(cpu, sd) {
                        enum idle_type itype;
                        char mask_str[NR_CPUS];
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
                                    sd->lb_nobusyq[itype],
                                    sd->lb_nobusyg[itype]);
                        }
-                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
                            sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
-                           sd->sbe_pushed, sd->sbe_attempts,
+                           sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+                           sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
                            sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
                }
+               preempt_enable();
 #endif
        }
        return 0;
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void)
        return rq;
 }
 
-#ifdef CONFIG_SCHED_SMT
-static int cpu_and_siblings_are_idle(int cpu)
-{
-       int sib;
-       for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
-               if (idle_cpu(sib))
-                       continue;
-               return 0;
-       }
-
-       return 1;
-}
-#else
-#define cpu_and_siblings_are_idle(A) idle_cpu(A)
-#endif
-
 #ifdef CONFIG_SCHEDSTATS
 /*
  * Called when a process is dequeued from the active array and given
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
        rq->nr_running++;
 }
 
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
 {
        /* Caller must always ensure 'now >= p->timestamp' */
        unsigned long long __sleep_time = now - p->timestamp;
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
                }
        }
 
-       p->prio = effective_prio(p);
+       return effective_prio(p);
 }
 
 /*
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
        }
 #endif
 
-       recalc_task_prio(p, now);
+       p->prio = recalc_task_prio(p, now);
 
        /*
         * This checks to make sure it's not an uninterruptible task
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p)
 }
 
 #ifdef CONFIG_SMP
-enum request_type {
-       REQ_MOVE_TASK,
-       REQ_SET_DOMAIN,
-};
-
 typedef struct {
        struct list_head list;
-       enum request_type type;
 
-       /* For REQ_MOVE_TASK */
        task_t *task;
        int dest_cpu;
 
-       /* For REQ_SET_DOMAIN */
-       struct sched_domain *sd;
-
        struct completion done;
 } migration_req_t;
 
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
        }
 
        init_completion(&req->done);
-       req->type = REQ_MOVE_TASK;
        req->task = p;
        req->dest_cpu = dest_cpu;
        list_add(&req->list, &rq->migration_queue);
@@ -886,26 +926,154 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+       if (type == 0)
+               return load_now;
 
-       return min(rq->cpu_load, load_now);
+       return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+       if (type == 0)
+               return load_now;
 
-       return max(rq->cpu_load, load_now);
+       return max(rq->cpu_load[type-1], load_now);
 }
 
-#endif
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+       struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+       unsigned long min_load = ULONG_MAX, this_load = 0;
+       int load_idx = sd->forkexec_idx;
+       int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+       do {
+               unsigned long load, avg_load;
+               int local_group;
+               int i;
+
+               local_group = cpu_isset(this_cpu, group->cpumask);
+               /* XXX: put a cpus allowed check */
+
+               /* Tally up the load of all CPUs in the group */
+               avg_load = 0;
+
+               for_each_cpu_mask(i, group->cpumask) {
+                       /* Bias balancing toward cpus of our domain */
+                       if (local_group)
+                               load = source_load(i, load_idx);
+                       else
+                               load = target_load(i, load_idx);
+
+                       avg_load += load;
+               }
+
+               /* Adjust by relative CPU power of the group */
+               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+               if (local_group) {
+                       this_load = avg_load;
+                       this = group;
+               } else if (avg_load < min_load) {
+                       min_load = avg_load;
+                       idlest = group;
+               }
+               group = group->next;
+       } while (group != sd->groups);
+
+       if (!idlest || 100*this_load < imbalance*min_load)
+               return NULL;
+       return idlest;
+}
+
+/*
+ * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ */
+static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+{
+       unsigned long load, min_load = ULONG_MAX;
+       int idlest = -1;
+       int i;
+
+       for_each_cpu_mask(i, group->cpumask) {
+               load = source_load(i, 0);
+
+               if (load < min_load || (load == min_load && i == this_cpu)) {
+                       min_load = load;
+                       idlest = i;
+               }
+       }
+
+       return idlest;
+}
+
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+       struct task_struct *t = current;
+       struct sched_domain *tmp, *sd = NULL;
+
+       for_each_domain(cpu, tmp)
+               if (tmp->flags & flag)
+                       sd = tmp;
+
+       while (sd) {
+               cpumask_t span;
+               struct sched_group *group;
+               int new_cpu;
+               int weight;
+
+               span = sd->span;
+               group = find_idlest_group(sd, t, cpu);
+               if (!group)
+                       goto nextlevel;
+
+               new_cpu = find_idlest_cpu(group, cpu);
+               if (new_cpu == -1 || new_cpu == cpu)
+                       goto nextlevel;
+
+               /* Now try balancing at a lower domain level */
+               cpu = new_cpu;
+nextlevel:
+               sd = NULL;
+               weight = cpus_weight(span);
+               for_each_domain(cpu, tmp) {
+                       if (weight <= cpus_weight(tmp->span))
+                               break;
+                       if (tmp->flags & flag)
+                               sd = tmp;
+               }
+               /* while loop will break here if sd == NULL */
+       }
+
+       return cpu;
+}
+
+#endif /* CONFIG_SMP */
 
 /*
  * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p)
 
        for_each_domain(cpu, sd) {
                if (sd->flags & SD_WAKE_IDLE) {
-                       cpus_and(tmp, sd->span, cpu_online_map);
-                       cpus_and(tmp, tmp, p->cpus_allowed);
+                       cpus_and(tmp, sd->span, p->cpus_allowed);
                        for_each_cpu_mask(i, tmp) {
                                if (idle_cpu(i))
                                        return i;
                        }
                }
-               else break;
+               else
+                       break;
        }
        return cpu;
 }
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
        runqueue_t *rq;
 #ifdef CONFIG_SMP
        unsigned long load, this_load;
-       struct sched_domain *sd;
+       struct sched_domain *sd, *this_sd = NULL;
        int new_cpu;
 #endif
 
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
        if (unlikely(task_running(rq, p)))
                goto out_activate;
 
-#ifdef CONFIG_SCHEDSTATS
+       new_cpu = cpu;
+
        schedstat_inc(rq, ttwu_cnt);
        if (cpu == this_cpu) {
                schedstat_inc(rq, ttwu_local);
-       } else {
-               for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_wake_remote);
-                               break;
-                       }
+               goto out_set_cpu;
+       }
+
+       for_each_domain(this_cpu, sd) {
+               if (cpu_isset(cpu, sd->span)) {
+                       schedstat_inc(sd, ttwu_wake_remote);
+                       this_sd = sd;
+                       break;
                }
        }
-#endif
 
-       new_cpu = cpu;
-       if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+       if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;
 
-       load = source_load(cpu);
-       this_load = target_load(this_cpu);
-
        /*
-        * If sync wakeup then subtract the (maximum possible) effect of
-        * the currently running task from the load of the current CPU:
+        * Check for affine wakeup and passive balancing possibilities.
         */
-       if (sync)
-               this_load -= SCHED_LOAD_SCALE;
+       if (this_sd) {
+               int idx = this_sd->wake_idx;
+               unsigned int imbalance;
 
-       /* Don't pull the task off an idle CPU to a busy one */
-       if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
-               goto out_set_cpu;
+               imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
 
-       new_cpu = this_cpu; /* Wake to this CPU if we can */
+               load = source_load(cpu, idx);
+               this_load = target_load(this_cpu, idx);
 
-       /*
-        * Scan domains for affine wakeup and passive balancing
-        * possibilities.
-        */
-       for_each_domain(this_cpu, sd) {
-               unsigned int imbalance;
-               /*
-                * Start passive balancing when half the imbalance_pct
-                * limit is reached.
-                */
-               imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+               new_cpu = this_cpu; /* Wake to this CPU if we can */
 
-               if ((sd->flags & SD_WAKE_AFFINE) &&
-                               !task_hot(p, rq->timestamp_last_tick, sd)) {
+               if (this_sd->flags & SD_WAKE_AFFINE) {
+                       unsigned long tl = this_load;
                        /*
-                        * This domain has SD_WAKE_AFFINE and p is cache cold
-                        * in this domain.
+                        * If sync wakeup then subtract the (maximum possible)
+                        * effect of the currently running task from the load
+                        * of the current CPU:
                         */
-                       if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_move_affine);
+                       if (sync)
+                               tl -= SCHED_LOAD_SCALE;
+
+                       if ((tl <= load &&
+                               tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+                               100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+                               /*
+                                * This domain has SD_WAKE_AFFINE and
+                                * p is cache cold in this domain, and
+                                * there is no bad imbalance.
+                                */
+                               schedstat_inc(this_sd, ttwu_move_affine);
                                goto out_set_cpu;
                        }
-               } else if ((sd->flags & SD_WAKE_BALANCE) &&
-                               imbalance*this_load <= 100*load) {
-                       /*
-                        * This domain has SD_WAKE_BALANCE and there is
-                        * an imbalance.
-                        */
-                       if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_move_balance);
+               }
+
+               /*
+                * Start passive balancing when half the imbalance_pct
+                * limit is reached.
+                */
+               if (this_sd->flags & SD_WAKE_BALANCE) {
+                       if (imbalance*this_load <= 100*load) {
+                               schedstat_inc(this_sd, ttwu_move_balance);
                                goto out_set_cpu;
                        }
                }
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
        return try_to_wake_up(p, state, 0);
 }
 
-#ifdef CONFIG_SMP
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-                          struct sched_domain *sd);
-#endif
-
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
  */
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, int clone_flags)
 {
+       int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+       cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+       set_task_cpu(p, cpu);
+
        /*
         * We mark the process as running here, but have not actually
         * inserted it onto the runqueue yet. This guarantees that
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p)
        p->state = TASK_RUNNING;
        INIT_LIST_HEAD(&p->run_list);
        p->array = NULL;
-       spin_lock_init(&p->switch_lock);
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+       p->oncpu = 0;
+#endif
 #ifdef CONFIG_PREEMPT
-       /*
-        * During context-switch we hold precisely one spinlock, which
-        * schedule_tail drops. (in the common case it's this_rq()->lock,
-        * but it also can be p->switch_lock.) So we compensate with a count
-        * of 1. Also, we want to start with kernel preemption disabled.
-        */
+       /* Want to start with kernel preemption disabled. */
        p->thread_info->preempt_count = 1;
 #endif
        /*
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
                 * runqueue lock is not a problem.
                 */
                current->time_slice = 1;
-               preempt_disable();
                scheduler_tick();
-               local_irq_enable();
-               preempt_enable();
-       } else
-               local_irq_enable();
+       }
+       local_irq_enable();
+       put_cpu();
 }
 
 /*
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
        runqueue_t *rq, *this_rq;
 
        rq = task_rq_lock(p, &flags);
-       cpu = task_cpu(p);
-       this_cpu = smp_processor_id();
-
        BUG_ON(p->state != TASK_RUNNING);
+       this_cpu = smp_processor_id();
+       cpu = task_cpu(p);
 
        /*
         * We decrease the sleep average of forking parents
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p)
 }
 
 /**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+       prepare_lock_switch(rq, next);
+       prepare_arch_switch(next);
+}
+
+/**
  * finish_task_switch - clean up after a task-switch
  * @prev: the thread we just switched away from.
  *
- * We enter this with the runqueue still locked, and finish_arch_switch()
- * will unlock it along with doing any other architecture-specific cleanup
- * actions.
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
  *
  * Note that we may have delayed dropping an mm in context_switch(). If
  * so, we finish that here outside of the runqueue lock.  (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static inline void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
        __releases(rq->lock)
 {
-       runqueue_t *rq = this_rq();
        struct mm_struct *mm = rq->prev_mm;
        unsigned long prev_task_flags;
 
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)
         *              Manfred Spraul <manfred@colorfullife.com>
         */
        prev_task_flags = prev->flags;
-       finish_arch_switch(rq, prev);
+       finish_arch_switch(prev);
+       finish_lock_switch(rq, prev);
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_task_flags & PF_DEAD))
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)
 asmlinkage void schedule_tail(task_t *prev)
        __releases(rq->lock)
 {
-       finish_task_switch(prev);
-
+       runqueue_t *rq = this_rq();
+       finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+       /* In this case, finish_task_switch does not reenable preemption */
+       preempt_enable();
+#endif
        if (current->set_child_tid)
                put_user(current->pid, current->set_child_tid);
 }
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 }
 
 /*
- * find_idlest_cpu - find the least busy runqueue.
- */
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-                          struct sched_domain *sd)
-{
-       unsigned long load, min_load, this_load;
-       int i, min_cpu;
-       cpumask_t mask;
-
-       min_cpu = UINT_MAX;
-       min_load = ULONG_MAX;
-
-       cpus_and(mask, sd->span, p->cpus_allowed);
-
-       for_each_cpu_mask(i, mask) {
-               load = target_load(i);
-
-               if (load < min_load) {
-                       min_cpu = i;
-                       min_load = load;
-
-                       /* break out early on an idle CPU: */
-                       if (!min_load)
-                               break;
-               }
-       }
-
-       /* add +1 to account for the new task */
-       this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
-
-       /*
-        * Would with the addition of the new task to the
-        * current CPU there be an imbalance between this
-        * CPU and the idlest CPU?
-        *
-        * Use half of the balancing threshold - new-context is
-        * a good opportunity to balance.
-        */
-       if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
-               return min_cpu;
-
-       return this_cpu;
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
@@ -1571,37 +1712,16 @@ out:
 }
 
 /*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
  */
 void sched_exec(void)
 {
-       struct sched_domain *tmp, *sd = NULL;
        int new_cpu, this_cpu = get_cpu();
-
-       /* Prefer the current CPU if there's only this task running */
-       if (this_rq()->nr_running <= 1)
-               goto out;
-
-       for_each_domain(this_cpu, tmp)
-               if (tmp->flags & SD_BALANCE_EXEC)
-                       sd = tmp;
-
-       if (sd) {
-               schedstat_inc(sd, sbe_attempts);
-               new_cpu = find_idlest_cpu(current, this_cpu, sd);
-               if (new_cpu != this_cpu) {
-                       schedstat_inc(sd, sbe_pushed);
-                       put_cpu();
-                       sched_migrate_task(current, new_cpu);
-                       return;
-               }
-       }
-out:
+       new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
        put_cpu();
+       if (new_cpu != this_cpu)
+               sched_migrate_task(current, new_cpu);
 }
 
 /*
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
  */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-                    struct sched_domain *sd, enum idle_type idle)
+            struct sched_domain *sd, enum idle_type idle, int *all_pinned)
 {
        /*
         * We do not migrate tasks that are:
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
         * 2) cannot be migrated to this CPU due to cpus_allowed, or
         * 3) are cache-hot on their current CPU.
         */
-       if (task_running(rq, p))
-               return 0;
        if (!cpu_isset(this_cpu, p->cpus_allowed))
                return 0;
+       *all_pinned = 0;
+
+       if (task_running(rq, p))
+               return 0;
 
        /*
         * Aggressive migration if:
-        * 1) the [whole] cpu is idle, or
+        * 1) task is cache cold, or
         * 2) too many balance attempts have failed.
         */
 
-       if (cpu_and_siblings_are_idle(this_cpu) || \
-                       sd->nr_balance_failed > sd->cache_nice_tries)
+       if (sd->nr_balance_failed > sd->cache_nice_tries)
                return 1;
 
        if (task_hot(p, rq->timestamp_last_tick, sd))
-                       return 0;
+               return 0;
        return 1;
 }
 
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
                      unsigned long max_nr_move, struct sched_domain *sd,
-                     enum idle_type idle)
+                     enum idle_type idle, int *all_pinned)
 {
        prio_array_t *array, *dst_array;
        struct list_head *head, *curr;
-       int idx, pulled = 0;
+       int idx, pulled = 0, pinned = 0;
        task_t *tmp;
 
-       if (max_nr_move <= 0 || busiest->nr_running <= 1)
+       if (max_nr_move == 0)
                goto out;
 
+       pinned = 1;
+
        /*
         * We first consider expired tasks. Those will likely not be
         * executed in the near future, and they are most likely to
@@ -1717,7 +1840,7 @@ skip_queue:
 
        curr = curr->prev;
 
-       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
                if (curr != head)
                        goto skip_queue;
                idx++;
@@ -1746,6 +1869,9 @@ out:
         * inside pull_task().
         */
        schedstat_add(sd, lb_gained[idle], pulled);
+
+       if (all_pinned)
+               *all_pinned = pinned;
        return pulled;
 }
 
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+       int load_idx;
 
        max_load = this_load = total_load = total_pwr = 0;
+       if (idle == NOT_IDLE)
+               load_idx = sd->busy_idx;
+       else if (idle == NEWLY_IDLE)
+               load_idx = sd->newidle_idx;
+       else
+               load_idx = sd->idle_idx;
 
        do {
                unsigned long load;
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                for_each_cpu_mask(i, group->cpumask) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
-                               load = target_load(i);
+                               load = target_load(i, load_idx);
                        else
-                               load = source_load(i);
+                               load = source_load(i, load_idx);
 
                        avg_load += load;
                }
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                if (local_group) {
                        this_load = avg_load;
                        this = group;
-                       goto nextgroup;
                } else if (avg_load > max_load) {
                        max_load = avg_load;
                        busiest = group;
                }
-nextgroup:
                group = group->next;
        } while (group != sd->groups);
 
@@ -1870,15 +2001,9 @@ nextgroup:
 
        /* Get rid of the scaling factor, rounding down as we divide */
        *imbalance = *imbalance / SCHED_LOAD_SCALE;
-
        return busiest;
 
 out_balanced:
-       if (busiest && (idle == NEWLY_IDLE ||
-                       (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
-               *imbalance = 1;
-               return busiest;
-       }
 
        *imbalance = 0;
        return NULL;
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
        int i;
 
        for_each_cpu_mask(i, group->cpumask) {
-               load = source_load(i);
+               load = source_load(i, 0);
 
                if (load > max_load) {
                        max_load = load;
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
 }
 
 /*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL    512
+
+/*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
  *
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
        struct sched_group *group;
        runqueue_t *busiest;
        unsigned long imbalance;
-       int nr_moved;
+       int nr_moved, all_pinned = 0;
+       int active_balance = 0;
 
        spin_lock(&this_rq->lock);
        schedstat_inc(sd, lb_cnt[idle]);
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                goto out_balanced;
        }
 
-       /*
-        * This should be "impossible", but since load
-        * balancing is inherently racy and statistical,
-        * it could happen in theory.
-        */
-       if (unlikely(busiest == this_rq)) {
-               WARN_ON(1);
-               goto out_balanced;
-       }
+       BUG_ON(busiest == this_rq);
 
        schedstat_add(sd, lb_imbalance[idle], imbalance);
 
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 */
                double_lock_balance(this_rq, busiest);
                nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                               imbalance, sd, idle);
+                                               imbalance, sd, idle,
+                                               &all_pinned);
                spin_unlock(&busiest->lock);
+
+               /* All tasks on this runqueue were pinned by CPU affinity */
+               if (unlikely(all_pinned))
+                       goto out_balanced;
        }
+
        spin_unlock(&this_rq->lock);
 
        if (!nr_moved) {
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                sd->nr_balance_failed++;
 
                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
-                       int wake = 0;
 
                        spin_lock(&busiest->lock);
                        if (!busiest->active_balance) {
                                busiest->active_balance = 1;
                                busiest->push_cpu = this_cpu;
-                               wake = 1;
+                               active_balance = 1;
                        }
                        spin_unlock(&busiest->lock);
-                       if (wake)
+                       if (active_balance)
                                wake_up_process(busiest->migration_thread);
 
                        /*
                         * We've kicked active balancing, reset the failure
                         * counter.
                         */
-                       sd->nr_balance_failed = sd->cache_nice_tries;
+                       sd->nr_balance_failed = sd->cache_nice_tries+1;
                }
-
-               /*
-                * We were unbalanced, but unsuccessful in move_tasks(),
-                * so bump the balance_interval to lessen the lock contention.
-                */
-               if (sd->balance_interval < sd->max_interval)
-                       sd->balance_interval++;
-       } else {
+       } else
                sd->nr_balance_failed = 0;
 
+       if (likely(!active_balance)) {
                /* We were unbalanced, so reset the balancing interval */
                sd->balance_interval = sd->min_interval;
+       } else {
+               /*
+                * If we've begun active balancing, start to back off. This
+                * case may not be covered by the all_pinned logic if there
+                * is only 1 task on the busy runqueue (because we don't call
+                * move_tasks).
+                */
+               if (sd->balance_interval < sd->max_interval)
+                       sd->balance_interval *= 2;
        }
 
        return nr_moved;
@@ -2005,8 +2137,10 @@ out_balanced:
 
        schedstat_inc(sd, lb_balanced[idle]);
 
+       sd->nr_balance_failed = 0;
        /* tune up the balancing interval */
-       if (sd->balance_interval < sd->max_interval)
+       if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+                       (sd->balance_interval < sd->max_interval))
                sd->balance_interval *= 2;
 
        return 0;
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
        if (!group) {
-               schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
-               goto out;
+               goto out_balanced;
        }
 
        busiest = find_busiest_queue(group);
-       if (!busiest || busiest == this_rq) {
-               schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+       if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
-               goto out;
+               goto out_balanced;
        }
 
+       BUG_ON(busiest == this_rq);
+
        /* Attempt to move tasks */
        double_lock_balance(this_rq, busiest);
 
        schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
        nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                       imbalance, sd, NEWLY_IDLE);
+                                       imbalance, sd, NEWLY_IDLE, NULL);
        if (!nr_moved)
                schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+       else
+               sd->nr_balance_failed = 0;
 
        spin_unlock(&busiest->lock);
-
-out:
        return nr_moved;
+
+out_balanced:
+       schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+       sd->nr_balance_failed = 0;
+       return 0;
 }
 
 /*
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 {
        struct sched_domain *sd;
-       struct sched_group *cpu_group;
        runqueue_t *target_rq;
-       cpumask_t visited_cpus;
-       int cpu;
+       int target_cpu = busiest_rq->push_cpu;
+
+       if (busiest_rq->nr_running <= 1)
+               /* no task to move */
+               return;
+
+       target_rq = cpu_rq(target_cpu);
 
        /*
-        * Search for suitable CPUs to push tasks to in successively higher
-        * domains with SD_LOAD_BALANCE set.
+        * This condition is "impossible", if it occurs
+        * we need to fix it.  Originally reported by
+        * Bjorn Helgaas on a 128-cpu setup.
         */
-       visited_cpus = CPU_MASK_NONE;
-       for_each_domain(busiest_cpu, sd) {
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       /* no more domains to search */
-                       break;
+       BUG_ON(busiest_rq == target_rq);
 
-               schedstat_inc(sd, alb_cnt);
+       /* move a task from busiest_rq to target_rq */
+       double_lock_balance(busiest_rq, target_rq);
 
-               cpu_group = sd->groups;
-               do {
-                       for_each_cpu_mask(cpu, cpu_group->cpumask) {
-                               if (busiest_rq->nr_running <= 1)
-                                       /* no more tasks left to move */
-                                       return;
-                               if (cpu_isset(cpu, visited_cpus))
-                                       continue;
-                               cpu_set(cpu, visited_cpus);
-                               if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
-                                       continue;
-
-                               target_rq = cpu_rq(cpu);
-                               /*
-                                * This condition is "impossible", if it occurs
-                                * we need to fix it.  Originally reported by
-                                * Bjorn Helgaas on a 128-cpu setup.
-                                */
-                               BUG_ON(busiest_rq == target_rq);
-
-                               /* move a task from busiest_rq to target_rq */
-                               double_lock_balance(busiest_rq, target_rq);
-                               if (move_tasks(target_rq, cpu, busiest_rq,
-                                               1, sd, SCHED_IDLE)) {
-                                       schedstat_inc(sd, alb_pushed);
-                               } else {
-                                       schedstat_inc(sd, alb_failed);
-                               }
-                               spin_unlock(&target_rq->lock);
-                       }
-                       cpu_group = cpu_group->next;
-               } while (cpu_group != sd->groups);
-       }
+       /* Search for an sd spanning us and the target CPU. */
+       for_each_domain(target_cpu, sd)
+               if ((sd->flags & SD_LOAD_BALANCE) &&
+                       cpu_isset(busiest_cpu, sd->span))
+                               break;
+
+       if (unlikely(sd == NULL))
+               goto out;
+
+       schedstat_inc(sd, alb_cnt);
+
+       if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+               schedstat_inc(sd, alb_pushed);
+       else
+               schedstat_inc(sd, alb_failed);
+out:
+       spin_unlock(&target_rq->lock);
 }
 
 /*
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
        unsigned long old_load, this_load;
        unsigned long j = jiffies + CPU_OFFSET(this_cpu);
        struct sched_domain *sd;
+       int i;
 
-       /* Update our load */
-       old_load = this_rq->cpu_load;
        this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-       /*
-        * Round up the averaging division if load is increasing. This
-        * prevents us from getting stuck on 9 if the load is 10, for
-        * example.
-        */
-       if (this_load > old_load)
-               old_load++;
-       this_rq->cpu_load = (old_load + this_load) / 2;
+       /* Update our load */
+       for (i = 0; i < 3; i++) {
+               unsigned long new_load = this_load;
+               int scale = 1 << i;
+               old_load = this_rq->cpu_load[i];
+               /*
+                * Round up the averaging division if load is increasing. This
+                * prevents us from getting stuck on 9 if the load is 10, for
+                * example.
+                */
+               if (new_load > old_load)
+                       new_load += scale-1;
+               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+       }
 
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
@@ -2447,11 +2577,15 @@ out:
 #ifdef CONFIG_SCHED_SMT
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
-       struct sched_domain *sd = this_rq->sd;
+       struct sched_domain *tmp, *sd = NULL;
        cpumask_t sibling_map;
        int i;
 
-       if (!(sd->flags & SD_SHARE_CPUPOWER))
+       for_each_domain(this_cpu, tmp)
+               if (tmp->flags & SD_SHARE_CPUPOWER)
+                       sd = tmp;
+
+       if (!sd)
                return;
 
        /*
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 
 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
-       struct sched_domain *sd = this_rq->sd;
+       struct sched_domain *tmp, *sd = NULL;
        cpumask_t sibling_map;
        prio_array_t *array;
        int ret = 0, i;
        task_t *p;
 
-       if (!(sd->flags & SD_SHARE_CPUPOWER))
+       for_each_domain(this_cpu, tmp)
+               if (tmp->flags & SD_SHARE_CPUPOWER)
+                       sd = tmp;
+
+       if (!sd)
                return 0;
 
        /*
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void)
        struct list_head *queue;
        unsigned long long now;
        unsigned long run_time;
-       int cpu, idx;
+       int cpu, idx, new_prio;
 
        /*
         * Test if we are atomic.  Since do_exit() needs to call into
@@ -2735,9 +2873,14 @@ go_idle:
                        delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
                array = next->array;
-               dequeue_task(next, array);
-               recalc_task_prio(next, next->timestamp + delta);
-               enqueue_task(next, array);
+               new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+               if (unlikely(next->prio != new_prio)) {
+                       dequeue_task(next, array);
+                       next->prio = new_prio;
+                       enqueue_task(next, array);
+               } else
+                       requeue_task(next, array);
        }
        next->activated = 0;
 switch_tasks:
@@ -2761,11 +2904,15 @@ switch_tasks:
                rq->curr = next;
                ++*switch_count;
 
-               prepare_arch_switch(rq, next);
+               prepare_task_switch(rq, next);
                prev = context_switch(rq, prev, next);
                barrier();
-
-               finish_task_switch(prev);
+               /*
+                * this_rq must be evaluated again because prev may have moved
+                * CPUs since it called schedule(), thus the 'rq' on its stack
+                * frame will be invalid.
+                */
+               finish_task_switch(this_rq(), prev);
        } else
                spin_unlock_irq(&rq->lock);
 
@@ -3384,13 +3531,24 @@ recheck:
        if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
                return -EINVAL;
 
-       if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
-           param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
-           !capable(CAP_SYS_NICE))
-               return -EPERM;
-       if ((current->euid != p->euid) && (current->euid != p->uid) &&
-           !capable(CAP_SYS_NICE))
-               return -EPERM;
+       /*
+        * Allow unprivileged RT tasks to decrease priority:
+        */
+       if (!capable(CAP_SYS_NICE)) {
+               /* can't change policy */
+               if (policy != p->policy)
+                       return -EPERM;
+               /* can't increase priority */
+               if (policy != SCHED_NORMAL &&
+                   param->sched_priority > p->rt_priority &&
+                   param->sched_priority >
+                               p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+                       return -EPERM;
+               /* can't change other user's priorities */
+               if ((current->euid != p->euid) &&
+                   (current->euid != p->uid))
+                       return -EPERM;
+       }
 
        retval = security_task_setscheduler(p, policy, param);
        if (retval)
@@ -4030,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu)
 
        spin_lock_irqsave(&rq->lock, flags);
        rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+       idle->oncpu = 1;
+#endif
        set_tsk_need_resched(idle);
        spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -4199,17 +4360,9 @@ static int migration_thread(void * data)
                req = list_entry(head->next, migration_req_t, list);
                list_del_init(head->next);
 
-               if (req->type == REQ_MOVE_TASK) {
-                       spin_unlock(&rq->lock);
-                       __migrate_task(req->task, cpu, req->dest_cpu);
-                       local_irq_enable();
-               } else if (req->type == REQ_SET_DOMAIN) {
-                       rq->sd = req->sd;
-                       spin_unlock_irq(&rq->lock);
-               } else {
-                       spin_unlock_irq(&rq->lock);
-                       WARN_ON(1);
-               }
+               spin_unlock(&rq->lock);
+               __migrate_task(req->task, cpu, req->dest_cpu);
+               local_irq_enable();
 
                complete(&req->done);
        }
@@ -4440,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
                        migration_req_t *req;
                        req = list_entry(rq->migration_queue.next,
                                         migration_req_t, list);
-                       BUG_ON(req->type != REQ_MOVE_TASK);
                        list_del_init(&req->list);
                        complete(&req->done);
                }
@@ -4471,12 +4623,17 @@ int __init migration_init(void)
 #endif
 
 #ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
 #ifdef SCHED_DOMAIN_DEBUG
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
        int level = 0;
 
+       if (!sd) {
+               printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+               return;
+       }
+
        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 
        do {
@@ -4559,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 #define sched_domain_debug(sd, cpu) {}
 #endif
 
+static int sd_degenerate(struct sched_domain *sd)
+{
+       if (cpus_weight(sd->span) == 1)
+               return 1;
+
+       /* Following flags need at least 2 groups */
+       if (sd->flags & (SD_LOAD_BALANCE |
+                        SD_BALANCE_NEWIDLE |
+                        SD_BALANCE_FORK |
+                        SD_BALANCE_EXEC)) {
+               if (sd->groups != sd->groups->next)
+                       return 0;
+       }
+
+       /* Following flags don't use groups */
+       if (sd->flags & (SD_WAKE_IDLE |
+                        SD_WAKE_AFFINE |
+                        SD_WAKE_BALANCE))
+               return 0;
+
+       return 1;
+}
+
+static int sd_parent_degenerate(struct sched_domain *sd,
+                                               struct sched_domain *parent)
+{
+       unsigned long cflags = sd->flags, pflags = parent->flags;
+
+       if (sd_degenerate(parent))
+               return 1;
+
+       if (!cpus_equal(sd->span, parent->span))
+               return 0;
+
+       /* Does parent contain flags not in child? */
+       /* WAKE_BALANCE is a subset of WAKE_AFFINE */
+       if (cflags & SD_WAKE_AFFINE)
+               pflags &= ~SD_WAKE_BALANCE;
+       /* Flags needing groups don't count if only 1 group in parent */
+       if (parent->groups == parent->groups->next) {
+               pflags &= ~(SD_LOAD_BALANCE |
+                               SD_BALANCE_NEWIDLE |
+                               SD_BALANCE_FORK |
+                               SD_BALANCE_EXEC);
+       }
+       if (~cflags & pflags)
+               return 0;
+
+       return 1;
+}
+
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
  */
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
-       migration_req_t req;
-       unsigned long flags;
        runqueue_t *rq = cpu_rq(cpu);
-       int local = 1;
-
-       sched_domain_debug(sd, cpu);
+       struct sched_domain *tmp;
 
-       spin_lock_irqsave(&rq->lock, flags);
-
-       if (cpu == smp_processor_id() || !cpu_online(cpu)) {
-               rq->sd = sd;
-       } else {
-               init_completion(&req.done);
-               req.type = REQ_SET_DOMAIN;
-               req.sd = sd;
-               list_add(&req.list, &rq->migration_queue);
-               local = 0;
+       /* Remove the sched domains which do not contribute to scheduling. */
+       for (tmp = sd; tmp; tmp = tmp->parent) {
+               struct sched_domain *parent = tmp->parent;
+               if (!parent)
+                       break;
+               if (sd_parent_degenerate(tmp, parent))
+                       tmp->parent = parent->parent;
        }
 
-       spin_unlock_irqrestore(&rq->lock, flags);
+       if (sd && sd_degenerate(sd))
+               sd = sd->parent;
 
-       if (!local) {
-               wake_up_process(rq->migration_thread);
-               wait_for_completion(&req.done);
-       }
+       sched_domain_debug(sd, cpu);
+
+       rcu_assign_pointer(rq->sd, sd);
 }
 
 /* cpus with isolated domains */
@@ -4621,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void __devinit init_sched_build_groups(struct sched_group groups[],
+void init_sched_build_groups(struct sched_group groups[],
                        cpumask_t span, int (*group_fn)(int cpu))
 {
        struct sched_group *first = NULL, *last = NULL;
@@ -4657,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
 
 
 #ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
-extern void __devinit arch_destroy_sched_domains(void);
+extern void build_sched_domains(const cpumask_t *cpu_map);
+extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
 #else
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
 {
        return cpu;
 }
@@ -4671,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
 {
 #ifdef CONFIG_SCHED_SMT
        return first_cpu(cpu_sibling_map[cpu]);
@@ -4684,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu)
 
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
+static int cpu_to_node_group(int cpu)
 {
        return cpu_to_node(cpu);
 }
@@ -4715,39 +4917,28 @@ static void check_sibling_maps(void)
 #endif
 
 /*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
  */
-static void __devinit arch_init_sched_domains(void)
+static void build_sched_domains(const cpumask_t *cpu_map)
 {
        int i;
-       cpumask_t cpu_default_map;
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-       check_sibling_maps();
-#endif
-       /*
-        * Setup mask for cpus without special case scheduling requirements.
-        * For now this just excludes isolated cpus, but could be used to
-        * exclude other special cases in the future.
-        */
-       cpus_complement(cpu_default_map, cpu_isolated_map);
-       cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
 
        /*
-        * Set up domains. Isolated domains just stay on the dummy domain.
+        * Set up domains for cpus specified by the cpu_map.
         */
-       for_each_cpu_mask(i, cpu_default_map) {
+       for_each_cpu_mask(i, *cpu_map) {
                int group;
                struct sched_domain *sd = NULL, *p;
                cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
 
-               cpus_and(nodemask, nodemask, cpu_default_map);
+               cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
                sd = &per_cpu(node_domains, i);
                group = cpu_to_node_group(i);
                *sd = SD_NODE_INIT;
-               sd->span = cpu_default_map;
+               sd->span = *cpu_map;
                sd->groups = &sched_group_nodes[group];
 #endif
 
@@ -4765,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void)
                group = cpu_to_cpu_group(i);
                *sd = SD_SIBLING_INIT;
                sd->span = cpu_sibling_map[i];
-               cpus_and(sd->span, sd->span, cpu_default_map);
+               cpus_and(sd->span, sd->span, *cpu_map);
                sd->parent = p;
                sd->groups = &sched_group_cpus[group];
 #endif
@@ -4775,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void)
        /* Set up CPU (sibling) groups */
        for_each_online_cpu(i) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
-               cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+               cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
                if (i != first_cpu(this_sibling_map))
                        continue;
 
@@ -4788,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void)
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);
 
-               cpus_and(nodemask, nodemask, cpu_default_map);
+               cpus_and(nodemask, nodemask, *cpu_map);
                if (cpus_empty(nodemask))
                        continue;
 
@@ -4798,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void)
 
 #ifdef CONFIG_NUMA
        /* Set up node groups */
-       init_sched_build_groups(sched_group_nodes, cpu_default_map,
+       init_sched_build_groups(sched_group_nodes, *cpu_map,
                                        &cpu_to_node_group);
 #endif
 
        /* Calculate CPU power for physical packages and nodes */
-       for_each_cpu_mask(i, cpu_default_map) {
+       for_each_cpu_mask(i, *cpu_map) {
                int power;
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
@@ -4827,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void)
        }
 
        /* Attach the domains */
-       for_each_online_cpu(i) {
+       for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
@@ -4837,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void)
                cpu_attach_domain(sd, i);
        }
 }
+/*
+ * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(cpumask_t *cpu_map)
+{
+       cpumask_t cpu_default_map;
+
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+       check_sibling_maps();
+#endif
+       /*
+        * Setup mask for cpus without special case scheduling requirements.
+        * For now this just excludes isolated cpus, but could be used to
+        * exclude other special cases in the future.
+        */
+       cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+       build_sched_domains(&cpu_default_map);
+}
 
-#ifdef CONFIG_HOTPLUG_CPU
-static void __devinit arch_destroy_sched_domains(void)
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
        /* Do nothing: everything is statically allocated. */
 }
-#endif
 
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
 /*
- * Initial dummy domain for early boot and for hotplug cpu. Being static,
- * it is initialized to zero, so all balancing flags are cleared which is
- * what we want.
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
  */
-static struct sched_domain sched_domain_dummy;
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+       int i;
+
+       for_each_cpu_mask(i, *cpu_map)
+               cpu_attach_domain(NULL, i);
+       synchronize_sched();
+       arch_destroy_sched_domains(cpu_map);
+}
+
+/*
+ * Partition sched domains as specified by the cpumasks below.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for a RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+       cpumask_t change_map;
+
+       cpus_and(*partition1, *partition1, cpu_online_map);
+       cpus_and(*partition2, *partition2, cpu_online_map);
+       cpus_or(change_map, *partition1, *partition2);
+
+       /* Detach sched domains from all of the affected cpus */
+       detach_destroy_domains(&change_map);
+       if (!cpus_empty(*partition1))
+               build_sched_domains(partition1);
+       if (!cpus_empty(*partition2))
+               build_sched_domains(partition2);
+}
 
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
  * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to a "dummy" domain
+ * code, so we temporarily attach all running cpus to the NULL domain
  * which will prevent rebalancing while the sched domains are recalculated.
  */
 static int update_sched_domains(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
 {
-       int i;
-
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_DOWN_PREPARE:
-               for_each_online_cpu(i)
-                       cpu_attach_domain(&sched_domain_dummy, i);
-               arch_destroy_sched_domains();
+               detach_destroy_domains(&cpu_online_map);
                return NOTIFY_OK;
 
        case CPU_UP_CANCELED:
@@ -4887,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb,
        }
 
        /* The hotplug lock is already held by cpu_up/cpu_down */
-       arch_init_sched_domains();
+       arch_init_sched_domains(&cpu_online_map);
 
        return NOTIFY_OK;
 }
@@ -4896,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 void __init sched_init_smp(void)
 {
        lock_cpu_hotplug();
-       arch_init_sched_domains();
+       arch_init_sched_domains(&cpu_online_map);
        unlock_cpu_hotplug();
        /* XXX: Theoretical race here - CPU may be hotplugged now */
        hotcpu_notifier(update_sched_domains, 0);
@@ -4926,13 +5161,15 @@ void __init sched_init(void)
 
                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
+               rq->nr_running = 0;
                rq->active = rq->arrays;
                rq->expired = rq->arrays + 1;
                rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
-               rq->sd = &sched_domain_dummy;
-               rq->cpu_load = 0;
+               rq->sd = NULL;
+               for (j = 1; j < 3; j++)
+                       rq->cpu_load[j] = 0;
                rq->active_balance = 0;
                rq->push_cpu = 0;
                rq->migration_thread = NULL;