Merge branch 'ipi-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip...
Linus Torvalds [Sat, 4 Apr 2009 00:33:30 +0000 (17:33 -0700)]
* 'ipi-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  s390: remove arch specific smp_send_stop()
  panic: clean up kernel/panic.c
  panic, smp: provide smp_send_stop() wrapper on UP too
  panic: decrease oops_in_progress only after having done the panic
  generic-ipi: eliminate WARN_ON()s during oops/panic
  generic-ipi: cleanups
  generic-ipi: remove CSD_FLAG_WAIT
  generic-ipi: remove kmalloc()
  generic IPI: simplify barriers and locking

1  2 
arch/s390/include/asm/smp.h
kernel/sched.c
kernel/softirq.c

@@@ -50,7 -50,12 +50,7 @@@ extern void machine_power_off_smp(void)
   
  #define PROC_CHANGE_PENALTY   20              /* Schedule penalty */
  
 -#define raw_smp_processor_id()        (S390_lowcore.cpu_data.cpu_nr)
 -
 -static inline __u16 hard_smp_processor_id(void)
 -{
 -      return stap();
 -}
 +#define raw_smp_processor_id()        (S390_lowcore.cpu_nr)
  
  /*
   * returns 1 if cpu is in stopped/check stopped state or not operational
@@@ -92,12 -97,6 +92,6 @@@ extern void arch_send_call_function_ipi
  #endif
  
  #ifndef CONFIG_SMP
- static inline void smp_send_stop(void)
- {
-       /* Disable all interrupts/machine checks */
-       __load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK);
- }
  #define hard_smp_processor_id()               0
  #define smp_cpu_not_running(cpu)      1
  #endif
diff --combined kernel/sched.c
@@@ -331,13 -331,6 +331,13 @@@ static DEFINE_PER_CPU(struct rt_rq, ini
   */
  static DEFINE_SPINLOCK(task_group_lock);
  
 +#ifdef CONFIG_SMP
 +static int root_task_group_empty(void)
 +{
 +      return list_empty(&root_task_group.children);
 +}
 +#endif
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_USER_SCHED
  # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@@ -398,13 -391,6 +398,13 @@@ static inline void set_task_rq(struct t
  
  #else
  
 +#ifdef CONFIG_SMP
 +static int root_task_group_empty(void)
 +{
 +      return 1;
 +}
 +#endif
 +
  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
  static inline struct task_group *task_group(struct task_struct *p)
  {
@@@ -481,17 -467,11 +481,17 @@@ struct rt_rq 
        struct rt_prio_array active;
        unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 -      int highest_prio; /* highest queued rt task prio */
 +      struct {
 +              int curr; /* highest queued rt task prio */
 +#ifdef CONFIG_SMP
 +              int next; /* next highest */
 +#endif
 +      } highest_prio;
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
 +      struct plist_head pushable_tasks;
  #endif
        int rt_throttled;
        u64 rt_time;
@@@ -569,6 -549,7 +569,6 @@@ struct rq 
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 -      unsigned char idle_at_tick;
  #ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
        struct root_domain *rd;
        struct sched_domain *sd;
  
 +      unsigned char idle_at_tick;
        /* For active balancing */
        int active_balance;
        int push_cpu;
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
 -      unsigned int yld_exp_empty;
 -      unsigned int yld_act_empty;
 -      unsigned int yld_both_empty;
        unsigned int yld_count;
  
        /* schedule() stats */
@@@ -1110,7 -1093,7 +1110,7 @@@ static void hrtick_start(struct rq *rq
        if (rq == this_rq()) {
                hrtimer_restart(timer);
        } else if (!rq->hrtick_csd_pending) {
-               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                rq->hrtick_csd_pending = 1;
        }
  }
@@@ -1200,10 -1183,10 +1200,10 @@@ static void resched_task(struct task_st
  
        assert_spin_locked(&task_rq(p)->lock);
  
 -      if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 +      if (test_tsk_need_resched(p))
                return;
  
 -      set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 +      set_tsk_need_resched(p);
  
        cpu = task_cpu(p);
        if (cpu == smp_processor_id())
@@@ -1259,7 -1242,7 +1259,7 @@@ void wake_up_idle_cpu(int cpu
         * lockless. The worst case is that the other CPU runs the
         * idle task through an additional NOOP schedule()
         */
 -      set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
 +      set_tsk_need_resched(rq->idle);
  
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
@@@ -1627,42 -1610,21 +1627,42 @@@ static inline void update_shares_locked
  
  #endif
  
 +#ifdef CONFIG_PREEMPT
 +
  /*
 - * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 + * fair double_lock_balance: Safely acquires both rq->locks in a fair
 + * way at the expense of forcing extra atomic operations in all
 + * invocations.  This assures that the double_lock is acquired using the
 + * same underlying policy as the spinlock_t on this architecture, which
 + * reduces latency compared to the unfair variant below.  However, it
 + * also adds more overhead and therefore may reduce throughput.
   */
 -static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +      __releases(this_rq->lock)
 +      __acquires(busiest->lock)
 +      __acquires(this_rq->lock)
 +{
 +      spin_unlock(&this_rq->lock);
 +      double_rq_lock(this_rq, busiest);
 +
 +      return 1;
 +}
 +
 +#else
 +/*
 + * Unfair double_lock_balance: Optimizes throughput at the expense of
 + * latency by eliminating extra atomic operations when the locks are
 + * already in proper order on entry.  This favors lower cpu-ids and will
 + * grant the double lock to lower cpus over higher ids under contention,
 + * regardless of entry order into the function.
 + */
 +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
  {
        int ret = 0;
  
 -      if (unlikely(!irqs_disabled())) {
 -              /* printk() doesn't work good under rq->lock */
 -              spin_unlock(&this_rq->lock);
 -              BUG_ON(1);
 -      }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
        return ret;
  }
  
 +#endif /* CONFIG_PREEMPT */
 +
 +/*
 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 + */
 +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +{
 +      if (unlikely(!irqs_disabled())) {
 +              /* printk() doesn't work well under rq->lock */
 +              spin_unlock(&this_rq->lock);
 +              BUG_ON(1);
 +      }
 +
 +      return _double_lock_balance(this_rq, busiest);
 +}
 +
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
  {
@@@ -1759,9 -1705,6 +1759,9 @@@ static void update_avg(u64 *avg, u64 sa
  
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
 +      if (wakeup)
 +              p->se.start_runtime = p->se.sum_exec_runtime;
 +
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, wakeup);
        p->se.on_rq = 1;
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
 -      if (sleep && p->se.last_wakeup) {
 -              update_avg(&p->se.avg_overlap,
 -                         p->se.sum_exec_runtime - p->se.last_wakeup);
 -              p->se.last_wakeup = 0;
 +      if (sleep) {
 +              if (p->se.last_wakeup) {
 +                      update_avg(&p->se.avg_overlap,
 +                              p->se.sum_exec_runtime - p->se.last_wakeup);
 +                      p->se.last_wakeup = 0;
 +              } else {
 +                      update_avg(&p->se.avg_wakeup,
 +                              sysctl_sched_wakeup_granularity);
 +              }
        }
  
        sched_info_dequeued(p);
@@@ -2079,7 -2017,7 +2079,7 @@@ unsigned long wait_task_inactive(struc
                 * it must be off the runqueue _entirely_, and not
                 * preempted!
                 *
 -               * So if it wa still runnable (but just not actively
 +               * So if it was still runnable (but just not actively
                 * running right now), it's preempted, and we should
                 * yield - it could be a while.
                 */
@@@ -2329,7 -2267,7 +2329,7 @@@ static int try_to_wake_up(struct task_s
                sync = 0;
  
  #ifdef CONFIG_SMP
 -      if (sched_feat(LB_WAKEUP_UPDATE)) {
 +      if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
                struct sched_domain *sd;
  
                this_cpu = raw_smp_processor_id();
@@@ -2407,22 -2345,6 +2407,22 @@@ out_activate
        activate_task(rq, p, 1);
        success = 1;
  
 +      /*
 +       * Only attribute actual wakeups done by this task.
 +       */
 +      if (!in_interrupt()) {
 +              struct sched_entity *se = &current->se;
 +              u64 sample = se->sum_exec_runtime;
 +
 +              if (se->last_wakeup)
 +                      sample -= se->last_wakeup;
 +              else
 +                      sample -= se->start_runtime;
 +              update_avg(&se->avg_wakeup, sample);
 +
 +              se->last_wakeup = se->sum_exec_runtime;
 +      }
 +
  out_running:
        trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
                p->sched_class->task_wake_up(rq, p);
  #endif
  out:
 -      current->se.last_wakeup = current->se.sum_exec_runtime;
 -
        task_rq_unlock(rq, &flags);
  
        return success;
@@@ -2462,8 -2386,6 +2462,8 @@@ static void __sched_fork(struct task_st
        p->se.prev_sum_exec_runtime     = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
 +      p->se.start_runtime             = 0;
 +      p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                = 0;
@@@ -2526,8 -2448,6 +2526,8 @@@ void sched_fork(struct task_struct *p, 
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
  #endif
 +      plist_node_init(&p->pushable_tasks, MAX_PRIO);
 +
        put_cpu();
  }
  
@@@ -2571,7 -2491,7 +2571,7 @@@ void wake_up_new_task(struct task_struc
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
  /**
 - * preempt_notifier_register - tell me when current is being being preempted & rescheduled
 + * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
@@@ -2668,12 -2588,6 +2668,12 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
 +#ifdef CONFIG_SMP
 +      int post_schedule = 0;
 +
 +      if (current->sched_class->needs_post_schedule)
 +              post_schedule = current->sched_class->needs_post_schedule(rq);
 +#endif
  
        rq->prev_mm = NULL;
  
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
 -      if (current->sched_class->post_schedule)
 +      if (post_schedule)
                current->sched_class->post_schedule(rq);
  #endif
  
@@@ -2999,7 -2913,6 +2999,7 @@@ int can_migrate_task(struct task_struc
                     struct sched_domain *sd, enum cpu_idle_type idle,
                     int *all_pinned)
  {
 +      int tsk_cache_hot = 0;
        /*
         * We do not migrate tasks that are:
         * 1) running (obviously), or
         * 2) too many balance attempts have failed.
         */
  
 -      if (!task_hot(p, rq->clock, sd) ||
 -                      sd->nr_balance_failed > sd->cache_nice_tries) {
 +      tsk_cache_hot = task_hot(p, rq->clock, sd);
 +      if (!tsk_cache_hot ||
 +              sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
 -              if (task_hot(p, rq->clock, sd)) {
 +              if (tsk_cache_hot) {
                        schedstat_inc(sd, lb_hot_gained[idle]);
                        schedstat_inc(p, se.nr_forced_migrations);
                }
                return 1;
        }
  
 -      if (task_hot(p, rq->clock, sd)) {
 +      if (tsk_cache_hot) {
                schedstat_inc(p, se.nr_failed_migrations_hot);
                return 0;
        }
@@@ -3075,16 -2987,6 +3075,16 @@@ next
        pulled++;
        rem_load_move -= p->se.load.weight;
  
 +#ifdef CONFIG_PREEMPT
 +      /*
 +       * NEWIDLE balancing is a source of latency, so preemptible kernels
 +       * will stop after the first task is pulled to minimize the critical
 +       * section.
 +       */
 +      if (idle == CPU_NEWLY_IDLE)
 +              goto out;
 +#endif
 +
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@@ -3131,15 -3033,9 +3131,15 @@@ static int move_tasks(struct rq *this_r
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
  
 +#ifdef CONFIG_PREEMPT
 +              /*
 +               * NEWIDLE balancing is a source of latency, so preemptible
 +               * kernels will stop after the first task is pulled to minimize
 +               * the critical section.
 +               */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
 -
 +#endif
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -3189,480 -3085,246 +3189,480 @@@ static int move_one_task(struct rq *thi
  
        return 0;
  }
 -
 +/********** Helpers for find_busiest_group ************************/
  /*
 - * find_busiest_group finds and returns the busiest CPU group within the
 - * domain. It calculates and returns the amount of weighted load which
 - * should be moved to restore balance via the imbalance parameter.
 + * sd_lb_stats - Structure to store the statistics of a sched_domain
 + *            during load balancing.
   */
 -static struct sched_group *
 -find_busiest_group(struct sched_domain *sd, int this_cpu,
 -                 unsigned long *imbalance, enum cpu_idle_type idle,
 -                 int *sd_idle, const struct cpumask *cpus, int *balance)
 -{
 -      struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 -      unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 -      unsigned long max_pull;
 -      unsigned long busiest_load_per_task, busiest_nr_running;
 -      unsigned long this_load_per_task, this_nr_running;
 -      int load_idx, group_imb = 0;
 +struct sd_lb_stats {
 +      struct sched_group *busiest; /* Busiest group in this sd */
 +      struct sched_group *this;  /* Local group in this sd */
 +      unsigned long total_load;  /* Total load of all groups in sd */
 +      unsigned long total_pwr;   /* Total power of all groups in sd */
 +      unsigned long avg_load;    /* Average load across all groups in sd */
 +
 +      /* Statistics of this group */
 +      unsigned long this_load;
 +      unsigned long this_load_per_task;
 +      unsigned long this_nr_running;
 +
 +      /* Statistics of the busiest group */
 +      unsigned long max_load;
 +      unsigned long busiest_load_per_task;
 +      unsigned long busiest_nr_running;
 +
 +      int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -      int power_savings_balance = 1;
 -      unsigned long leader_nr_running = 0, min_load_per_task = 0;
 -      unsigned long min_nr_running = ULONG_MAX;
 -      struct sched_group *group_min = NULL, *group_leader = NULL;
 +      int power_savings_balance; /* Is powersave balance needed for this sd */
 +      struct sched_group *group_min; /* Least loaded group in sd */
 +      struct sched_group *group_leader; /* Group which relieves group_min */
 +      unsigned long min_load_per_task; /* load_per_task in group_min */
 +      unsigned long leader_nr_running; /* Nr running of group_leader */
 +      unsigned long min_nr_running; /* Nr running of group_min */
  #endif
 +};
 +
 +/*
 + * sg_lb_stats - stats of a sched_group required for load_balancing
 + */
 +struct sg_lb_stats {
 +      unsigned long avg_load; /* Avg load across the CPUs of the group */
 +      unsigned long group_load; /* Total load over the CPUs of the group */
 +      unsigned long sum_nr_running; /* Nr tasks running in the group */
 +      unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 +      unsigned long group_capacity;
 +      int group_imb; /* Is there an imbalance in the group ? */
 +};
 +
 +/**
 + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 + * @group: The group whose first cpu is to be returned.
 + */
 +static inline unsigned int group_first_cpu(struct sched_group *group)
 +{
 +      return cpumask_first(sched_group_cpus(group));
 +}
  
 -      max_load = this_load = total_load = total_pwr = 0;
 -      busiest_load_per_task = busiest_nr_running = 0;
 -      this_load_per_task = this_nr_running = 0;
 +/**
 + * get_sd_load_idx - Obtain the load index for a given sched domain.
 + * @sd: The sched_domain whose load_idx is to be obtained.
 + * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
 + */
 +static inline int get_sd_load_idx(struct sched_domain *sd,
 +                                      enum cpu_idle_type idle)
 +{
 +      int load_idx;
  
 -      if (idle == CPU_NOT_IDLE)
 +      switch (idle) {
 +      case CPU_NOT_IDLE:
                load_idx = sd->busy_idx;
 -      else if (idle == CPU_NEWLY_IDLE)
 +              break;
 +
 +      case CPU_NEWLY_IDLE:
                load_idx = sd->newidle_idx;
 -      else
 +              break;
 +      default:
                load_idx = sd->idle_idx;
 +              break;
 +      }
  
 -      do {
 -              unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
 -              int local_group;
 -              int i;
 -              int __group_imb = 0;
 -              unsigned int balance_cpu = -1, first_idle_cpu = 0;
 -              unsigned long sum_nr_running, sum_weighted_load;
 -              unsigned long sum_avg_load_per_task;
 -              unsigned long avg_load_per_task;
 +      return load_idx;
 +}
  
 -              local_group = cpumask_test_cpu(this_cpu,
 -                                             sched_group_cpus(group));
  
 -              if (local_group)
 -                      balance_cpu = cpumask_first(sched_group_cpus(group));
 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 +/**
 + * init_sd_power_savings_stats - Initialize power savings statistics for
 + * the given sched_domain, during load balancing.
 + *
 + * @sd: Sched domain whose power-savings statistics are to be initialized.
 + * @sds: Variable containing the statistics for sd.
 + * @idle: Idle status of the CPU at which we're performing load-balancing.
 + */
 +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
 +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
 +{
 +      /*
 +       * Busy processors will not participate in power savings
 +       * balance.
 +       */
 +      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 +              sds->power_savings_balance = 0;
 +      else {
 +              sds->power_savings_balance = 1;
 +              sds->min_nr_running = ULONG_MAX;
 +              sds->leader_nr_running = 0;
 +      }
 +}
  
 -              /* Tally up the load of all CPUs in the group */
 -              sum_weighted_load = sum_nr_running = avg_load = 0;
 -              sum_avg_load_per_task = avg_load_per_task = 0;
 +/**
 + * update_sd_power_savings_stats - Update the power saving stats for a
 + * sched_domain while performing load balancing.
 + *
 + * @group: sched_group belonging to the sched_domain under consideration.
 + * @sds: Variable containing the statistics of the sched_domain
 + * @local_group: Does group contain the CPU for which we're performing
 + *            load balancing ?
 + * @sgs: Variable containing the statistics of the group.
 + */
 +static inline void update_sd_power_savings_stats(struct sched_group *group,
 +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
 +{
  
 -              max_cpu_load = 0;
 -              min_cpu_load = ~0UL;
 +      if (!sds->power_savings_balance)
 +              return;
  
 -              for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 -                      struct rq *rq = cpu_rq(i);
 +      /*
 +       * If the local group is idle or completely loaded
 +       * no need to do power savings balance at this domain
 +       */
 +      if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
 +                              !sds->this_nr_running))
 +              sds->power_savings_balance = 0;
  
 -                      if (*sd_idle && rq->nr_running)
 -                              *sd_idle = 0;
 +      /*
 +       * If a group is already running at full capacity or idle,
 +       * don't include that group in power savings calculations
 +       */
 +      if (!sds->power_savings_balance ||
 +              sgs->sum_nr_running >= sgs->group_capacity ||
 +              !sgs->sum_nr_running)
 +              return;
  
 -                      /* Bias balancing toward cpus of our domain */
 -                      if (local_group) {
 -                              if (idle_cpu(i) && !first_idle_cpu) {
 -                                      first_idle_cpu = 1;
 -                                      balance_cpu = i;
 -                              }
 +      /*
 +       * Calculate the group which has the least non-idle load.
 +       * This is the group from where we need to pick up the load
 +       * for saving power
 +       */
 +      if ((sgs->sum_nr_running < sds->min_nr_running) ||
 +          (sgs->sum_nr_running == sds->min_nr_running &&
 +           group_first_cpu(group) > group_first_cpu(sds->group_min))) {
 +              sds->group_min = group;
 +              sds->min_nr_running = sgs->sum_nr_running;
 +              sds->min_load_per_task = sgs->sum_weighted_load /
 +                                              sgs->sum_nr_running;
 +      }
  
 -                              load = target_load(i, load_idx);
 -                      } else {
 -                              load = source_load(i, load_idx);
 -                              if (load > max_cpu_load)
 -                                      max_cpu_load = load;
 -                              if (min_cpu_load > load)
 -                                      min_cpu_load = load;
 -                      }
 +      /*
 +       * Calculate the group which is almost near its
 +       * capacity but still has some space to pick up some load
 +       * from other group and save more power
 +       */
 +      if (sgs->sum_nr_running > sgs->group_capacity - 1)
 +              return;
  
 -                      avg_load += load;
 -                      sum_nr_running += rq->nr_running;
 -                      sum_weighted_load += weighted_cpuload(i);
 +      if (sgs->sum_nr_running > sds->leader_nr_running ||
 +          (sgs->sum_nr_running == sds->leader_nr_running &&
 +           group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
 +              sds->group_leader = group;
 +              sds->leader_nr_running = sgs->sum_nr_running;
 +      }
 +}
  
 -                      sum_avg_load_per_task += cpu_avg_load_per_task(i);
 -              }
 +/**
 + * check_power_save_busiest_group - see if there is potential for some power-savings balance
 + * @sds: Variable containing the statistics of the sched_domain
 + *    under consideration.
 + * @this_cpu: Cpu at which we're currently performing load-balancing.
 + * @imbalance: Variable to store the imbalance.
 + *
 + * Description:
 + * Check if we have potential to perform some power-savings balance.
 + * If yes, set the busiest group to be the least loaded group in the
 + * sched_domain, so that its CPUs can be put to idle.
 + *
 + * Returns 1 if there is potential to perform power-savings balance.
 + * Else returns 0.
 + */
 +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 +                                      int this_cpu, unsigned long *imbalance)
 +{
 +      if (!sds->power_savings_balance)
 +              return 0;
  
 -              /*
 -               * First idle cpu or the first cpu(busiest) in this sched group
 -               * is eligible for doing load balancing at this and above
 -               * domains. In the newly idle case, we will allow all the cpu's
 -               * to do the newly idle load balance.
 -               */
 -              if (idle != CPU_NEWLY_IDLE && local_group &&
 -                  balance_cpu != this_cpu && balance) {
 -                      *balance = 0;
 -                      goto ret;
 -              }
 +      if (sds->this != sds->group_leader ||
 +                      sds->group_leader == sds->group_min)
 +              return 0;
  
 -              total_load += avg_load;
 -              total_pwr += group->__cpu_power;
 +      *imbalance = sds->min_load_per_task;
 +      sds->busiest = sds->group_min;
  
 -              /* Adjust by relative CPU power of the group */
 -              avg_load = sg_div_cpu_power(group,
 -                              avg_load * SCHED_LOAD_SCALE);
 +      if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 +              cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
 +                      group_first_cpu(sds->group_leader);
 +      }
  
 +      return 1;
  
 -              /*
 -               * Consider the group unbalanced when the imbalance is larger
 -               * than the average weight of two tasks.
 -               *
 -               * APZ: with cgroup the avg task weight can vary wildly and
 -               *      might not be a suitable number - should we keep a
 -               *      normalized nr_running number somewhere that negates
 -               *      the hierarchy?
 -               */
 -              avg_load_per_task = sg_div_cpu_power(group,
 -                              sum_avg_load_per_task * SCHED_LOAD_SCALE);
 +}
 +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
 +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
 +{
 +      return;
 +}
  
 -              if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 -                      __group_imb = 1;
 +static inline void update_sd_power_savings_stats(struct sched_group *group,
 +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
 +{
 +      return;
 +}
 +
 +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 +                                      int this_cpu, unsigned long *imbalance)
 +{
 +      return 0;
 +}
 +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 +
 +
 +/**
 + * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 + * @group: sched_group whose statistics are to be updated.
 + * @this_cpu: Cpu for which load balance is currently performed.
 + * @idle: Idle status of this_cpu
 + * @load_idx: Load index of sched_domain of this_cpu for load calc.
 + * @sd_idle: Idle status of the sched_domain containing group.
 + * @local_group: Does group contain this_cpu.
 + * @cpus: Set of cpus considered for load balancing.
 + * @balance: Should we balance.
 + * @sgs: variable to hold the statistics for this group.
 + */
 +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
 +                      enum cpu_idle_type idle, int load_idx, int *sd_idle,
 +                      int local_group, const struct cpumask *cpus,
 +                      int *balance, struct sg_lb_stats *sgs)
 +{
 +      unsigned long load, max_cpu_load, min_cpu_load;
 +      int i;
 +      unsigned int balance_cpu = -1, first_idle_cpu = 0;
 +      unsigned long sum_avg_load_per_task;
 +      unsigned long avg_load_per_task;
  
 -              group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 +      if (local_group)
 +              balance_cpu = group_first_cpu(group);
  
 +      /* Tally up the load of all CPUs in the group */
 +      sum_avg_load_per_task = avg_load_per_task = 0;
 +      max_cpu_load = 0;
 +      min_cpu_load = ~0UL;
 +
 +      for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 +              struct rq *rq = cpu_rq(i);
 +
 +              if (*sd_idle && rq->nr_running)
 +                      *sd_idle = 0;
 +
 +              /* Bias balancing toward cpus of our domain */
                if (local_group) {
 -                      this_load = avg_load;
 -                      this = group;
 -                      this_nr_running = sum_nr_running;
 -                      this_load_per_task = sum_weighted_load;
 -              } else if (avg_load > max_load &&
 -                         (sum_nr_running > group_capacity || __group_imb)) {
 -                      max_load = avg_load;
 -                      busiest = group;
 -                      busiest_nr_running = sum_nr_running;
 -                      busiest_load_per_task = sum_weighted_load;
 -                      group_imb = __group_imb;
 +                      if (idle_cpu(i) && !first_idle_cpu) {
 +                              first_idle_cpu = 1;
 +                              balance_cpu = i;
 +                      }
 +
 +                      load = target_load(i, load_idx);
 +              } else {
 +                      load = source_load(i, load_idx);
 +                      if (load > max_cpu_load)
 +                              max_cpu_load = load;
 +                      if (min_cpu_load > load)
 +                              min_cpu_load = load;
                }
  
 -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -              /*
 -               * Busy processors will not participate in power savings
 -               * balance.
 -               */
 -              if (idle == CPU_NOT_IDLE ||
 -                              !(sd->flags & SD_POWERSAVINGS_BALANCE))
 -                      goto group_next;
 +              sgs->group_load += load;
 +              sgs->sum_nr_running += rq->nr_running;
 +              sgs->sum_weighted_load += weighted_cpuload(i);
  
 -              /*
 -               * If the local group is idle or completely loaded
 -               * no need to do power savings balance at this domain
 -               */
 -              if (local_group && (this_nr_running >= group_capacity ||
 -                                  !this_nr_running))
 -                      power_savings_balance = 0;
 +              sum_avg_load_per_task += cpu_avg_load_per_task(i);
 +      }
  
 -              /*
 -               * If a group is already running at full capacity or idle,
 -               * don't include that group in power savings calculations
 -               */
 -              if (!power_savings_balance || sum_nr_running >= group_capacity
 -                  || !sum_nr_running)
 -                      goto group_next;
 +      /*
 +       * First idle cpu or the first cpu(busiest) in this sched group
 +       * is eligible for doing load balancing at this and above
 +       * domains. In the newly idle case, we will allow all the cpus
 +       * to do the newly idle load balance.
 +       */
 +      if (idle != CPU_NEWLY_IDLE && local_group &&
 +          balance_cpu != this_cpu && balance) {
 +              *balance = 0;
 +              return;
 +      }
  
 -              /*
 -               * Calculate the group which has the least non-idle load.
 -               * This is the group from where we need to pick up the load
 -               * for saving power
 -               */
 -              if ((sum_nr_running < min_nr_running) ||
 -                  (sum_nr_running == min_nr_running &&
 -                   cpumask_first(sched_group_cpus(group)) >
 -                   cpumask_first(sched_group_cpus(group_min)))) {
 -                      group_min = group;
 -                      min_nr_running = sum_nr_running;
 -                      min_load_per_task = sum_weighted_load /
 -                                              sum_nr_running;
 -              }
 +      /* Adjust by relative CPU power of the group */
 +      sgs->avg_load = sg_div_cpu_power(group,
 +                      sgs->group_load * SCHED_LOAD_SCALE);
  
 -              /*
 -               * Calculate the group which is almost near its
 -               * capacity but still has some space to pick up some load
 -               * from other group and save more power
 -               */
 -              if (sum_nr_running <= group_capacity - 1) {
 -                      if (sum_nr_running > leader_nr_running ||
 -                          (sum_nr_running == leader_nr_running &&
 -                           cpumask_first(sched_group_cpus(group)) <
 -                           cpumask_first(sched_group_cpus(group_leader)))) {
 -                              group_leader = group;
 -                              leader_nr_running = sum_nr_running;
 -                      }
 +
 +      /*
 +       * Consider the group unbalanced when the imbalance is larger
 +       * than the average weight of two tasks.
 +       *
 +       * APZ: with cgroup the avg task weight can vary wildly and
 +       *      might not be a suitable number - should we keep a
 +       *      normalized nr_running number somewhere that negates
 +       *      the hierarchy?
 +       */
 +      avg_load_per_task = sg_div_cpu_power(group,
 +                      sum_avg_load_per_task * SCHED_LOAD_SCALE);
 +
 +      if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 +              sgs->group_imb = 1;
 +
 +      sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 +
 +}
 +
 +/**
 + * update_sd_lb_stats - Update sched_group's statistics for load balancing.
 + * @sd: sched_domain whose statistics are to be updated.
 + * @this_cpu: Cpu for which load balance is currently performed.
 + * @idle: Idle status of this_cpu
 + * @sd_idle: Idle status of the sched_domain containing group.
 + * @cpus: Set of cpus considered for load balancing.
 + * @balance: Should we balance.
 + * @sds: variable to hold the statistics for this sched_domain.
 + */
 +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 +                      enum cpu_idle_type idle, int *sd_idle,
 +                      const struct cpumask *cpus, int *balance,
 +                      struct sd_lb_stats *sds)
 +{
 +      struct sched_group *group = sd->groups;
 +      struct sg_lb_stats sgs;
 +      int load_idx;
 +
 +      init_sd_power_savings_stats(sd, sds, idle);
 +      load_idx = get_sd_load_idx(sd, idle);
 +
 +      do {
 +              int local_group;
 +
 +              local_group = cpumask_test_cpu(this_cpu,
 +                                             sched_group_cpus(group));
 +              memset(&sgs, 0, sizeof(sgs));
 +              update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
 +                              local_group, cpus, balance, &sgs);
 +
 +              if (local_group && balance && !(*balance))
 +                      return;
 +
 +              sds->total_load += sgs.group_load;
 +              sds->total_pwr += group->__cpu_power;
 +
 +              if (local_group) {
 +                      sds->this_load = sgs.avg_load;
 +                      sds->this = group;
 +                      sds->this_nr_running = sgs.sum_nr_running;
 +                      sds->this_load_per_task = sgs.sum_weighted_load;
 +              } else if (sgs.avg_load > sds->max_load &&
 +                         (sgs.sum_nr_running > sgs.group_capacity ||
 +                              sgs.group_imb)) {
 +                      sds->max_load = sgs.avg_load;
 +                      sds->busiest = group;
 +                      sds->busiest_nr_running = sgs.sum_nr_running;
 +                      sds->busiest_load_per_task = sgs.sum_weighted_load;
 +                      sds->group_imb = sgs.group_imb;
                }
 -group_next:
 -#endif
 +
 +              update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
  
 -      if (!busiest || this_load >= max_load || busiest_nr_running == 0)
 -              goto out_balanced;
 -
 -      avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 +}
  
 -      if (this_load >= avg_load ||
 -                      100*max_load <= sd->imbalance_pct*this_load)
 -              goto out_balanced;
 +/**
 + * fix_small_imbalance - Calculate the minor imbalance that exists
 + *                    amongst the groups of a sched_domain, during
 + *                    load balancing.
 + * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 + * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 + * @imbalance: Variable to store the imbalance.
 + */
 +static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 +                              int this_cpu, unsigned long *imbalance)
 +{
 +      unsigned long tmp, pwr_now = 0, pwr_move = 0;
 +      unsigned int imbn = 2;
 +
 +      if (sds->this_nr_running) {
 +              sds->this_load_per_task /= sds->this_nr_running;
 +              if (sds->busiest_load_per_task >
 +                              sds->this_load_per_task)
 +                      imbn = 1;
 +      } else
 +              sds->this_load_per_task =
 +                      cpu_avg_load_per_task(this_cpu);
  
 -      busiest_load_per_task /= busiest_nr_running;
 -      if (group_imb)
 -              busiest_load_per_task = min(busiest_load_per_task, avg_load);
 +      if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
 +                      sds->busiest_load_per_task * imbn) {
 +              *imbalance = sds->busiest_load_per_task;
 +              return;
 +      }
  
        /*
 -       * We're trying to get all the cpus to the average_load, so we don't
 -       * want to push ourselves above the average load, nor do we wish to
 -       * reduce the max loaded cpu below the average load, as either of these
 -       * actions would just result in more rebalancing later, and ping-pong
 -       * tasks around. Thus we look for the minimum possible imbalance.
 -       * Negative imbalances (*we* are more loaded than anyone else) will
 -       * be counted as no imbalance for these purposes -- we can't fix that
 -       * by pulling tasks to us. Be careful of negative numbers as they'll
 -       * appear as very large values with unsigned longs.
 +       * OK, we don't have enough imbalance to justify moving tasks,
 +       * however we may be able to increase total CPU power used by
 +       * moving them.
         */
 -      if (max_load <= busiest_load_per_task)
 -              goto out_balanced;
  
 +      pwr_now += sds->busiest->__cpu_power *
 +                      min(sds->busiest_load_per_task, sds->max_load);
 +      pwr_now += sds->this->__cpu_power *
 +                      min(sds->this_load_per_task, sds->this_load);
 +      pwr_now /= SCHED_LOAD_SCALE;
 +
 +      /* Amount of load we'd subtract */
 +      tmp = sg_div_cpu_power(sds->busiest,
 +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 +      if (sds->max_load > tmp)
 +              pwr_move += sds->busiest->__cpu_power *
 +                      min(sds->busiest_load_per_task, sds->max_load - tmp);
 +
 +      /* Amount of load we'd add */
 +      if (sds->max_load * sds->busiest->__cpu_power <
 +              sds->busiest_load_per_task * SCHED_LOAD_SCALE)
 +              tmp = sg_div_cpu_power(sds->this,
 +                      sds->max_load * sds->busiest->__cpu_power);
 +      else
 +              tmp = sg_div_cpu_power(sds->this,
 +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 +      pwr_move += sds->this->__cpu_power *
 +                      min(sds->this_load_per_task, sds->this_load + tmp);
 +      pwr_move /= SCHED_LOAD_SCALE;
 +
 +      /* Move if we gain throughput */
 +      if (pwr_move > pwr_now)
 +              *imbalance = sds->busiest_load_per_task;
 +}
 +
 +/**
 + * calculate_imbalance - Calculate the amount of imbalance present within the
 + *                     groups of a given sched_domain during load balance.
 + * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 + * @this_cpu: Cpu for which currently load balance is being performed.
 + * @imbalance: The variable to store the imbalance.
 + */
 +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 +              unsigned long *imbalance)
 +{
 +      unsigned long max_pull;
        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
 -      if (max_load < avg_load) {
 +      if (sds->max_load < sds->avg_load) {
                *imbalance = 0;
 -              goto small_imbalance;
 +              return fix_small_imbalance(sds, this_cpu, imbalance);
        }
  
        /* Don't want to pull so many tasks that a group would go idle */
 -      max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 +      max_pull = min(sds->max_load - sds->avg_load,
 +                      sds->max_load - sds->busiest_load_per_task);
  
        /* How much load to actually move to equalise the imbalance */
 -      *imbalance = min(max_pull * busiest->__cpu_power,
 -                              (avg_load - this_load) * this->__cpu_power)
 +      *imbalance = min(max_pull * sds->busiest->__cpu_power,
 +              (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                        / SCHED_LOAD_SCALE;
  
        /*
         * a think about bumping its value to force at least one task to be
         * moved
         */
 -      if (*imbalance < busiest_load_per_task) {
 -              unsigned long tmp, pwr_now, pwr_move;
 -              unsigned int imbn;
 -
 -small_imbalance:
 -              pwr_move = pwr_now = 0;
 -              imbn = 2;
 -              if (this_nr_running) {
 -                      this_load_per_task /= this_nr_running;
 -                      if (busiest_load_per_task > this_load_per_task)
 -                              imbn = 1;
 -              } else
 -                      this_load_per_task = cpu_avg_load_per_task(this_cpu);
 +      if (*imbalance < sds->busiest_load_per_task)
 +              return fix_small_imbalance(sds, this_cpu, imbalance);
  
 -              if (max_load - this_load + busiest_load_per_task >=
 -                                      busiest_load_per_task * imbn) {
 -                      *imbalance = busiest_load_per_task;
 -                      return busiest;
 -              }
 +}
 +/******* find_busiest_group() helpers end here *********************/
  
 -              /*
 -               * OK, we don't have enough imbalance to justify moving tasks,
 -               * however we may be able to increase total CPU power used by
 -               * moving them.
 -               */
 +/**
 + * find_busiest_group - Returns the busiest group within the sched_domain
 + * if there is an imbalance. If there isn't an imbalance, and
 + * the user has opted for power-savings, it returns a group whose
 + * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 + * such a group exists.
 + *
 + * Also calculates the amount of weighted load which should be moved
 + * to restore balance.
 + *
 + * @sd: The sched_domain whose busiest group is to be returned.
 + * @this_cpu: The cpu for which load balancing is currently being performed.
 + * @imbalance: Variable which stores amount of weighted load which should
 + *            be moved to restore balance/put a group to idle.
 + * @idle: The idle status of this_cpu.
 + * @sd_idle: The idleness of sd
 + * @cpus: The set of CPUs under consideration for load-balancing.
 + * @balance: Pointer to a variable indicating if this_cpu
 + *    is the appropriate cpu to perform load balancing at this_level.
 + *
 + * Returns:   - the busiest group if imbalance exists.
 + *            - If no imbalance and user has opted for power-savings balance,
 + *               return the least loaded group whose CPUs can be
 + *               put to idle by rebalancing its tasks onto our group.
 + */
 +static struct sched_group *
 +find_busiest_group(struct sched_domain *sd, int this_cpu,
 +                 unsigned long *imbalance, enum cpu_idle_type idle,
 +                 int *sd_idle, const struct cpumask *cpus, int *balance)
 +{
 +      struct sd_lb_stats sds;
  
 -              pwr_now += busiest->__cpu_power *
 -                              min(busiest_load_per_task, max_load);
 -              pwr_now += this->__cpu_power *
 -                              min(this_load_per_task, this_load);
 -              pwr_now /= SCHED_LOAD_SCALE;
 -
 -              /* Amount of load we'd subtract */
 -              tmp = sg_div_cpu_power(busiest,
 -                              busiest_load_per_task * SCHED_LOAD_SCALE);
 -              if (max_load > tmp)
 -                      pwr_move += busiest->__cpu_power *
 -                              min(busiest_load_per_task, max_load - tmp);
 -
 -              /* Amount of load we'd add */
 -              if (max_load * busiest->__cpu_power <
 -                              busiest_load_per_task * SCHED_LOAD_SCALE)
 -                      tmp = sg_div_cpu_power(this,
 -                                      max_load * busiest->__cpu_power);
 -              else
 -                      tmp = sg_div_cpu_power(this,
 -                              busiest_load_per_task * SCHED_LOAD_SCALE);
 -              pwr_move += this->__cpu_power *
 -                              min(this_load_per_task, this_load + tmp);
 -              pwr_move /= SCHED_LOAD_SCALE;
 +      memset(&sds, 0, sizeof(sds));
  
 -              /* Move if we gain throughput */
 -              if (pwr_move > pwr_now)
 -                      *imbalance = busiest_load_per_task;
 -      }
 +      /*
 +       * Compute the various statistics relevant for load balancing at
 +       * this level.
 +       */
 +      update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
 +                                      balance, &sds);
 +
 +      /* Cases where imbalance does not exist from POV of this_cpu */
 +      /* 1) this_cpu is not the appropriate cpu to perform load balancing
 +       *    at this level.
 +       * 2) There is no busy sibling group to pull from.
 +       * 3) This group is the busiest group.
 +       * 4) This group is more busy than the avg busyness at this
 +       *    sched_domain.
 +       * 5) The imbalance is within the specified limit.
 +       * 6) Any rebalance would lead to ping-pong
 +       */
 +      if (balance && !(*balance))
 +              goto ret;
  
 -      return busiest;
 +      if (!sds.busiest || sds.busiest_nr_running == 0)
 +              goto out_balanced;
  
 -out_balanced:
 -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 -              goto ret;
 +      if (sds.this_load >= sds.max_load)
 +              goto out_balanced;
  
 -      if (this == group_leader && group_leader != group_min) {
 -              *imbalance = min_load_per_task;
 -              if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 -                      cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
 -                              cpumask_first(sched_group_cpus(group_leader));
 -              }
 -              return group_min;
 -      }
 -#endif
 +      sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 +
 +      if (sds.this_load >= sds.avg_load)
 +              goto out_balanced;
 +
 +      if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 +              goto out_balanced;
 +
 +      sds.busiest_load_per_task /= sds.busiest_nr_running;
 +      if (sds.group_imb)
 +              sds.busiest_load_per_task =
 +                      min(sds.busiest_load_per_task, sds.avg_load);
 +
 +      /*
 +       * We're trying to get all the cpus to the average_load, so we don't
 +       * want to push ourselves above the average load, nor do we wish to
 +       * reduce the max loaded cpu below the average load, as either of these
 +       * actions would just result in more rebalancing later, and ping-pong
 +       * tasks around. Thus we look for the minimum possible imbalance.
 +       * Negative imbalances (*we* are more loaded than anyone else) will
 +       * be counted as no imbalance for these purposes -- we can't fix that
 +       * by pulling tasks to us. Be careful of negative numbers as they'll
 +       * appear as very large values with unsigned longs.
 +       */
 +      if (sds.max_load <= sds.busiest_load_per_task)
 +              goto out_balanced;
 +
 +      /* Looks like there is an imbalance. Compute it */
 +      calculate_imbalance(&sds, this_cpu, imbalance);
 +      return sds.busiest;
 +
 +out_balanced:
 +      /*
 +       * There is no obvious imbalance. But check if we can do some balancing
 +       * to save power.
 +       */
 +      if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
 +              return sds.busiest;
  ret:
        *imbalance = 0;
        return NULL;
@@@ -4427,11 -4057,6 +4427,11 @@@ static void run_rebalance_domains(struc
  #endif
  }
  
 +static inline int on_null_domain(int cpu)
 +{
 +      return !rcu_dereference(cpu_rq(cpu)->sd);
 +}
 +
  /*
   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
   *
@@@ -4489,9 -4114,7 +4489,9 @@@ static inline void trigger_load_balance
            cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
 -      if (time_after_eq(jiffies, rq->next_balance))
 +      /* Don't need to rebalance while attached to NULL domain */
 +      if (time_after_eq(jiffies, rq->next_balance) &&
 +          likely(!on_null_domain(cpu)))
                raise_softirq(SCHED_SOFTIRQ);
  }
  
@@@ -4885,33 -4508,11 +4885,33 @@@ static inline void schedule_debug(struc
  #endif
  }
  
 +static void put_prev_task(struct rq *rq, struct task_struct *prev)
 +{
 +      if (prev->state == TASK_RUNNING) {
 +              u64 runtime = prev->se.sum_exec_runtime;
 +
 +              runtime -= prev->se.prev_sum_exec_runtime;
 +              runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
 +
 +              /*
 +               * In order to avoid avg_overlap growing stale when we are
 +               * indeed overlapping and hence not getting put to sleep, grow
 +               * the avg_overlap on preemption.
 +               *
 +               * We use the average preemption runtime because that
 +               * correlates to the amount of cache footprint a task can
 +               * build up.
 +               */
 +              update_avg(&prev->se.avg_overlap, runtime);
 +      }
 +      prev->sched_class->put_prev_task(rq, prev);
 +}
 +
  /*
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
 -pick_next_task(struct rq *rq, struct task_struct *prev)
 +pick_next_task(struct rq *rq)
  {
        const struct sched_class *class;
        struct task_struct *p;
  /*
   * schedule() is the main scheduler function.
   */
 -asmlinkage void __sched schedule(void)
 +asmlinkage void __sched __schedule(void)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
  
 -need_resched:
 -      preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
@@@ -4983,8 -4586,8 +4983,8 @@@ need_resched_nonpreemptible
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
  
 -      prev->sched_class->put_prev_task(rq, prev);
 -      next = pick_next_task(rq, prev);
 +      put_prev_task(rq, prev);
 +      next = pick_next_task(rq);
  
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
  
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
 +}
  
 +asmlinkage void __sched schedule(void)
 +{
 +need_resched:
 +      preempt_disable();
 +      __schedule();
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
  
 +#ifdef CONFIG_SMP
 +/*
 + * Look out! "owner" is an entirely speculative pointer
 + * access and not reliable.
 + */
 +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
 +{
 +      unsigned int cpu;
 +      struct rq *rq;
 +
 +      if (!sched_feat(OWNER_SPIN))
 +              return 0;
 +
 +#ifdef CONFIG_DEBUG_PAGEALLOC
 +      /*
 +       * Need to access the cpu field knowing that
 +       * DEBUG_PAGEALLOC could have unmapped it if
 +       * the mutex owner just released it and exited.
 +       */
 +      if (probe_kernel_address(&owner->cpu, cpu))
 +              goto out;
 +#else
 +      cpu = owner->cpu;
 +#endif
 +
 +      /*
 +       * Even if the access succeeded (likely case),
 +       * the cpu field may no longer be valid.
 +       */
 +      if (cpu >= nr_cpumask_bits)
 +              goto out;
 +
 +      /*
 +       * We need to validate that we can do a
 +       * get_cpu() and that we have the percpu area.
 +       */
 +      if (!cpu_online(cpu))
 +              goto out;
 +
 +      rq = cpu_rq(cpu);
 +
 +      for (;;) {
 +              /*
 +               * Owner changed, break to re-assess state.
 +               */
 +              if (lock->owner != owner)
 +                      break;
 +
 +              /*
 +               * Is that owner really running on that cpu?
 +               */
 +              if (task_thread_info(rq->curr) != owner || need_resched())
 +                      return 0;
 +
 +              cpu_relax();
 +      }
 +out:
 +      return 1;
 +}
 +#endif
 +
  #ifdef CONFIG_PREEMPT
  /*
   * this is the entry point to schedule() from in-kernel preemption
@@@ -5106,7 -4642,7 +5106,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
 -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
 +      } while (need_resched());
  }
  EXPORT_SYMBOL(preempt_schedule);
  
@@@ -5135,7 -4671,7 +5135,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
 -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
 +      } while (need_resched());
  }
  
  #endif /* CONFIG_PREEMPT */
@@@ -5196,17 -4732,11 +5196,17 @@@ void __wake_up_locked(wait_queue_head_
        __wake_up_common(q, mode, 1, 0, NULL);
  }
  
 +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 +{
 +      __wake_up_common(q, mode, 1, 0, key);
 +}
 +
  /**
 - * __wake_up_sync - wake up threads blocked on a waitqueue.
 + * __wake_up_sync_key - wake up threads blocked on a waitqueue.
   * @q: the waitqueue
   * @mode: which threads
   * @nr_exclusive: how many wake-one or wake-many threads to wake up
 + * @key: opaque value to be passed to wakeup targets
   *
   * The sync wakeup differs that the waker knows that it will schedule
   * away soon, so while the target thread will be woken up, it will not
   *
   * On UP it can prevent extra preemption.
   */
 -void
 -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 +                      int nr_exclusive, void *key)
  {
        unsigned long flags;
        int sync = 1;
                sync = 0;
  
        spin_lock_irqsave(&q->lock, flags);
 -      __wake_up_common(q, mode, nr_exclusive, sync, NULL);
 +      __wake_up_common(q, mode, nr_exclusive, sync, key);
        spin_unlock_irqrestore(&q->lock, flags);
  }
 +EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 +
 +/*
 + * __wake_up_sync - see __wake_up_sync_key()
 + */
 +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 +{
 +      __wake_up_sync_key(q, mode, nr_exclusive, NULL);
 +}
  EXPORT_SYMBOL_GPL(__wake_up_sync);    /* For internal use only */
  
  /**
@@@ -5624,7 -5145,7 +5624,7 @@@ SYSCALL_DEFINE1(nice, int, increment
        if (increment > 40)
                increment = 40;
  
 -      nice = PRIO_TO_NICE(current->static_prio) + increment;
 +      nice = TASK_NICE(current) + increment;
        if (nice < -20)
                nice = -20;
        if (nice > 19)
@@@ -6897,7 -6418,7 +6897,7 @@@ static void migrate_dead_tasks(unsigne
                if (!rq->nr_running)
                        break;
                update_rq_clock(rq);
 -              next = pick_next_task(rq, rq->curr);
 +              next = pick_next_task(rq);
                if (!next)
                        break;
                next->sched_class->put_prev_task(rq, next);
@@@ -8692,15 -8213,11 +8692,15 @@@ static void init_rt_rq(struct rt_rq *rt
        __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 -      rt_rq->highest_prio = MAX_RT_PRIO;
 +      rt_rq->highest_prio.curr = MAX_RT_PRIO;
 +#ifdef CONFIG_SMP
 +      rt_rq->highest_prio.next = MAX_RT_PRIO;
 +#endif
  #endif
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
 +      plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -10076,7 -9593,7 +10076,7 @@@ static void cpuacct_charge(struct task_
        struct cpuacct *ca;
        int cpu;
  
 -      if (!cpuacct_subsys.active)
 +      if (unlikely(!cpuacct_subsys.active))
                return;
  
        cpu = task_cpu(tsk);
diff --combined kernel/softirq.c
@@@ -180,7 -180,7 +180,7 @@@ asmlinkage void __do_softirq(void
        account_system_vtime(current);
  
        __local_bh_disable((unsigned long)__builtin_return_address(0));
 -      trace_softirq_enter();
 +      lockdep_softirq_enter();
  
        cpu = smp_processor_id();
  restart:
        if (pending)
                wakeup_softirqd();
  
 -      trace_softirq_exit();
 +      lockdep_softirq_exit();
  
        account_system_vtime(current);
        _local_bh_enable();
@@@ -496,7 -496,7 +496,7 @@@ static int __try_remote_softirq(struct 
                cp->flags = 0;
                cp->priv = softirq;
  
-               __smp_call_function_single(cpu, cp);
+               __smp_call_function_single(cpu, cp, 0);
                return 0;
        }
        return 1;