sched: Fix poor interactivity on UP systems due to group scheduler nice tune bug
[linux-2.6.git] / kernel / sched_fair.c
index b9b3462..3547699 100644 (file)
@@ -25,7 +25,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 2000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  */
-static unsigned int sched_nr_latency = 3;
+static unsigned int sched_nr_latency = 8;
 
 /*
  * After fork, child runs first. If set to 0 (default) then
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+/*
+ * The exponential sliding  window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
        return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_rq->on_list) {
+               /*
+                * Ensure we either appear before our parent (if already
+                * enqueued) or force our parent to appear after us when it is
+                * enqueued.  The fact that we always enqueue bottom-up
+                * reduces this to two cases.
+                */
+               if (cfs_rq->tg->parent &&
+                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               } else {
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               }
+
+               cfs_rq->on_list = 1;
+       }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->on_list) {
+               list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+               cfs_rq->on_list = 0;
+       }
+}
+
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
        list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
        return &cpu_rq(this_cpu)->cfs;
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
                for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
        WRT_SYSCTL(sched_min_granularity);
        WRT_SYSCTL(sched_latency);
        WRT_SYSCTL(sched_wakeup_granularity);
-       WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
 
        return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
        return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -514,12 +561,16 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
        curr->vruntime += delta_exec_weighted;
        update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+       cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *curr = cfs_rq->curr;
-       u64 now = rq_of(cfs_rq)->clock;
+       u64 now = rq_of(cfs_rq)->clock_task;
        unsigned long delta_exec;
 
        if (unlikely(!curr))
@@ -602,7 +653,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
        /*
         * We are starting a new run period:
         */
-       se->exec_start = rq_of(cfs_rq)->clock;
+       se->exec_start = rq_of(cfs_rq)->clock_task;
 }
 
 /**************************************************
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                list_add(&se->group_node, &cfs_rq->tasks);
        }
        cfs_rq->nr_running++;
-       se->on_rq = 1;
 }
 
 static void
@@ -647,9 +697,168 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                list_del_init(&se->group_node);
        }
        cfs_rq->nr_running--;
-       se->on_rq = 0;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+                                           int global_update)
+{
+       struct task_group *tg = cfs_rq->tg;
+       long load_avg;
+
+       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+       load_avg -= cfs_rq->load_contribution;
+
+       if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+               atomic_add(load_avg, &tg->load_weight);
+               cfs_rq->load_contribution += load_avg;
+       }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+       u64 period = sysctl_sched_shares_window;
+       u64 now, delta;
+       unsigned long load = cfs_rq->load.weight;
+
+       if (!cfs_rq)
+               return;
+
+       now = rq_of(cfs_rq)->clock;
+       delta = now - cfs_rq->load_stamp;
+
+       /* truncate load history at 4 idle periods */
+       if (cfs_rq->load_stamp > cfs_rq->load_last &&
+           now - cfs_rq->load_last > 4 * period) {
+               cfs_rq->load_period = 0;
+               cfs_rq->load_avg = 0;
+       }
+
+       cfs_rq->load_stamp = now;
+       cfs_rq->load_unacc_exec_time = 0;
+       cfs_rq->load_period += delta;
+       if (load) {
+               cfs_rq->load_last = now;
+               cfs_rq->load_avg += delta * load;
+       }
+
+       /* consider updating load contribution on each fold or truncate */
+       if (global_update || cfs_rq->load_period > period
+           || !cfs_rq->load_period)
+               update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+       while (cfs_rq->load_period > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (cfs_rq->load_period));
+               cfs_rq->load_period /= 2;
+               cfs_rq->load_avg /= 2;
+       }
+
+       if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+               list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+                               long weight_delta)
+{
+       long load_weight, load, shares;
+
+       load = cfs_rq->load.weight + weight_delta;
+
+       load_weight = atomic_read(&tg->load_weight);
+       load_weight -= cfs_rq->load_contribution;
+       load_weight += load;
+
+       shares = (tg->shares * load);
+       if (load_weight)
+               shares /= load_weight;
+
+       if (shares < MIN_SHARES)
+               shares = MIN_SHARES;
+       if (shares > tg->shares)
+               shares = tg->shares;
+
+       return shares;
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+}
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+                               long weight_delta)
+{
+       return tg->shares;
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                           unsigned long weight)
+{
+       if (se->on_rq) {
+               /* commit outstanding execution time */
+               if (cfs_rq->curr == se)
+                       update_curr(cfs_rq);
+               account_entity_dequeue(cfs_rq, se);
+       }
+
+       update_load_set(&se->load, weight);
+
+       if (se->on_rq)
+               account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+       struct task_group *tg;
+       struct sched_entity *se;
+       long shares;
+
+       if (!cfs_rq)
+               return;
+
+       tg = cfs_rq->tg;
+       se = tg->se[cpu_of(rq_of(cfs_rq))];
+       if (!se)
+               return;
+#ifndef CONFIG_SMP
+       if (likely(se->load.weight == tg->shares))
+               return;
+#endif
+       shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
+
+       reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -771,6 +980,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
+       update_cfs_load(cfs_rq, 0);
+       update_cfs_shares(cfs_rq, se->load.weight);
        account_entity_enqueue(cfs_rq, se);
 
        if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +993,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        check_spread(cfs_rq, se);
        if (se != cfs_rq->curr)
                __enqueue_entity(cfs_rq, se);
+       se->on_rq = 1;
+
+       if (cfs_rq->nr_running == 1)
+               list_add_leaf_cfs_rq(cfs_rq);
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1040,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
+       se->on_rq = 0;
+       update_cfs_load(cfs_rq, 0);
        account_entity_dequeue(cfs_rq, se);
        update_min_vruntime(cfs_rq);
+       update_cfs_shares(cfs_rq, 0);
 
        /*
         * Normalize the entity after updating the min_vruntime because the
@@ -872,6 +1090,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
                struct sched_entity *se = __pick_next_entity(cfs_rq);
                s64 delta = curr->vruntime - se->vruntime;
 
+               if (delta < 0)
+                       return;
+
                if (delta > ideal_runtime)
                        resched_task(rq_of(cfs_rq)->curr);
        }
@@ -955,6 +1176,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
         */
        update_curr(cfs_rq);
 
+       /*
+        * Update share accounting for long-running entities.
+        */
+       update_entity_shares_tick(cfs_rq);
+
 #ifdef CONFIG_SCHED_HRTICK
        /*
         * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1281,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                flags = ENQUEUE_WAKEUP;
        }
 
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
        hrtick_update(rq);
 }
 
@@ -1071,12 +1304,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                dequeue_entity(cfs_rq, se, flags);
+
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight)
                        break;
                flags |= DEQUEUE_SLEEP;
        }
 
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
        hrtick_update(rq);
 }
 
@@ -1143,67 +1384,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-               long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        struct sched_entity *se = tg->se[cpu];
 
        if (!tg->parent)
                return wl;
 
-       /*
-        * By not taking the decrease of shares on the other cpu into
-        * account our error leans towards reducing the affine wakeups.
-        */
-       if (!wl && sched_feat(ASYM_EFF_LOAD))
-               return wl;
-
        for_each_sched_entity(se) {
-               long S, rw, s, a, b;
-               long more_w;
-
-               /*
-                * Instead of using this increment, also add the difference
-                * between when the shares were last updated and now.
-                */
-               more_w = se->my_q->load.weight - se->my_q->rq_weight;
-               wl += more_w;
-               wg += more_w;
+               long lw, w;
 
-               S = se->my_q->tg->shares;
-               s = se->my_q->shares;
-               rw = se->my_q->rq_weight;
+               tg = se->my_q->tg;
+               w = se->my_q->load.weight;
 
-               a = S*(rw + wl);
-               b = S*rw + s*wg;
+               /* use this cpu's instantaneous contribution */
+               lw = atomic_read(&tg->load_weight);
+               lw -= se->my_q->load_contribution;
+               lw += w + wg;
 
-               wl = s*(a-b);
+               wl += w;
 
-               if (likely(b))
-                       wl /= b;
+               if (lw > 0 && wl < lw)
+                       wl = (wl * tg->shares) / lw;
+               else
+                       wl = tg->shares;
 
-               /*
-                * Assume the group is already running and will
-                * thus already be accounted for in the weight.
-                *
-                * That is, moving shares between CPUs, does not
-                * alter the group weight.
-                */
+               /* zero point is MIN_SHARES */
+               if (wl < MIN_SHARES)
+                       wl = MIN_SHARES;
+               wl -= se->load.weight;
                wg = 0;
        }
 
@@ -1240,6 +1450,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
         * effect of the currently running task from the load
         * of the current CPU:
         */
+       rcu_read_lock();
        if (sync) {
                tg = task_group(current);
                weight = current->se.load.weight;
@@ -1275,6 +1486,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
                balanced = this_eff_load <= prev_eff_load;
        } else
                balanced = true;
+       rcu_read_unlock();
 
        /*
         * If the currently running task will sleep within
@@ -1311,7 +1523,7 @@ static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                  int this_cpu, int load_idx)
 {
-       struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+       struct sched_group *idlest = NULL, *group = sd->groups;
        unsigned long min_load = ULONG_MAX, this_load = 0;
        int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -1346,7 +1558,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
                if (local_group) {
                        this_load = avg_load;
-                       this = group;
                } else if (avg_load < min_load) {
                        min_load = avg_load;
                        idlest = group;
@@ -1507,23 +1718,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
                        sd = tmp;
        }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       if (sched_feat(LB_SHARES_UPDATE)) {
-               /*
-                * Pick the largest domain to update shares over
-                */
-               tmp = sd;
-               if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-                       tmp = affine_sd;
-
-               if (tmp) {
-                       raw_spin_unlock(&rq->lock);
-                       update_shares(tmp);
-                       raw_spin_lock(&rq->lock);
-               }
-       }
-#endif
-
        if (affine_sd) {
                if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
                        return select_idle_sibling(p, cpu);
@@ -1653,12 +1847,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        int scale = cfs_rq->nr_running >= sched_nr_latency;
 
-       if (unlikely(rt_prio(p->prio)))
-               goto preempt;
-
-       if (unlikely(p->sched_class != &fair_sched_class))
-               return;
-
        if (unlikely(se == pse))
                return;
 
@@ -1797,7 +1985,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         * 2) too many balance attempts have failed.
         */
 
-       tsk_cache_hot = task_hot(p, rq->clock, sd);
+       tsk_cache_hot = task_hot(p, rq->clock_task, sd);
        if (!tsk_cache_hot ||
                sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -1914,6 +2102,48 @@ out:
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       unsigned long flags;
+       struct rq *rq;
+
+       if (!tg->se[cpu])
+               return 0;
+
+       rq = cpu_rq(cpu);
+       cfs_rq = tg->cfs_rq[cpu];
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       update_rq_clock(rq);
+       update_cfs_load(cfs_rq, 1);
+
+       /*
+        * We need to update shares after updating tg->load_weight in
+        * order to adjust the weight of groups with long running tasks.
+        */
+       update_cfs_shares(cfs_rq, 0);
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       return 0;
+}
+
+static void update_shares(int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       struct rq *rq = cpu_rq(cpu);
+
+       rcu_read_lock();
+       for_each_leaf_cfs_rq(rq, cfs_rq)
+               update_shares_cpu(cfs_rq->tg, cpu);
+       rcu_read_unlock();
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_load_move,
@@ -1961,6 +2191,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        return max_load_move - rem_load_move;
 }
 #else
+static inline void update_shares(int cpu)
+{
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_load_move,
@@ -2029,12 +2263,17 @@ struct sd_lb_stats {
        unsigned long this_load;
        unsigned long this_load_per_task;
        unsigned long this_nr_running;
+       unsigned long this_has_capacity;
+       unsigned int  this_idle_cpus;
 
        /* Statistics of the busiest group */
+       unsigned int  busiest_idle_cpus;
        unsigned long max_load;
        unsigned long busiest_load_per_task;
        unsigned long busiest_nr_running;
        unsigned long busiest_group_capacity;
+       unsigned long busiest_has_capacity;
+       unsigned int  busiest_group_weight;
 
        int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2056,7 +2295,10 @@ struct sg_lb_stats {
        unsigned long sum_nr_running; /* Nr tasks running in the group */
        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
        unsigned long group_capacity;
+       unsigned long idle_cpus;
+       unsigned long group_weight;
        int group_imb; /* Is there an imbalance in the group ? */
+       int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
 /**
@@ -2266,10 +2508,14 @@ unsigned long scale_rt_power(int cpu)
        struct rq *rq = cpu_rq(cpu);
        u64 total, available;
 
-       sched_avg_update(rq);
-
        total = sched_avg_period() + (rq->clock - rq->age_stamp);
-       available = total - rq->rt_avg;
+
+       if (unlikely(total < rq->rt_avg)) {
+               /* Ensures that power won't end up being negative */
+               available = 0;
+       } else {
+               available = total - rq->rt_avg;
+       }
 
        if (unlikely((s64)total < SCHED_LOAD_SCALE))
                total = SCHED_LOAD_SCALE;
@@ -2354,7 +2600,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
        /*
         * If ~90% of the cpu_power is still there, we're good.
         */
-       if (group->cpu_power * 32 < group->cpu_power_orig * 29)
+       if (group->cpu_power * 32 > group->cpu_power_orig * 29)
                return 1;
 
        return 0;
@@ -2379,7 +2625,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                        int local_group, const struct cpumask *cpus,
                        int *balance, struct sg_lb_stats *sgs)
 {
-       unsigned long load, max_cpu_load, min_cpu_load;
+       unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
        int i;
        unsigned int balance_cpu = -1, first_idle_cpu = 0;
        unsigned long avg_load_per_task = 0;
@@ -2390,6 +2636,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
        /* Tally up the load of all CPUs in the group */
        max_cpu_load = 0;
        min_cpu_load = ~0UL;
+       max_nr_running = 0;
 
        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                struct rq *rq = cpu_rq(i);
@@ -2407,8 +2654,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                        load = target_load(i, load_idx);
                } else {
                        load = source_load(i, load_idx);
-                       if (load > max_cpu_load)
+                       if (load > max_cpu_load) {
                                max_cpu_load = load;
+                               max_nr_running = rq->nr_running;
+                       }
                        if (min_cpu_load > load)
                                min_cpu_load = load;
                }
@@ -2416,7 +2665,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                sgs->group_load += load;
                sgs->sum_nr_running += rq->nr_running;
                sgs->sum_weighted_load += weighted_cpuload(i);
-
+               if (idle_cpu(i))
+                       sgs->idle_cpus++;
        }
 
        /*
@@ -2425,14 +2675,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         * domains. In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
-       if (idle != CPU_NEWLY_IDLE && local_group &&
-           balance_cpu != this_cpu) {
-               *balance = 0;
-               return;
+       if (idle != CPU_NEWLY_IDLE && local_group) {
+               if (balance_cpu != this_cpu) {
+                       *balance = 0;
+                       return;
+               }
+               update_group_power(sd, this_cpu);
        }
 
-       update_group_power(sd, this_cpu);
-
        /* Adjust by relative CPU power of the group */
        sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
@@ -2448,13 +2698,59 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
        if (sgs->sum_nr_running)
                avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
                sgs->group_imb = 1;
 
-       sgs->group_capacity =
-               DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+       sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
        if (!sgs->group_capacity)
                sgs->group_capacity = fix_small_capacity(sd, group);
+       sgs->group_weight = group->group_weight;
+
+       if (sgs->group_capacity > sgs->sum_nr_running)
+               sgs->group_has_capacity = 1;
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+                                  struct sd_lb_stats *sds,
+                                  struct sched_group *sg,
+                                  struct sg_lb_stats *sgs,
+                                  int this_cpu)
+{
+       if (sgs->avg_load <= sds->max_load)
+               return false;
+
+       if (sgs->sum_nr_running > sgs->group_capacity)
+               return true;
+
+       if (sgs->group_imb)
+               return true;
+
+       /*
+        * ASYM_PACKING needs to move all the work to the lowest
+        * numbered CPUs in the group, therefore mark all groups
+        * higher than ourself as busy.
+        */
+       if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+           this_cpu < group_first_cpu(sg)) {
+               if (!sds->busiest)
+                       return true;
+
+               if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+                       return true;
+       }
+
+       return false;
 }
 
 /**
@@ -2462,7 +2758,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
@@ -2473,7 +2769,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                        struct sd_lb_stats *sds)
 {
        struct sched_domain *child = sd->child;
-       struct sched_group *group = sd->groups;
+       struct sched_group *sg = sd->groups;
        struct sg_lb_stats sgs;
        int load_idx, prefer_sibling = 0;
 
@@ -2486,45 +2782,103 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
        do {
                int local_group;
 
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_cpus(group));
+               local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
                memset(&sgs, 0, sizeof(sgs));
-               update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+               update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
                                local_group, cpus, balance, &sgs);
 
                if (local_group && !(*balance))
                        return;
 
                sds->total_load += sgs.group_load;
-               sds->total_pwr += group->cpu_power;
+               sds->total_pwr += sg->cpu_power;
 
                /*
                 * In case the child domain prefers tasks go to siblings
-                * first, lower the group capacity to one so that we'll try
-                * and move all the excess tasks away.
+                * first, lower the sg capacity to one so that we'll try
+                * and move all the excess tasks away. We lower the capacity
+                * of a group only if the local group has the capacity to fit
+                * these excess tasks, i.e. nr_running < group_capacity. The
+                * extra check prevents the case where you always pull from the
+                * heaviest group when it is already under-utilized (possible
+                * with a large weight task outweighs the tasks on the system).
                 */
-               if (prefer_sibling)
+               if (prefer_sibling && !local_group && sds->this_has_capacity)
                        sgs.group_capacity = min(sgs.group_capacity, 1UL);
 
                if (local_group) {
                        sds->this_load = sgs.avg_load;
-                       sds->this = group;
+                       sds->this = sg;
                        sds->this_nr_running = sgs.sum_nr_running;
                        sds->this_load_per_task = sgs.sum_weighted_load;
-               } else if (sgs.avg_load > sds->max_load &&
-                          (sgs.sum_nr_running > sgs.group_capacity ||
-                               sgs.group_imb)) {
+                       sds->this_has_capacity = sgs.group_has_capacity;
+                       sds->this_idle_cpus = sgs.idle_cpus;
+               } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                        sds->max_load = sgs.avg_load;
-                       sds->busiest = group;
+                       sds->busiest = sg;
                        sds->busiest_nr_running = sgs.sum_nr_running;
+                       sds->busiest_idle_cpus = sgs.idle_cpus;
                        sds->busiest_group_capacity = sgs.group_capacity;
                        sds->busiest_load_per_task = sgs.sum_weighted_load;
+                       sds->busiest_has_capacity = sgs.group_has_capacity;
+                       sds->busiest_group_weight = sgs.group_weight;
                        sds->group_imb = sgs.group_imb;
                }
 
-               update_sd_power_savings_stats(group, sds, local_group, &sgs);
-               group = group->next;
-       } while (group != sd->groups);
+               update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+               sg = sg->next;
+       } while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+       return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *                     sched doman.
+ *
+ * This is primarily intended to used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share less core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads.  It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on.  Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU.  The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalanced due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+                             struct sd_lb_stats *sds,
+                             int this_cpu, unsigned long *imbalance)
+{
+       int busiest_cpu;
+
+       if (!(sd->flags & SD_ASYM_PACKING))
+               return 0;
+
+       if (!sds->busiest)
+               return 0;
+
+       busiest_cpu = group_first_cpu(sds->busiest);
+       if (this_cpu > busiest_cpu)
+               return 0;
+
+       *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+                                      SCHED_LOAD_SCALE);
+       return 1;
 }
 
 /**
@@ -2664,6 +3018,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
                return fix_small_imbalance(sds, this_cpu, imbalance);
 
 }
+
 /******* find_busiest_group() helpers end here *********************/
 
 /**
@@ -2715,13 +3070,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         * 4) This group is more busy than the avg busieness at this
         *    sched_domain.
         * 5) The imbalance is within the specified limit.
+        *
+        * Note: when doing newidle balance, if the local group has excess
+        * capacity (i.e. nr_running < group_capacity) and the busiest group
+        * does not have any capacity, we force a load balance to pull tasks
+        * to the local group. In this case, we skip past checks 3, 4 and 5.
         */
        if (!(*balance))
                goto ret;
 
+       if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+           check_asym_packing(sd, &sds, this_cpu, imbalance))
+               return sds.busiest;
+
        if (!sds.busiest || sds.busiest_nr_running == 0)
                goto out_balanced;
 
+       /*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+       if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+                       !sds.busiest_has_capacity)
+               goto force_balance;
+
        if (sds.this_load >= sds.max_load)
                goto out_balanced;
 
@@ -2730,9 +3099,28 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        if (sds.this_load >= sds.avg_load)
                goto out_balanced;
 
-       if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-               goto out_balanced;
+       /*
+        * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
+        * And to check for busy balance use !idle_cpu instead of
+        * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
+        * even when they are idle.
+        */
+       if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+               if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                       goto out_balanced;
+       } else {
+               /*
+                * This cpu is idle. If the busiest group load doesn't
+                * have more tasks than the number of available cpu's and
+                * there is no imbalance between this and busiest group
+                * wrt to idle cpu's, it is balanced.
+                */
+               if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+                   sds.busiest_nr_running <= sds.busiest_group_weight)
+                       goto out_balanced;
+       }
 
+force_balance:
        /* Looks like there is an imbalance. Compute it */
        calculate_imbalance(&sds, this_cpu, imbalance);
        return sds.busiest;
@@ -2808,9 +3196,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+                              int busiest_cpu, int this_cpu)
 {
        if (idle == CPU_NEWLY_IDLE) {
+
+               /*
+                * ASYM_PACKING needs to force migrate tasks from busy but
+                * higher numbered CPUs in order to pack all tasks in the
+                * lowest numbered CPUs.
+                */
+               if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+                       return 1;
+
                /*
                 * The only task running in a non-idle cpu can be moved to this
                 * cpu in an attempt to completely freeup the other CPU
@@ -2873,7 +3271,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        schedstat_inc(sd, lb_count[idle]);
 
 redo:
-       update_shares(sd);
        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                   cpus, balance);
 
@@ -2927,9 +3324,17 @@ redo:
 
        if (!ld_moved) {
                schedstat_inc(sd, lb_failed[idle]);
-               sd->nr_balance_failed++;
+               /*
+                * Increment the failure counter only on periodic balance.
+                * We do not want newidle balance, which can be very
+                * frequent, pollute the failure counter causing
+                * excessive cache_hot migrations and active balances.
+                */
+               if (idle != CPU_NEWLY_IDLE)
+                       sd->nr_balance_failed++;
 
-               if (need_active_balance(sd, sd_idle, idle)) {
+               if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+                                       this_cpu)) {
                        raw_spin_lock_irqsave(&busiest->lock, flags);
 
                        /* don't kick the active_load_balance_cpu_stop,
@@ -3007,8 +3412,6 @@ out_one_pinned:
        else
                ld_moved = 0;
 out:
-       if (ld_moved)
-               update_shares(sd);
        return ld_moved;
 }
 
@@ -3032,6 +3435,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
         */
        raw_spin_unlock(&this_rq->lock);
 
+       update_shares(this_cpu);
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                int balance = 1;
@@ -3402,6 +3806,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        int update_next_balance = 0;
        int need_serialize;
 
+       update_shares(cpu);
+
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
@@ -3490,6 +3896,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
                }
 
                raw_spin_lock_irq(&this_rq->lock);
+               update_rq_clock(this_rq);
                update_cpu_load(this_rq);
                raw_spin_unlock_irq(&this_rq->lock);
 
@@ -3524,7 +3931,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
        if (time_before(now, nohz.next_balance))
                return 0;
 
-       if (!rq->nr_running)
+       if (rq->idle_at_tick)
                return 0;
 
        first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -3643,8 +4050,13 @@ static void task_fork_fair(struct task_struct *p)
 
        raw_spin_lock_irqsave(&rq->lock, flags);
 
-       if (unlikely(task_cpu(p) != this_cpu))
+       update_rq_clock(rq);
+
+       if (unlikely(task_cpu(p) != this_cpu)) {
+               rcu_read_lock();
                __set_task_cpu(p, this_cpu);
+               rcu_read_unlock();
+       }
 
        update_curr(cfs_rq);
 
@@ -3716,13 +4128,26 @@ static void set_curr_task_fair(struct rq *rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
-       struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
-       update_curr(cfs_rq);
+       /*
+        * If the task was not on the rq at the time of this cgroup movement
+        * it must have been asleep, sleeping tasks keep their ->vruntime
+        * absolute on their old rq until wakeup (needed for the fair sleeper
+        * bonus in place_entity()).
+        *
+        * If it was on the rq, we've just 'preempted' it, which does convert
+        * ->vruntime to a relative base.
+        *
+        * Make sure both cases convert their relative position when migrating
+        * to another cgroup's rq. This does somewhat interfere with the
+        * fair sleeper stuff for the first placement, but who cares.
+        */
+       if (!on_rq)
+               p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+       set_task_rq(p, task_cpu(p));
        if (!on_rq)
-               place_entity(cfs_rq, &p->se, 1);
+               p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
 }
 #endif
 
@@ -3774,7 +4199,7 @@ static const struct sched_class fair_sched_class = {
        .get_rr_interval        = get_rr_interval_fair,
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-       .moved_group            = moved_group_fair,
+       .task_move_group        = task_move_group_fair,
 #endif
 };