revert ("sched: fair: weight calculations")
Ingo Molnar [Thu, 29 May 2008 09:23:17 +0000 (11:23 +0200)]
Yanmin Zhang reported:

Compared with kernel 2.6.25, sysbench+mysql (oltp, readonly) shows
significant regressions with 2.6.26-rc1:

 1) 8-core Stoakley: 28%;
 2) 16-core Tigerton: 20%;
 3) Itanium Montvale: 50%.

Bisection located this patch:

8f1bc385cfbab474db6c27b5af1e439614f3025c is first bad commit
| commit 8f1bc385cfbab474db6c27b5af1e439614f3025c
| Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
| Date:   Sat Apr 19 19:45:00 2008 +0200
|
|     sched: fair: weight calculations

Revert it to the 2.6.25 state.

Bisected-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

kernel/sched.c
kernel/sched_fair.c

diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a..4aac8aa 100644
@@ -1368,9 +1368,6 @@ static void __resched_task(struct task_struct *p, int tif_bit)
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
-/*
- * delta *= weight / lw
- */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                struct load_weight *lw)
@@ -1393,6 +1390,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+       return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
        lw->weight += inc;
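
For reference, the restored calc_delta_fair() above is just calc_delta_mine()
with a NICE_0_LOAD numerator, i.e. delta * NICE_0_LOAD / lw->weight evaluated
through a precomputed fixed-point inverse. A minimal user-space sketch of that
arithmetic (assuming WMULT_SHIFT is 32 and NICE_0_LOAD is 1024 as in this
kernel series, with a simplified inverse; the kernel additionally splits the
shift to avoid 64-bit overflow):

  #include <stdint.h>
  #include <stdio.h>

  #define WMULT_SHIFT 32
  #define NICE_0_LOAD 1024UL

  /* shift right and round, like the SRR() macro in kernel/sched.c above */
  #define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

  /* delta * weight / lw_weight via a precomputed 32.32 inverse */
  static unsigned long calc_delta(unsigned long delta, unsigned long weight,
                                  unsigned long lw_weight)
  {
          uint64_t inv = ((uint64_t)1 << WMULT_SHIFT) / lw_weight;

          return (unsigned long)SRR((uint64_t)delta * weight * inv,
                                    WMULT_SHIFT);
  }

  int main(void)
  {
          /* 3ms from a nice-0 task (weight 1024) on a runqueue of total
           * weight 3072 scales to roughly 1ms of weighted runtime */
          printf("%lu\n", calc_delta(3000000, NICE_0_LOAD, 3072));
          return 0;
  }
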
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd3..0eb0ae8 100644
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 
 /*
- * delta *= w / rw
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
-       for_each_sched_entity(se) {
-               delta = calc_delta_mine(delta,
-                               se->load.weight, &cfs_rq_of(se)->load);
-       }
-
-       return delta;
-}
-
-/*
- * delta *= rw / w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
-       for_each_sched_entity(se) {
-               delta = calc_delta_mine(delta,
-                               cfs_rq_of(se)->load.weight, &se->load);
-       }
-
-       return delta;
-}
-
-/*
  * The idea is to set a period in which each task runs once.
  *
  * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+       u64 slice = __sched_period(cfs_rq->nr_running);
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+
+               slice *= se->load.weight;
+               do_div(slice, cfs_rq->load.weight);
+       }
+
+
+       return slice;
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s*rw/w = p
+ * vs = s/w = p/rw
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        unsigned long nr_running = cfs_rq->nr_running;
+       unsigned long weight;
+       u64 vslice;
 
        if (!se->on_rq)
                nr_running++;
 
-       return __sched_period(nr_running);
-}
-
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
-       struct load_weight lw = {
-               .weight = NICE_0_LOAD,
-               .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-       };
+       vslice = __sched_period(nr_running);
 
        for_each_sched_entity(se) {
-               struct load_weight *se_lw = &se->load;
+               cfs_rq = cfs_rq_of(se);
 
-               if (se->load.weight < NICE_0_LOAD)
-                       se_lw = &lw;
+               weight = cfs_rq->load.weight;
+               if (!se->on_rq)
+                       weight += se->load.weight;
 
-               delta = calc_delta_mine(delta,
-                               cfs_rq_of(se)->load.weight, se_lw);
+               vslice *= NICE_0_LOAD;
+               do_div(vslice, weight);
        }
 
-       return delta;
+       return vslice;
 }
 
 /*
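
To see what the restored slice math computes: sched_slice() gives each entity
p * w / rw of the period, and sched_vslice_add() gives p * NICE_0_LOAD / rw of
virtual time, matching the vs = s/w = p/rw comment above. A single-level
sketch (assuming the 20ms default sched_latency of this era; the
for_each_sched_entity() hierarchy walk is omitted):

  #include <stdio.h>

  #define NICE_0_LOAD 1024ULL

  /* slice = p * w / rw, one level of the restored sched_slice() */
  static unsigned long long slice(unsigned long long p,
                                  unsigned long long w, unsigned long long rw)
  {
          return p * w / rw;
  }

  /* vslice = p * NICE_0_LOAD / rw, as in the restored sched_vslice_add() */
  static unsigned long long vslice(unsigned long long p, unsigned long long rw)
  {
          return p * NICE_0_LOAD / rw;
  }

  int main(void)
  {
          unsigned long long p = 20000000ULL; /* 20ms period, in ns */

          /* a nice 0 task (1024) and a nice -5 task (3121): rw = 4145 */
          printf("slices: %llu %llu ns\n",
                 slice(p, 1024, 4145), slice(p, 3121, 4145));
          printf("vslice: %llu ns each\n", vslice(p, 4145));
          return 0;
  }

The heavier task gets a ~15.1ms wall-clock slice against ~4.9ms for the nice 0
task, yet both receive the same ~4.9ms vslice: each entity consumes equal
virtual time per period, which is the invariant CFS preserves.
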
@@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
-       delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+       delta_exec_weighted = delta_exec;
+       if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+               delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+                                                       &curr->load);
+       }
        curr->vruntime += delta_exec_weighted;
 }
 
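
The restored __update_curr() thus advances vruntime at wall-clock rate in the
common NICE_0_LOAD case and only pays for calc_delta_mine() when the weight
differs. A sketch of the resulting rates (3121 and 335 are the nice -5 and
nice +5 entries of the kernel's prio_to_weight[] table):

  #include <stdio.h>

  #define NICE_0_LOAD 1024ULL

  /* vruntime advance per delta_exec, as in the restored __update_curr() */
  static unsigned long long weighted(unsigned long long delta_exec,
                                     unsigned long long weight)
  {
          if (weight != NICE_0_LOAD)      /* calc_delta_fair() path */
                  return delta_exec * NICE_0_LOAD / weight;
          return delta_exec;              /* common case: no division */
  }

  int main(void)
  {
          /* 1ms of CPU time at nice 0, nice -5 and nice +5 */
          printf("nice  0: %llu ns\n", weighted(1000000, 1024));
          printf("nice -5: %llu ns\n", weighted(1000000, 3121));
          printf("nice +5: %llu ns\n", weighted(1000000, 335));
          return 0;
  }

Heavier tasks accrue vruntime more slowly, so they stay toward the left of the
rbtree longer; that is how weight turns into CPU share.
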
@@ -661,17 +630,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
        if (!initial) {
                /* sleeps upto a single latency don't count. */
-               if (sched_feat(NEW_FAIR_SLEEPERS)) {
-                       unsigned long thresh = sysctl_sched_latency;
-
-                       /*
-                        * convert the sleeper threshold into virtual time
-                        */
-                       if (sched_feat(NORMALIZED_SLEEPER))
-                               thresh = calc_delta_fair(thresh, se);
-
-                       vruntime -= thresh;
-               }
+               if (sched_feat(NEW_FAIR_SLEEPERS))
+                       vruntime -= sysctl_sched_latency;
 
                /* ensure we never gain time by being placed backwards. */
                vruntime = max_vruntime(se->vruntime, vruntime);
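
With the revert, a waking sleeper is simply credited one full latency period
in wall time, rather than a threshold normalized into virtual time under
NORMALIZED_SLEEPER. A sketch of the restored placement (assuming the 20ms
default sysctl_sched_latency; max_vruntime() keeps a task from gaining time by
being placed backwards, per the context above):

  #include <stdio.h>

  typedef unsigned long long u64;

  static u64 max_vruntime(u64 a, u64 b)
  {
          return a > b ? a : b;
  }

  /* restored sleeper placement: credit one latency, never move backwards */
  static u64 place_sleeper(u64 min_vruntime, u64 se_vruntime)
  {
          u64 vruntime = min_vruntime - 20000000ULL; /* assumed 20ms */

          return max_vruntime(se_vruntime, vruntime);
  }

  int main(void)
  {
          /* long sleeper: placed one full latency behind min_vruntime */
          printf("%llu\n", place_sleeper(100000000ULL, 50000000ULL));
          /* short sleeper: keeps its own, higher vruntime */
          printf("%llu\n", place_sleeper(100000000ULL, 95000000ULL));
          return 0;
  }
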
@@ -1169,10 +1129,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
        unsigned long gran = sysctl_sched_wakeup_granularity;
 
        /*
-        * More easily preempt - nice tasks, while not making it harder for
-        * + nice tasks.
+        * More easily preempt - nice tasks, while not making
+        * it harder for + nice tasks.
         */
-       gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+       if (unlikely(se->load.weight > NICE_0_LOAD))
+               gran = calc_delta_fair(gran, &se->load);
 
        return gran;
 }
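
Finally, the restored wakeup_gran() scales the granularity down only for
heavier-than-default entities (gran * NICE_0_LOAD / weight) and leaves lighter
ones at the full sysctl value, implementing the asymmetry the comment
describes. A numeric sketch (assuming the 10ms default wakeup granularity of
this era):

  #include <stdio.h>

  #define NICE_0_LOAD 1024ULL

  /* restored wakeup_gran(): scale down only when weight > NICE_0_LOAD */
  static unsigned long long wakeup_gran(unsigned long long gran,
                                        unsigned long long weight)
  {
          if (weight > NICE_0_LOAD)       /* calc_delta_fair() path */
                  gran = gran * NICE_0_LOAD / weight;
          return gran;
  }

  int main(void)
  {
          unsigned long long gran = 10000000ULL; /* assumed 10ms, in ns */

          printf("nice  0: %llu ns\n", wakeup_gran(gran, 1024));
          printf("nice -5: %llu ns\n", wakeup_gran(gran, 3121)); /* ~3.3ms */
          printf("nice +5: %llu ns\n", wakeup_gran(gran, 335));  /* full */
          return 0;
  }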