scheduler: Re-compute time-average nr_running on read
Alex Frid [Fri, 18 May 2012 19:18:38 +0000 (12:18 -0700)]
Re-compute the time-averaged nr_running when it is read. This
prevents returning a stale average value if there were no run-queue
changes for a long time. The freshly computed average is returned
to the reader but not stored back, to avoid concurrent writes.
A light-weight sequence counter is used to ensure the data used in
re-computing the average is consistent.
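
The read side follows the usual seqcount idiom from
<linux/seqlock.h>; a minimal sketch of the generic retry-loop form
(illustrative only, not code from this patch):

    unsigned int seq, ave;

    do {
            seq = read_seqcount_begin(&rq->ave_seqcnt);
            ave = do_avg_nr_running(rq);
    } while (read_seqcount_retry(&rq->ave_seqcnt, seq));

avg_nr_running() below deliberately avoids the loop: if the retry
check trips, a writer raced with the reader and has just stored a
fresh average, so the reader falls back to rq->ave_nr_running
instead of re-computing.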

Change-Id: I8e4ea1b28ea00b3ddaf6ef7cdcd27866f87d360b
Signed-off-by: Alex Frid <afrid@nvidia.com>
(cherry picked from commit 527a759d9b40bf57958eb002edd2bb82014dab99)
Reviewed-on: http://git-master/r/111637
Reviewed-by: Sai Gurrappadi <sgurrappadi@nvidia.com>
Tested-by: Sai Gurrappadi <sgurrappadi@nvidia.com>
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: Peter Boonstoppel <pboonstoppel@nvidia.com>
Reviewed-by: Yu-Huan Hsu <yhsu@nvidia.com>

kernel/sched.c

index 8b1b096..bb40a1b 100644
@@ -475,6 +475,7 @@ struct rq {
        /* time-based average load */
        u64 nr_last_stamp;
        unsigned int ave_nr_running;
+       seqcount_t ave_seqcnt;
 
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
@@ -1770,31 +1771,39 @@ static const struct sched_class rt_sched_class;
 #define NR_AVE_PERIOD          (1 << NR_AVE_PERIOD_EXP)
 #define NR_AVE_DIV_PERIOD(x)   ((x) >> NR_AVE_PERIOD_EXP)
 
-static inline void do_avg_nr_running(struct rq *rq)
+static inline unsigned int do_avg_nr_running(struct rq *rq)
 {
        s64 nr, deltax;
+       unsigned int ave_nr_running = rq->ave_nr_running;
 
        deltax = rq->clock_task - rq->nr_last_stamp;
-       rq->nr_last_stamp = rq->clock_task;
        nr = NR_AVE_SCALE(rq->nr_running);
 
        if (deltax > NR_AVE_PERIOD)
-               rq->ave_nr_running = nr;
+               ave_nr_running = nr;
        else
-               rq->ave_nr_running +=
-                       NR_AVE_DIV_PERIOD(deltax * (nr - rq->ave_nr_running));
+               ave_nr_running +=
+                       NR_AVE_DIV_PERIOD(deltax * (nr - ave_nr_running));
+
+       return ave_nr_running;
 }
 
 static void inc_nr_running(struct rq *rq)
 {
-       do_avg_nr_running(rq);
+       write_seqcount_begin(&rq->ave_seqcnt);
+       rq->ave_nr_running = do_avg_nr_running(rq);
+       rq->nr_last_stamp = rq->clock_task;
        rq->nr_running++;
+       write_seqcount_end(&rq->ave_seqcnt);
 }
 
 static void dec_nr_running(struct rq *rq)
 {
-       do_avg_nr_running(rq);
+       write_seqcount_begin(&rq->ave_seqcnt);
+       rq->ave_nr_running = do_avg_nr_running(rq);
+       rq->nr_last_stamp = rq->clock_task;
        rq->nr_running--;
+       write_seqcount_end(&rq->ave_seqcnt);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -3289,9 +3298,26 @@ unsigned long nr_iowait(void)
 unsigned long avg_nr_running(void)
 {
        unsigned long i, sum = 0;
+       unsigned int seqcnt, ave_nr_running;
 
-       for_each_online_cpu(i)
-               sum += cpu_rq(i)->ave_nr_running;
+       for_each_online_cpu(i) {
+               struct rq *q = cpu_rq(i);
+
+               /*
+                * Update the average to avoid reading a stale value if
+                * there were no run-queue changes for a long time. On the
+                * other hand, if changes are happening right now, just
+                * read the current value directly.
+                */
+               seqcnt = read_seqcount_begin(&q->ave_seqcnt);
+               ave_nr_running = do_avg_nr_running(q);
+               if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
+                       read_seqcount_begin(&q->ave_seqcnt);
+                       ave_nr_running = q->ave_nr_running;
+               }
+
+               sum += ave_nr_running;
+       }
 
        return sum;
 }
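
The update in do_avg_nr_running() is a first-order low-pass filter in
fixed point: ave += (deltax / 2^NR_AVE_PERIOD_EXP) * (nr - ave), with
nr_running scaled up by NR_AVE_SCALE() to keep a fractional part, and
a reset to the instantaneous value once deltax exceeds one full
period. A self-contained userspace sketch of the same arithmetic,
with illustrative constants (the real NR_AVE_PERIOD_EXP and
NR_AVE_SCALE() are defined elsewhere in sched.c and may differ):

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative constants; the kernel's actual values live in
     * sched.c and are not shown in the hunks above. */
    #define AVE_PERIOD_EXP  20                  /* T = 2^20 time units */
    #define AVE_PERIOD      (1LL << AVE_PERIOD_EXP)
    #define AVE_SCALE_EXP   11                  /* fraction bits */
    #define AVE_SCALE(x)    ((int64_t)(x) << AVE_SCALE_EXP)

    /* Same shape as do_avg_nr_running(): a pure function of its
     * inputs, storing nothing. */
    static unsigned int avg_step(unsigned int ave,
                                 unsigned int nr_running,
                                 int64_t deltax)
    {
            int64_t nr = AVE_SCALE(nr_running);

            if (deltax > AVE_PERIOD)
                    return (unsigned int)nr;    /* history decayed */
            return ave + (unsigned int)
                   ((deltax * (nr - ave)) >> AVE_PERIOD_EXP);
    }

    int main(void)
    {
            /* Start with an average of exactly 1.0 tasks. */
            unsigned int ave = (unsigned int)AVE_SCALE(1);

            /* Two tasks runnable for a quarter period pulls the
             * average a quarter of the way from 1.0 toward 2.0. */
            ave = avg_step(ave, 2, AVE_PERIOD / 4);
            printf("ave = %u.%03u tasks\n", ave >> AVE_SCALE_EXP,
                   (unsigned int)(((uint64_t)
                   (ave & ((1u << AVE_SCALE_EXP) - 1)) * 1000)
                   >> AVE_SCALE_EXP));
            return 0;
    }

Compiled with any C99 compiler this prints "ave = 1.250 tasks",
i.e. 1 + 0.25 * (2 - 1), matching the filter's expected step.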