Merge branches 'sched/rt' and 'sched/urgent' into sched/core
[linux-2.6.git] / kernel / sched.c
index e1fc67d..1dae85a 100644 (file)
@@ -467,11 +467,17 @@ struct rt_rq {
        struct rt_prio_array active;
        unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       int highest_prio; /* highest queued rt task prio */
+       struct {
+               int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+               int next; /* next highest */
+#endif
+       } highest_prio;
 #endif
 #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
+       struct plist_head pushable_tasks;
 #endif
        int rt_throttled;
        u64 rt_time;
@@ -1610,21 +1616,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+#ifdef CONFIG_PREEMPT
+
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations.  This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below.  However, it
+ * also adds more overhead and therefore may reduce throughput.
  */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       spin_unlock(&this_rq->lock);
+       double_rq_lock(this_rq, busiest);
+
+       return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry.  This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
 {
        int ret = 0;
 
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
@@ -1637,6 +1664,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
        return ret;
 }
 
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work good under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+
+       return _double_lock_balance(this_rq, busiest);
+}
+
 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
 {
@@ -1705,6 +1748,9 @@ static void update_avg(u64 *avg, u64 sample)
 
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
+       if (wakeup)
+               p->se.start_runtime = p->se.sum_exec_runtime;
+
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, wakeup);
        p->se.on_rq = 1;
@@ -1712,10 +1758,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
-       if (sleep && p->se.last_wakeup) {
-               update_avg(&p->se.avg_overlap,
-                          p->se.sum_exec_runtime - p->se.last_wakeup);
-               p->se.last_wakeup = 0;
+       if (sleep) {
+               if (p->se.last_wakeup) {
+                       update_avg(&p->se.avg_overlap,
+                               p->se.sum_exec_runtime - p->se.last_wakeup);
+                       p->se.last_wakeup = 0;
+               } else {
+                       update_avg(&p->se.avg_wakeup,
+                               sysctl_sched_wakeup_granularity);
+               }
        }
 
        sched_info_dequeued(p);
@@ -2355,6 +2406,22 @@ out_activate:
        activate_task(rq, p, 1);
        success = 1;
 
+       /*
+        * Only attribute actual wakeups done by this task.
+        */
+       if (!in_interrupt()) {
+               struct sched_entity *se = &current->se;
+               u64 sample = se->sum_exec_runtime;
+
+               if (se->last_wakeup)
+                       sample -= se->last_wakeup;
+               else
+                       sample -= se->start_runtime;
+               update_avg(&se->avg_wakeup, sample);
+
+               se->last_wakeup = se->sum_exec_runtime;
+       }
+
 out_running:
        trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
@@ -2365,8 +2432,6 @@ out_running:
                p->sched_class->task_wake_up(rq, p);
 #endif
 out:
-       current->se.last_wakeup = current->se.sum_exec_runtime;
-
        task_rq_unlock(rq, &flags);
 
        return success;
@@ -2396,6 +2461,8 @@ static void __sched_fork(struct task_struct *p)
        p->se.prev_sum_exec_runtime     = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
+       p->se.start_runtime             = 0;
+       p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                = 0;
@@ -2458,6 +2525,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
 #endif
+       plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
        put_cpu();
 }
 
@@ -2598,6 +2667,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
+#ifdef CONFIG_SMP
+       int post_schedule = 0;
+
+       if (current->sched_class->needs_post_schedule)
+               post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
 
        rq->prev_mm = NULL;
 
@@ -2616,7 +2691,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
-       if (current->sched_class->post_schedule)
+       if (post_schedule)
                current->sched_class->post_schedule(rq);
 #endif
 
@@ -2997,6 +3072,16 @@ next:
        pulled++;
        rem_load_move -= p->se.load.weight;
 
+#ifdef CONFIG_PREEMPT
+       /*
+        * NEWIDLE balancing is a source of latency, so preemptible kernels
+        * will stop after the first task is pulled to minimize the critical
+        * section.
+        */
+       if (idle == CPU_NEWLY_IDLE)
+               goto out;
+#endif
+
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@ -3043,9 +3128,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
 
+#ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
-
+#endif
        } while (class && max_load_move > total_load_moved);
 
        return total_load_moved > 0;
@@ -8219,11 +8310,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
        __set_bit(MAX_RT_PRIO, array->bitmap);
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       rt_rq->highest_prio = MAX_RT_PRIO;
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+       rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
 #endif
 #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
+       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
 #endif
 
        rt_rq->rt_time = 0;