nv-tegra.nvidia Code Review - linux-2.6.git/blobdiff - kernel/sched.c
sched: Also serialize ttwu_local() with p->pi_lock
[linux-2.6.git] / kernel / sched.c
index 66ca5d9ba83ce561503ce5826f5c6808103902af..6b269b79c52c84bb5a2f00025cd56e1314af6742 100644 (file)
@@ -32,7 +32,6 @@
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
@@ -313,6 +312,9 @@ struct cfs_rq {
 
        u64 exec_clock;
        u64 min_vruntime;
+#ifndef CONFIG_64BIT
+       u64 min_vruntime_copy;
+#endif
 
        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;
@@ -324,7 +326,7 @@ struct cfs_rq {
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
-       struct sched_entity *curr, *next, *last;
+       struct sched_entity *curr, *next, *last, *skip;
 
        unsigned int nr_spread_over;
 
@@ -606,9 +608,6 @@ static inline struct task_group *task_group(struct task_struct *p)
        struct task_group *tg;
        struct cgroup_subsys_state *css;
 
-       if (p->flags & PF_EXITING)
-               return &root_task_group;
-
        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
                        lockdep_is_held(&task_rq(p)->lock));
        tg = container_of(css, struct task_group, css);
@@ -664,10 +663,9 @@ static void update_rq_clock(struct rq *rq)
 #endif
 
 /**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
  * @cpu: the processor in question.
  *
- * Returns true if the current cpu runqueue is locked.
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
@@ -843,18 +841,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
        return rq->curr == p;
 }
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+       return p->on_cpu;
+#else
        return task_current(rq, p);
+#endif
 }
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
+#ifdef CONFIG_SMP
+       /*
+        * We can optimise this out completely for !SMP, because the
+        * SMP rebalancing from interrupt is the only thing that cares
+        * here.
+        */
+       next->on_cpu = 1;
+#endif
 }
 
 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
+#ifdef CONFIG_SMP
+       /*
+        * After ->on_cpu is cleared, the task can be moved to a different CPU.
+        * We must ensure this doesn't happen until the switch is completely
+        * finished.
+        */
+       smp_wmb();
+       prev->on_cpu = 0;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
        /* this is a valid case when another task releases the spinlock */
        rq->lock.owner = current;
@@ -870,15 +889,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 }
 
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-       return p->oncpu;
-#else
-       return task_current(rq, p);
-#endif
-}
-
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
 #ifdef CONFIG_SMP
@@ -887,7 +897,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
         * SMP rebalancing from interrupt is the only thing that cares
         * here.
         */
-       next->oncpu = 1;
+       next->on_cpu = 1;
 #endif
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        raw_spin_unlock_irq(&rq->lock);
@@ -900,12 +910,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
 #ifdef CONFIG_SMP
        /*
-        * After ->oncpu is cleared, the task can be moved to a different CPU.
+        * After ->on_cpu is cleared, the task can be moved to a different CPU.
         * We must ensure this doesn't happen until the switch is completely
         * finished.
         */
        smp_wmb();
-       prev->oncpu = 0;
+       prev->on_cpu = 0;
 #endif
 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_enable();
@@ -1686,6 +1696,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
                __release(rq2->lock);
 }
 
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+       __acquires(rq1->lock)
+       __acquires(rq2->lock)
+{
+       BUG_ON(!irqs_disabled());
+       BUG_ON(rq1 != rq2);
+       raw_spin_lock(&rq1->lock);
+       __acquire(rq2->lock);   /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+       __releases(rq1->lock)
+       __releases(rq2->lock)
+{
+       BUG_ON(rq1 != rq2);
+       raw_spin_unlock(&rq1->lock);
+       __release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
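
The !SMP variants added above sidestep lock ordering entirely: with a single runqueue, "both" locks are the same lock and it is taken exactly once (the __acquire()/__release() calls only keep the sparse annotations balanced). For contrast, here is a small userspace sketch of both shapes, with pthread_mutex_t standing in for rq->lock and pointer order standing in for the SMP variant's ordering rule; every name below is illustrative, not kernel API:

    #include <pthread.h>
    #include <stdio.h>

    struct rq { pthread_mutex_t lock; };

    /*
     * SMP-style analogue: take two distinct locks in a fixed (address)
     * order so two CPUs locking the same pair cannot deadlock; if both
     * pointers name the same queue, lock it only once.
     */
    static void toy_double_lock(struct rq *rq1, struct rq *rq2)
    {
            if (rq1 == rq2) {
                    pthread_mutex_lock(&rq1->lock);  /* UP case: one lock */
            } else if (rq1 < rq2) {
                    pthread_mutex_lock(&rq1->lock);
                    pthread_mutex_lock(&rq2->lock);
            } else {
                    pthread_mutex_lock(&rq2->lock);
                    pthread_mutex_lock(&rq1->lock);
            }
    }

    static void toy_double_unlock(struct rq *rq1, struct rq *rq2)
    {
            pthread_mutex_unlock(&rq1->lock);
            if (rq1 != rq2)
                    pthread_mutex_unlock(&rq2->lock);
    }

    int main(void)
    {
            struct rq a = { PTHREAD_MUTEX_INITIALIZER };
            struct rq b = { PTHREAD_MUTEX_INITIALIZER };

            toy_double_lock(&a, &b);        /* distinct queues: both locks, ordered */
            toy_double_unlock(&a, &b);
            toy_double_lock(&a, &a);        /* single queue: locked only once */
            toy_double_unlock(&a, &a);
            printf("ok\n");
            return 0;
    }
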
@@ -1745,7 +1788,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
        update_rq_clock(rq);
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, flags);
-       p->se.on_rq = 1;
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1753,7 +1795,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
        update_rq_clock(rq);
        sched_info_dequeued(p);
        p->sched_class->dequeue_task(rq, p, flags);
-       p->se.on_rq = 0;
 }
 
 /*
@@ -1880,7 +1921,7 @@ void account_system_vtime(struct task_struct *curr)
         */
        if (hardirq_count())
                __this_cpu_add(cpu_hardirq_time, delta);
-       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
                __this_cpu_add(cpu_softirq_time, delta);
 
        irq_time_write_end();
@@ -1920,8 +1961,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
                sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
+
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_hardirq_time);
+       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
+
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_softirq_time);
+       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime    (0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
        rq->clock_task += delta;
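
irqtime_account_hi_update() and irqtime_account_si_update() above answer a single question: has more hard-/soft-irq time accumulated (tracked per cpu in nanoseconds) than has already been folded into the corresponding cpustat counter? A rough standalone illustration of that comparison follows, modelling cputime64 as a plain tick count; NSEC_PER_TICK and irq_tick_pending() are invented names for the sketch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_TICK 1000000ULL   /* pretend HZ=1000: one tick = 1 ms */

    /* Has more irq time accumulated than has already been charged? */
    static bool irq_tick_pending(uint64_t accumulated_ns, uint64_t accounted_ticks)
    {
            return accumulated_ns / NSEC_PER_TICK > accounted_ticks;
    }

    int main(void)
    {
            /* 3.2 ms of hardirq time seen, 2 ticks charged so far: pending */
            printf("pending = %d\n", irq_tick_pending(3200000ULL, 2));
            /* only 1.2 ms seen, 2 ticks charged: nothing new to account */
            printf("pending = %d\n", irq_tick_pending(1200000ULL, 2));
            return 0;
    }
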
@@ -2025,14 +2098,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                       const struct sched_class *prev_class,
-                                      int oldprio, int running)
+                                      int oldprio)
 {
        if (prev_class != p->sched_class) {
                if (prev_class->switched_from)
-                       prev_class->switched_from(rq, p, running);
-               p->sched_class->switched_to(rq, p, running);
-       } else
-               p->sched_class->prio_changed(rq, p, oldprio, running);
+                       prev_class->switched_from(rq, p);
+               p->sched_class->switched_to(rq, p);
+       } else if (oldprio != p->prio)
+               p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2056,7 +2129,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
         * A queue event has occurred, and we're going to schedule.  In
         * this case, we can save a useless back to back clock update.
         */
-       if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+       if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
                rq->skip_clock_update = 1;
 }
 
@@ -2125,13 +2198,15 @@ static int migration_cpu_stop(void *data);
  * The task's runqueue lock must be held.
  * Returns true if you have to wait for migration thread.
  */
-static bool migrate_task(struct task_struct *p, struct rq *rq)
+static bool need_migrate_task(struct task_struct *p)
 {
        /*
         * If the task is not on a runqueue (and not running), then
         * the next wake-up will properly place the task.
         */
-       return p->se.on_rq || task_running(rq, p);
+       bool running = p->on_rq || p->on_cpu;
+       smp_rmb(); /* finish_lock_switch() */
+       return running;
 }
 
 /*
@@ -2191,7 +2266,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                rq = task_rq_lock(p, &flags);
                trace_sched_wait_task(p);
                running = task_running(rq, p);
-               on_rq = p->se.on_rq;
+               on_rq = p->on_rq;
                ncsw = 0;
                if (!match_state || p->state == match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -2249,7 +2324,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
  * Cause a process which is running on another CPU to enter
  * kernel-mode, without any delay. (to get signals handled.)
  *
- * NOTE: this function doesnt have to take the runqueue lock,
+ * NOTE: this function doesn't have to take the runqueue lock,
  * because all it wants to ensure is that the remote task enters
  * the kernel. If the IPI races and the task has been migrated
  * to another CPU then no harm is done and the purpose has been
@@ -2268,30 +2343,9 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p:         the task to evaluate
- * @func:      the function to be called
- * @info:      the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
-                             void (*func) (void *info), void *info)
-{
-       int cpu;
-
-       preempt_disable();
-       cpu = task_cpu(p);
-       if (task_curr(p))
-               smp_call_function_single(cpu, func, info, 1);
-       preempt_enable();
-}
-
 #ifdef CONFIG_SMP
 /*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
@@ -2324,12 +2378,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 {
-       int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
 
        /*
         * In order not to call set_task_cpu() on a blocking task we need
@@ -2355,27 +2409,55 @@ static void update_avg(u64 *avg, u64 sample)
 }
 #endif
 
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
-                                bool is_sync, bool is_migrate, bool is_local,
-                                unsigned long en_flags)
+static void
+ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags)
 {
+#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SMP
+       int this_cpu = smp_processor_id();
+
+       if (cpu == this_cpu) {
+               schedstat_inc(rq, ttwu_local);
+               schedstat_inc(p, se.statistics.nr_wakeups_local);
+       } else {
+               struct sched_domain *sd;
+
+               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+               for_each_domain(this_cpu, sd) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+                               schedstat_inc(sd, ttwu_wake_remote);
+                               break;
+                       }
+               }
+       }
+#endif /* CONFIG_SMP */
+
+       schedstat_inc(rq, ttwu_count);
        schedstat_inc(p, se.statistics.nr_wakeups);
-       if (is_sync)
+
+       if (wake_flags & WF_SYNC)
                schedstat_inc(p, se.statistics.nr_wakeups_sync);
-       if (is_migrate)
+
+       if (cpu != task_cpu(p))
                schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-       if (is_local)
-               schedstat_inc(p, se.statistics.nr_wakeups_local);
-       else
-               schedstat_inc(p, se.statistics.nr_wakeups_remote);
 
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
        activate_task(rq, p, en_flags);
+       p->on_rq = 1;
+
+       /* if a worker is waking up, notify workqueue */
+       if (p->flags & PF_WQ_WORKER)
+               wq_worker_waking_up(p, cpu_of(rq));
 }
 
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
-                                       int wake_flags, bool success)
+static void
+ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags)
 {
-       trace_sched_wakeup(p, success);
+       trace_sched_wakeup(p, true);
        check_preempt_curr(rq, p, wake_flags);
 
        p->state = TASK_RUNNING;
@@ -2394,9 +2476,6 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
                rq->idle_stamp = 0;
        }
 #endif
-       /* if a worker is waking up, notify workqueue */
-       if ((p->flags & PF_WQ_WORKER) && success)
-               wq_worker_waking_up(p, cpu_of(rq));
 }
 
 /**
@@ -2425,40 +2504,30 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        this_cpu = get_cpu();
 
        smp_wmb();
-       rq = task_rq_lock(p, &flags);
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       rq = __task_rq_lock(p);
        if (!(p->state & state))
                goto out;
 
-       if (p->se.on_rq)
+       cpu = task_cpu(p);
+
+       if (p->on_rq)
                goto out_running;
 
-       cpu = task_cpu(p);
        orig_cpu = cpu;
-
 #ifdef CONFIG_SMP
        if (unlikely(task_running(rq, p)))
                goto out_activate;
 
-       /*
-        * In order to handle concurrent wakeups and release the rq->lock
-        * we put the task in TASK_WAKING state.
-        *
-        * First fix up the nr_uninterruptible count:
-        */
-       if (task_contributes_to_load(p)) {
-               if (likely(cpu_online(orig_cpu)))
-                       rq->nr_uninterruptible--;
-               else
-                       this_rq()->nr_uninterruptible--;
-       }
+       p->sched_contributes_to_load = !!task_contributes_to_load(p);
        p->state = TASK_WAKING;
 
        if (p->sched_class->task_waking) {
-               p->sched_class->task_waking(rq, p);
+               p->sched_class->task_waking(p);
                en_flags |= ENQUEUE_WAKING;
        }
 
-       cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
        if (cpu != orig_cpu)
                set_task_cpu(p, cpu);
        __task_rq_unlock(rq);
@@ -2475,30 +2544,19 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        WARN_ON(task_cpu(p) != cpu);
        WARN_ON(p->state != TASK_WAKING);
 
-#ifdef CONFIG_SCHEDSTATS
-       schedstat_inc(rq, ttwu_count);
-       if (cpu == this_cpu)
-               schedstat_inc(rq, ttwu_local);
-       else {
-               struct sched_domain *sd;
-               for_each_domain(this_cpu, sd) {
-                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd, ttwu_wake_remote);
-                               break;
-                       }
-               }
-       }
-#endif /* CONFIG_SCHEDSTATS */
+       if (p->sched_contributes_to_load)
+               rq->nr_uninterruptible--;
 
 out_activate:
 #endif /* CONFIG_SMP */
-       ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
-                     cpu == this_cpu, en_flags);
-       success = 1;
+       ttwu_activate(rq, p, en_flags);
 out_running:
-       ttwu_post_activation(p, rq, wake_flags, success);
+       ttwu_post_activation(p, rq, wake_flags);
+       ttwu_stat(rq, p, cpu, wake_flags);
+       success = 1;
 out:
-       task_rq_unlock(rq, &flags);
+       __task_rq_unlock(rq);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
        put_cpu();
 
        return success;
@@ -2508,31 +2566,34 @@ out:
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
  *
- * Put @p on the run-queue if it's not already there.  The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.  this_rq() stays locked over invocation.
+ * the current task.
  */
 static void try_to_wake_up_local(struct task_struct *p)
 {
        struct rq *rq = task_rq(p);
-       bool success = false;
 
        BUG_ON(rq != this_rq());
        BUG_ON(p == current);
        lockdep_assert_held(&rq->lock);
 
+       if (!raw_spin_trylock(&p->pi_lock)) {
+               raw_spin_unlock(&rq->lock);
+               raw_spin_lock(&p->pi_lock);
+               raw_spin_lock(&rq->lock);
+       }
+
        if (!(p->state & TASK_NORMAL))
-               return;
+               goto out;
 
-       if (!p->se.on_rq) {
-               if (likely(!task_running(rq, p))) {
-                       schedstat_inc(rq, ttwu_count);
-                       schedstat_inc(rq, ttwu_local);
-               }
-               ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
-               success = true;
-       }
-       ttwu_post_activation(p, rq, 0, success);
+       if (!p->on_rq)
+               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+       ttwu_post_activation(p, rq, 0);
+       ttwu_stat(rq, p, smp_processor_id(), 0);
+out:
+       raw_spin_unlock(&p->pi_lock);
 }
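
The trylock dance at the top of try_to_wake_up_local() is the standard way to take an outer lock (p->pi_lock) while already holding an inner one (rq->lock) without deadlocking: try it opportunistically, and if that fails, drop the held lock and retake both in the canonical order. A small pthread sketch of the same pattern (the lock names are stand-ins, not kernel API):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;  /* outer lock */
    static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;  /* inner lock */

    /* Called with rq_lock held; returns with both locks held. */
    static void lock_pi_then_rq(void)
    {
            if (pthread_mutex_trylock(&pi_lock) != 0) {
                    /*
                     * Could not get the outer lock while holding the inner
                     * one: back off and reacquire in the canonical order
                     * (pi_lock first, then rq_lock), as the hunk above does
                     * with raw spinlocks.
                     */
                    pthread_mutex_unlock(&rq_lock);
                    pthread_mutex_lock(&pi_lock);
                    pthread_mutex_lock(&rq_lock);
            }
    }

    int main(void)
    {
            pthread_mutex_lock(&rq_lock);
            lock_pi_then_rq();
            printf("holding pi_lock and rq_lock\n");
            pthread_mutex_unlock(&rq_lock);
            pthread_mutex_unlock(&pi_lock);
            return 0;
    }

Note that dropping rq->lock means anything checked earlier may have changed, which is why the kernel code only tests p->state after both locks are held again.
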
 
 /**
@@ -2565,18 +2626,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
  */
 static void __sched_fork(struct task_struct *p)
 {
+       p->on_rq                        = 0;
+
+       p->se.on_rq                     = 0;
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
+       p->se.vruntime                  = 0;
+       INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
        INIT_LIST_HEAD(&p->rt.run_list);
-       p->se.on_rq = 0;
-       INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2646,8 +2710,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
        if (likely(sched_info_on()))
                memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-       p->oncpu = 0;
+#if defined(CONFIG_SMP)
+       p->on_cpu = 0;
 #endif
 #ifdef CONFIG_PREEMPT
        /* Want to start with kernel preemption disabled. */
@@ -2685,7 +2749,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         * We set TASK_WAKING so that select_task_rq() can drop rq->lock
         * without people poking at ->cpus_allowed.
         */
-       cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
+       cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
        set_task_cpu(p, cpu);
 
        p->state = TASK_RUNNING;
@@ -2694,7 +2758,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 
        rq = task_rq_lock(p, &flags);
        activate_task(rq, p, 0);
-       trace_sched_wakeup_new(p, 1);
+       p->on_rq = 1;
+       trace_sched_wakeup_new(p, true);
        check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_woken)
@@ -2779,9 +2844,12 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
+       sched_info_switch(prev, next);
+       perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
+       trace_sched_switch(prev, next);
 }
 
 /**
@@ -2914,7 +2982,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
        struct mm_struct *mm, *oldmm;
 
        prepare_task_switch(rq, prev, next);
-       trace_sched_switch(prev, next);
+
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@ -3411,7 +3479,7 @@ void sched_exec(void)
        int dest_cpu;
 
        rq = task_rq_lock(p, &flags);
-       dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
        if (dest_cpu == smp_processor_id())
                goto unlock;
 
@@ -3419,7 +3487,7 @@ void sched_exec(void)
         * select_task_rq() can race against ->cpus_allowed
         */
        if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-           likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
+           likely(cpu_active(dest_cpu)) && need_migrate_task(p)) {
                struct migration_arg arg = { p, dest_cpu };
 
                task_rq_unlock(rq, &flags);
@@ -3570,6 +3638,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
        }
 }
 
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+                       cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+       cputime64_t tmp = cputime_to_cputime64(cputime);
+
+       /* Add system time to process. */
+       p->stime = cputime_add(p->stime, cputime);
+       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+       account_group_system_time(p, cputime);
+
+       /* Add system time to cpustat. */
+       *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+       /* Account for system time used */
+       acct_update_integrals(p);
+}
+
 /*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -3581,36 +3675,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                         cputime_t cputime, cputime_t cputime_scaled)
 {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp;
+       cputime64_t *target_cputime64;
 
        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                account_guest_time(p, cputime, cputime_scaled);
                return;
        }
 
-       /* Add system time to process. */
-       p->stime = cputime_add(p->stime, cputime);
-       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-       account_group_system_time(p, cputime);
-
-       /* Add system time to cpustat. */
-       tmp = cputime_to_cputime64(cputime);
        if (hardirq_count() - hardirq_offset)
-               cpustat->irq = cputime64_add(cpustat->irq, tmp);
+               target_cputime64 = &cpustat->irq;
        else if (in_serving_softirq())
-               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+               target_cputime64 = &cpustat->softirq;
        else
-               cpustat->system = cputime64_add(cpustat->system, tmp);
-
-       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+               target_cputime64 = &cpustat->system;
 
-       /* Account for system time used */
-       acct_update_integrals(p);
+       __account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
 /*
  * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
  */
 void account_steal_time(cputime_t cputime)
 {
@@ -3638,6 +3722,73 @@ void account_idle_time(cputime_t cputime)
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq)
+{
+       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+       if (irqtime_account_hi_update()) {
+               cpustat->irq = cputime64_add(cpustat->irq, tmp);
+       } else if (irqtime_account_si_update()) {
+               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+       } else if (this_cpu_ksoftirqd() == p) {
+               /*
+                * ksoftirqd time do not get accounted in cpu_softirq_time.
+                * So, we have to handle it separately here.
+                * Also, p->stime needs to be updated for ksoftirqd.
+                */
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       &cpustat->softirq);
+       } else if (user_tick) {
+               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else if (p == rq->idle) {
+               account_idle_time(cputime_one_jiffy);
+       } else if (p->flags & PF_VCPU) { /* System time or guest time */
+               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else {
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       &cpustat->system);
+       }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+       int i;
+       struct rq *rq = this_rq();
+
+       for (i = 0; i < ticks; i++)
+               irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -3648,6 +3799,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
        cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
        struct rq *rq = this_rq();
 
+       if (sched_clock_irqtime) {
+               irqtime_account_process_tick(p, user_tick, rq);
+               return;
+       }
+
        if (user_tick)
                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3673,6 +3829,12 @@ void account_steal_ticks(unsigned long ticks)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+
+       if (sched_clock_irqtime) {
+               irqtime_account_idle_ticks(ticks);
+               return;
+       }
+
        account_idle_time(jiffies_to_cputime(ticks));
 }
 
@@ -3898,7 +4060,7 @@ static inline void schedule_debug(struct task_struct *prev)
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-       if (prev->se.on_rq)
+       if (prev->on_rq)
                update_rq_clock(rq);
        prev->sched_class->put_prev_task(rq, prev);
 }
@@ -3948,9 +4110,6 @@ need_resched:
        rcu_note_context_switch(cpu);
        prev = rq->curr;
 
-       release_kernel_lock(prev);
-need_resched_nonpreemptible:
-
        schedule_debug(prev);
 
        if (sched_feat(HRTICK))
@@ -3963,11 +4122,13 @@ need_resched_nonpreemptible:
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
+                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+                       prev->on_rq = 0;
+
                        /*
-                        * If a worker is going to sleep, notify and
-                        * ask workqueue whether it wants to wake up a
-                        * task to maintain concurrency.  If so, wake
-                        * up the task.
+                        * If a worker went to sleep, notify and ask workqueue
+                        * whether it wants to wake up a task to maintain
+                        * concurrency.
                         */
                        if (prev->flags & PF_WQ_WORKER) {
                                struct task_struct *to_wakeup;
@@ -3976,7 +4137,16 @@ need_resched_nonpreemptible:
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
-                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+
+                       /*
+                        * If we are going to sleep and we have plugged IO
+                        * queued, make sure to submit it to avoid deadlocks.
+                        */
+                       if (blk_needs_flush_plug(prev)) {
+                               raw_spin_unlock(&rq->lock);
+                               blk_flush_plug(prev);
+                               raw_spin_lock(&rq->lock);
+                       }
                }
                switch_count = &prev->nvcsw;
        }
@@ -3992,9 +4162,6 @@ need_resched_nonpreemptible:
        rq->skip_clock_update = 0;
 
        if (likely(prev != next)) {
-               sched_info_switch(prev, next);
-               perf_event_task_sched_out(prev, next);
-
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;
@@ -4013,9 +4180,6 @@ need_resched_nonpreemptible:
 
        post_schedule(rq);
 
-       if (unlikely(reacquire_kernel_lock(prev)))
-               goto need_resched_nonpreemptible;
-
        preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;
@@ -4023,70 +4187,53 @@ need_resched_nonpreemptible:
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
-{
-       unsigned int cpu;
-       struct rq *rq;
 
-       if (!sched_feat(OWNER_SPIN))
-               return 0;
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+       bool ret = false;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       /*
-        * Need to access the cpu field knowing that
-        * DEBUG_PAGEALLOC could have unmapped it if
-        * the mutex owner just released it and exited.
-        */
-       if (probe_kernel_address(&owner->cpu, cpu))
-               return 0;
-#else
-       cpu = owner->cpu;
-#endif
+       rcu_read_lock();
+       if (lock->owner != owner)
+               goto fail;
 
        /*
-        * Even if the access succeeded (likely case),
-        * the cpu field may no longer be valid.
+        * Ensure we emit the owner->on_cpu, dereference _after_ checking
+        * lock->owner still matches owner, if that fails, owner might
+        * point to free()d memory, if it still matches, the rcu_read_lock()
+        * ensures the memory stays valid.
         */
-       if (cpu >= nr_cpumask_bits)
-               return 0;
+       barrier();
 
-       /*
-        * We need to validate that we can do a
-        * get_cpu() and that we have the percpu area.
-        */
-       if (!cpu_online(cpu))
-               return 0;
+       ret = owner->on_cpu;
+fail:
+       rcu_read_unlock();
 
-       rq = cpu_rq(cpu);
+       return ret;
+}
 
-       for (;;) {
-               /*
-                * Owner changed, break to re-assess state.
-                */
-               if (lock->owner != owner) {
-                       /*
-                        * If the lock has switched to a different owner,
-                        * we likely have heavy contention. Return 0 to quit
-                        * optimistic spinning and not contend further:
-                        */
-                       if (lock->owner)
-                               return 0;
-                       break;
-               }
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+       if (!sched_feat(OWNER_SPIN))
+               return 0;
 
-               /*
-                * Is that owner really running on that cpu?
-                */
-               if (task_thread_info(rq->curr) != owner || need_resched())
+       while (owner_running(lock, owner)) {
+               if (need_resched())
                        return 0;
 
                arch_mutex_cpu_relax();
        }
 
+       /*
+        * If the owner changed to another task there is likely
+        * heavy contention, stop spinning.
+        */
+       if (lock->owner)
+               return 0;
+
        return 1;
 }
 #endif
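
Stripped of the RCU details, the rewritten spin loop above is: keep spinning only while the lock is still held by the same owner and that owner is actually executing on a CPU, bail out if we need to reschedule, and after the loop only report success if the lock now looks free. A compressed userspace model using C11 atomics; it deliberately omits the rcu_read_lock()/barrier() lifetime dance the kernel comment is about, and every name below is illustrative:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct task { _Atomic bool on_cpu; };
    struct mutex { _Atomic(struct task *) owner; };

    static bool need_resched(void) { return false; }    /* stub for the sketch */

    static bool owner_running(struct mutex *lock, struct task *owner)
    {
            /* Stop spinning as soon as ownership changes hands... */
            if (atomic_load(&lock->owner) != owner)
                    return false;
            /* ...or the owner is no longer executing on a CPU. */
            return atomic_load(&owner->on_cpu);
    }

    static int spin_on_owner(struct mutex *lock, struct task *owner)
    {
            while (owner_running(lock, owner)) {
                    if (need_resched())
                            return 0;
                    /* arch_mutex_cpu_relax() in the kernel; busy loop here */
            }
            /* Owner changed to yet another task: heavy contention, give up. */
            if (atomic_load(&lock->owner))
                    return 0;
            return 1;       /* lock looks free, worth trying to take it */
    }

    int main(void)
    {
            struct task t = { false };
            struct mutex m = { &t };

            /* Owner is blocked (not on a CPU) but still holds the lock. */
            printf("spin result = %d (expect 0)\n", spin_on_owner(&m, &t));
            return 0;
    }
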
@@ -4216,6 +4363,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 {
        __wake_up_common(q, mode, 1, 0, key);
 }
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4552,12 +4700,14 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        BUG_ON(prio < 0 || prio > MAX_PRIO);
 
+       lockdep_assert_held(&p->pi_lock);
+
        rq = task_rq_lock(p, &flags);
 
        trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
        prev_class = p->sched_class;
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
        running = task_current(rq, p);
        if (on_rq)
                dequeue_task(rq, p, 0);
@@ -4573,11 +4723,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        if (running)
                p->sched_class->set_curr_task(rq);
-       if (on_rq) {
+       if (on_rq)
                enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
-               check_class_changed(rq, p, prev_class, oldprio, running);
-       }
+       check_class_changed(rq, p, prev_class, oldprio);
        task_rq_unlock(rq, &flags);
 }
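
rt_mutex_setprio() uses the change pattern that recurs throughout this file: note whether the task is queued and/or running, dequeue it, change the attribute, re-enqueue it, then let check_class_changed() notify the (possibly new) scheduling class. The dance exists because the runqueue's internal ordering depends on the attribute being changed. Below is a toy single-CPU model of just that invariant, using a priority-sorted list as the "runqueue" with no classes and no locking; all names are stand-ins:

    #include <stdio.h>

    struct task {
            int prio;               /* lower value = higher priority */
            struct task *next;
            int on_rq;
    };

    struct rq { struct task *head; };

    static void enqueue_task(struct rq *rq, struct task *p)
    {
            struct task **link = &rq->head;

            while (*link && (*link)->prio <= p->prio)
                    link = &(*link)->next;
            p->next = *link;
            *link = p;
            p->on_rq = 1;
    }

    static void dequeue_task(struct rq *rq, struct task *p)
    {
            struct task **link = &rq->head;

            while (*link && *link != p)
                    link = &(*link)->next;
            if (*link)
                    *link = p->next;
            p->on_rq = 0;
    }

    /* The rt_mutex_setprio()-style shape: only requeue if the task was queued. */
    static void set_prio(struct rq *rq, struct task *p, int prio)
    {
            int on_rq = p->on_rq;

            if (on_rq)
                    dequeue_task(rq, p);
            p->prio = prio;
            if (on_rq)
                    enqueue_task(rq, p);
            /* the kernel would call check_class_changed() here */
    }

    int main(void)
    {
            struct task a = { .prio = 10 }, b = { .prio = 20 };
            struct rq rq = { NULL };

            enqueue_task(&rq, &a);
            enqueue_task(&rq, &b);
            set_prio(&rq, &b, 5);   /* boost b: it must move ahead of a */
            printf("head prio = %d (expect 5)\n", rq.head->prio);
            return 0;
    }
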
 
@@ -4606,7 +4755,7 @@ void set_user_nice(struct task_struct *p, long nice)
                p->static_prio = NICE_TO_PRIO(nice);
                goto out_unlock;
        }
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
        if (on_rq)
                dequeue_task(rq, p, 0);
 
@@ -4740,8 +4889,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 static void
 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 {
-       BUG_ON(p->se.on_rq);
-
        p->policy = policy;
        p->rt_priority = prio;
        p->normal_prio = normal_prio(p);
@@ -4764,8 +4911,11 @@ static bool check_same_owner(struct task_struct *p)
 
        rcu_read_lock();
        pcred = __task_cred(p);
-       match = (cred->euid == pcred->euid ||
-                cred->euid == pcred->uid);
+       if (cred->user->user_ns == pcred->user->user_ns)
+               match = (cred->euid == pcred->euid ||
+                        cred->euid == pcred->uid);
+       else
+               match = false;
        rcu_read_unlock();
        return match;
 }
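
The updated check_same_owner() only compares euid/uid when both credentials belong to the same user namespace; credentials from different namespaces never match. Here is that guard in isolation, with a simplified stand-in for struct cred (field types and names are not the real ones):

    #include <stdbool.h>
    #include <stdio.h>

    struct cred { int user_ns; unsigned euid, uid; };   /* simplified stand-in */

    static bool same_owner(const struct cred *cred, const struct cred *pcred)
    {
            if (cred->user_ns != pcred->user_ns)
                    return false;   /* ids from different namespaces never match */
            return cred->euid == pcred->euid || cred->euid == pcred->uid;
    }

    int main(void)
    {
            struct cred me  = { .user_ns = 1, .euid = 1000, .uid = 1000 };
            struct cred you = { .user_ns = 2, .euid = 1000, .uid = 1000 };

            /* identical ids, but in another user namespace: not the same owner */
            printf("same owner = %d (expect 0)\n", same_owner(&me, &you));
            return 0;
    }
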
@@ -4825,12 +4975,15 @@ recheck:
                            param->sched_priority > rlim_rtprio)
                                return -EPERM;
                }
+
                /*
-                * Like positive nice levels, dont allow tasks to
-                * move out of SCHED_IDLE either:
+                * Treat SCHED_IDLE as nice 20. Only allow a switch to
+                * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
-                       return -EPERM;
+               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+                       if (!can_nice(p, TASK_NICE(p)))
+                               return -EPERM;
+               }
 
                /* can't change other user's priorities */
                if (!check_same_owner(p))
@@ -4853,7 +5006,7 @@ recheck:
         */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        /*
-        * To be able to change p->policy safely, the apropriate
+        * To be able to change p->policy safely, the appropriate
         * runqueue lock must be held.
         */
        rq = __task_rq_lock(p);
@@ -4867,6 +5020,17 @@ recheck:
                return -EINVAL;
        }
 
+       /*
+        * If not changing anything there's no need to proceed further:
+        */
+       if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+                       param->sched_priority == p->rt_priority))) {
+
+               __task_rq_unlock(rq);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               return 0;
+       }
+
 #ifdef CONFIG_RT_GROUP_SCHED
        if (user) {
                /*
@@ -4890,7 +5054,7 @@ recheck:
                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
                goto recheck;
        }
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
        running = task_current(rq, p);
        if (on_rq)
                deactivate_task(rq, p, 0);
@@ -4905,11 +5069,10 @@ recheck:
 
        if (running)
                p->sched_class->set_curr_task(rq);
-       if (on_rq) {
+       if (on_rq)
                activate_task(rq, p, 0);
 
-               check_class_changed(rq, p, prev_class, oldprio, running);
-       }
+       check_class_changed(rq, p, prev_class, oldprio);
        __task_rq_unlock(rq);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -5091,7 +5254,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
                goto out_free_cpus_allowed;
        }
        retval = -EPERM;
-       if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+       if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
                goto out_unlock;
 
        retval = security_task_setscheduler(p);
@@ -5162,7 +5325,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
        struct task_struct *p;
        unsigned long flags;
-       struct rq *rq;
        int retval;
 
        get_online_cpus();
@@ -5177,9 +5339,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
        if (retval)
                goto out_unlock;
 
-       rq = task_rq_lock(p, &flags);
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
        cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
-       task_rq_unlock(rq, &flags);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
        rcu_read_unlock();
@@ -5326,6 +5488,67 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);
 
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+       struct task_struct *curr = current;
+       struct rq *rq, *p_rq;
+       unsigned long flags;
+       bool yielded = 0;
+
+       local_irq_save(flags);
+       rq = this_rq();
+
+again:
+       p_rq = task_rq(p);
+       double_rq_lock(rq, p_rq);
+       while (task_rq(p) != p_rq) {
+               double_rq_unlock(rq, p_rq);
+               goto again;
+       }
+
+       if (!curr->sched_class->yield_to_task)
+               goto out;
+
+       if (curr->sched_class != p->sched_class)
+               goto out;
+
+       if (task_running(p_rq, p) || p->state)
+               goto out;
+
+       yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+       if (yielded) {
+               schedstat_inc(rq, yld_count);
+               /*
+                * Make p's CPU reschedule; pick_next_entity takes care of
+                * fairness.
+                */
+               if (preempt && rq != p_rq)
+                       resched_task(p_rq->curr);
+       }
+
+out:
+       double_rq_unlock(rq, p_rq);
+       local_irq_restore(flags);
+
+       if (yielded)
+               schedule();
+
+       return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
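
yield_to() reads task_rq(p) without holding any lock, so after double_rq_lock() it re-checks that the task is still on that runqueue and retries if it migrated in between. A minimal single-threaded sketch of that "read, lock, recheck" idiom using a pthread mutex and a C11 atomic pointer; the names are stand-ins, and the kernel version locks two runqueues at once rather than one:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct rq { pthread_mutex_t lock; };
    struct task { _Atomic(struct rq *) rq; };   /* which runqueue the task is on */

    /*
     * The task's runqueue pointer is read without a lock, so after taking
     * the corresponding lock we must verify the task has not migrated in
     * the meantime, and retry if it has.
     */
    static struct rq *lock_task_rq(struct task *p)
    {
            struct rq *rq;

            for (;;) {
                    rq = atomic_load(&p->rq);
                    pthread_mutex_lock(&rq->lock);
                    if (atomic_load(&p->rq) == rq)
                            return rq;                     /* still there */
                    pthread_mutex_unlock(&rq->lock);       /* migrated: retry */
            }
    }

    int main(void)
    {
            struct rq rq = { PTHREAD_MUTEX_INITIALIZER };
            struct task p = { &rq };
            struct rq *locked = lock_task_rq(&p);

            printf("locked the right rq: %d\n", locked == &rq);
            pthread_mutex_unlock(&locked->lock);
            return 0;
    }
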
+
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
@@ -5336,6 +5559,7 @@ void __sched io_schedule(void)
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       blk_flush_plug(current);
        current->in_iowait = 1;
        schedule();
        current->in_iowait = 0;
@@ -5351,6 +5575,7 @@ long __sched io_schedule_timeout(long timeout)
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       blk_flush_plug(current);
        current->in_iowait = 1;
        ret = schedule_timeout(timeout);
        current->in_iowait = 0;
@@ -5499,7 +5724,7 @@ void show_state_filter(unsigned long state_filter)
        do_each_thread(g, p) {
                /*
                 * reset the NMI-timeout, listing all files on a slow
-                * console might take alot of time:
+                * console might take a lot of time:
                 */
                touch_nmi_watchdog();
                if (!state_filter || (p->state & state_filter))
@@ -5559,8 +5784,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        rcu_read_unlock();
 
        rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-       idle->oncpu = 1;
+#if defined(CONFIG_SMP)
+       idle->on_cpu = 1;
 #endif
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -5574,7 +5799,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
-       ftrace_graph_init_task(idle);
+       ftrace_graph_init_idle_task(idle, cpu);
 }
 
 /*
@@ -5664,18 +5889,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        unsigned int dest_cpu;
        int ret = 0;
 
-       /*
-        * Serialize against TASK_WAKING so that ttwu() and wunt() can
-        * drop the rq->lock and still rely on ->cpus_allowed.
-        */
-again:
-       while (task_is_waking(p))
-               cpu_relax();
-       rq = task_rq_lock(p, &flags);
-       if (task_is_waking(p)) {
-               task_rq_unlock(rq, &flags);
-               goto again;
-       }
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       rq = __task_rq_lock(p);
 
        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
@@ -5700,16 +5915,18 @@ again:
                goto out;
 
        dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (migrate_task(p, rq)) {
+       if (need_migrate_task(p)) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
-               task_rq_unlock(rq, &flags);
+               __task_rq_unlock(rq);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                tlb_migrate_finish(p->mm);
                return 0;
        }
 out:
-       task_rq_unlock(rq, &flags);
+       __task_rq_unlock(rq);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
        return ret;
 }
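
set_cpus_allowed_ptr() above now runs with p->pi_lock and rq->lock held, but its decision logic is unchanged: reject masks with no active CPU, update ->cpus_allowed, do nothing if the task may stay where it is, otherwise pick some allowed active CPU and hand the task to the migration thread. A toy bitmask version of that decision, where uint32_t masks and __builtin_ctz() stand in for cpumasks and cpumask_any_and():

    #include <stdint.h>
    #include <stdio.h>

    /* Returns the cpu the task should run on after the mask change, or -1
     * (standing in for -EINVAL) if no allowed cpu is active. */
    static int set_affinity(uint32_t *cpus_allowed, int cur_cpu,
                            uint32_t new_mask, uint32_t active_mask)
    {
            if (!(new_mask & active_mask))
                    return -1;

            *cpus_allowed = new_mask;
            if (new_mask & (1u << cur_cpu))
                    return cur_cpu;         /* already somewhere acceptable */

            /* otherwise migrate to any allowed, active cpu */
            return __builtin_ctz(new_mask & active_mask);
    }

    int main(void)
    {
            uint32_t allowed = 0xf;         /* cpus 0-3 */

            /* restrict to cpus 2-3 while running on cpu 0: must move */
            printf("dest cpu = %d\n", set_affinity(&allowed, 0, 0xc, 0xf));
            return 0;
    }
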
@@ -5749,7 +5966,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         * If we're not on a rq, the next wake-up will ensure we're
         * placed properly.
         */
-       if (p->se.on_rq) {
+       if (p->on_rq) {
                deactivate_task(rq_src, p, 0);
                set_task_cpu(p, dest_cpu);
                activate_task(rq_dest, p, 0);
@@ -6114,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 #endif
        }
+
+       update_max_interval();
+
        return NOTIFY_OK;
 }
 
@@ -7799,6 +8019,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
        INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
        cfs_rq->rq = rq;
+       /* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+       cfs_rq->load_stamp = 1;
+#endif
 #endif
        cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
@@ -8077,7 +8301,7 @@ static inline int preempt_count_equals(int preempt_offset)
 {
        int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
 
-       return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+       return (nested == preempt_offset);
 }
 
 void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8112,9 +8336,11 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+       const struct sched_class *prev_class = p->sched_class;
+       int old_prio = p->prio;
        int on_rq;
 
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
        if (on_rq)
                deactivate_task(rq, p, 0);
        __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8122,6 +8348,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
                activate_task(rq, p, 0);
                resched_task(rq->curr);
        }
+
+       check_class_changed(rq, p, prev_class, old_prio);
 }
 
 void normalize_rt_tasks(void)
@@ -8237,7 +8465,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se;
-       struct rq *rq;
        int i;
 
        tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8250,8 +8477,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        tg->shares = NICE_0_LOAD;
 
        for_each_possible_cpu(i) {
-               rq = cpu_rq(i);
-
                cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                      GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
@@ -8458,7 +8683,7 @@ void sched_move_task(struct task_struct *tsk)
        rq = task_rq_lock(tsk, &flags);
 
        running = task_current(rq, tsk);
-       on_rq = tsk->se.on_rq;
+       on_rq = tsk->on_rq;
 
        if (on_rq)
                dequeue_task(rq, tsk, 0);
@@ -8513,7 +8738,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
                /* Propagate contribution to hierarchy */
                raw_spin_lock_irqsave(&rq->lock, flags);
                for_each_sched_entity(se)
-                       update_cfs_shares(group_cfs_rq(se), 0);
+                       update_cfs_shares(group_cfs_rq(se));
                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }
 
@@ -8887,7 +9112,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+               struct cgroup *old_cgrp, struct task_struct *task)
 {
        /*
         * cgroup_exit() is called in the copy_process() failure path.