SCHED_TTWU_QUEUE is no longer needed since sparc32 now implements IPIs
diff --git a/kernel/sched.c b/kernel/sched.c
index da14302..c4b3410 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
+#include <asm/mutex.h>
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
+#include "sched_autogroup.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -253,6 +254,8 @@ struct task_group {
        /* runqueue "owned" by this group on each cpu */
        struct cfs_rq **cfs_rq;
        unsigned long shares;
+
+       atomic_t load_weight;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +271,18 @@ struct task_group {
        struct task_group *parent;
        struct list_head siblings;
        struct list_head children;
-};
 
-#define root_task_group init_task_group
+#ifdef CONFIG_SCHED_AUTOGROUP
+       struct autogroup *autogroup;
+#endif
+};
 
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
-       return list_empty(&root_task_group.children);
-}
-#endif
-
-# define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
+# define ROOT_TASK_GROUP_LOAD  NICE_0_LOAD
 
 /*
  * A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +295,13 @@ static int root_task_group_empty(void)
 #define MIN_SHARES     2
 #define MAX_SHARES     (1UL << 18)
 
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 #endif
 
 /* Default task group.
 *     Every task in the system belongs to this group at bootup.
  */
-struct task_group init_task_group;
+struct task_group root_task_group;
 
 #endif /* CONFIG_CGROUP_SCHED */
 
@@ -316,6 +312,9 @@ struct cfs_rq {
 
        u64 exec_clock;
        u64 min_vruntime;
+#ifndef CONFIG_64BIT
+       u64 min_vruntime_copy;
+#endif
 
        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;
@@ -327,7 +326,7 @@ struct cfs_rq {
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
-       struct sched_entity *curr, *next, *last;
+       struct sched_entity *curr, *next, *last, *skip;
 
        unsigned int nr_spread_over;
 
@@ -342,6 +341,7 @@ struct cfs_rq {
         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
         * list is used during load balance.
         */
+       int on_list;
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
 
@@ -360,14 +360,17 @@ struct cfs_rq {
        unsigned long h_load;
 
        /*
-        * this cpu's part of tg->shares
+        * Maintaining per-cpu shares distribution for group scheduling
+        *
+        * load_stamp is the last time we updated the load average
+        * load_last is the last time we updated the load average and saw load
+        * load_unacc_exec_time is currently unaccounted execution time
         */
-       unsigned long shares;
+       u64 load_avg;
+       u64 load_period;
+       u64 load_stamp, load_last, load_unacc_exec_time;
 
-       /*
-        * load.weight at the time we set shares
-        */
-       unsigned long rq_weight;
+       unsigned long load_contribution;
 #endif
 #endif
 };
@@ -552,9 +555,10 @@ struct rq {
        /* try_to_wake_up() stats */
        unsigned int ttwu_count;
        unsigned int ttwu_local;
+#endif
 
-       /* BKL stats */
-       unsigned int bkl_count;
+#ifdef CONFIG_SMP
+       struct task_struct *wake_list;
 #endif
 };
 
@@ -599,17 +603,20 @@ static inline int cpu_of(struct rq *rq)
  * Return the group to which this task belongs.
  *
  * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
  * holds that lock for each task it moves into the cgroup. Therefore
  * by holding that lock, we pin the task to the current cgroup.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
+       struct task_group *tg;
        struct cgroup_subsys_state *css;
 
        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-                       lockdep_is_held(&task_rq(p)->lock));
-       return container_of(css, struct task_group, css);
+                       lockdep_is_held(&p->pi_lock));
+       tg = container_of(css, struct task_group, css);
+
+       return autogroup_task_group(p, tg);
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -636,23 +643,18 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
 
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
 {
-       int cpu = cpu_of(rq);
-       u64 irq_time;
+       s64 delta;
 
        if (rq->skip_clock_update)
                return;
 
-       rq->clock = sched_clock_cpu(cpu);
-       irq_time = irq_time_cpu(cpu);
-       if (rq->clock - irq_time > rq->clock_task)
-               rq->clock_task = rq->clock - irq_time;
-
-       sched_irq_time_avg_update(rq, irq_time);
+       delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+       rq->clock += delta;
+       update_rq_clock_task(rq, delta);
 }
 
 /*
@@ -665,10 +667,9 @@ inline void update_rq_clock(struct rq *rq)
 #endif
 
 /**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
  * @cpu: the processor in question.
  *
- * Returns true if the current cpu runqueue is locked.
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
@@ -742,7 +743,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        buf[cnt] = 0;
        cmp = strstrip(buf);
 
-       if (strncmp(buf, "NO_", 3) == 0) {
+       if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }
@@ -798,20 +799,6 @@ late_initcall(sched_init_debug);
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
-/*
  * period over which we average the RT time consumption, measured
  * in ms.
  *
@@ -858,18 +845,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
        return rq->curr == p;
 }
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+       return p->on_cpu;
+#else
        return task_current(rq, p);
+#endif
 }
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
+#ifdef CONFIG_SMP
+       /*
+        * We can optimise this out completely for !SMP, because the
+        * SMP rebalancing from interrupt is the only thing that cares
+        * here.
+        */
+       next->on_cpu = 1;
+#endif
 }
 
 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
+#ifdef CONFIG_SMP
+       /*
+        * After ->on_cpu is cleared, the task can be moved to a different CPU.
+        * We must ensure this doesn't happen until the switch is completely
+        * finished.
+        */
+       smp_wmb();
+       prev->on_cpu = 0;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
        /* this is a valid case when another task releases the spinlock */
        rq->lock.owner = current;
@@ -885,15 +893,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 }
 
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-       return p->oncpu;
-#else
-       return task_current(rq, p);
-#endif
-}
-
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
 #ifdef CONFIG_SMP
@@ -902,7 +901,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
         * SMP rebalancing from interrupt is the only thing that cares
         * here.
         */
-       next->oncpu = 1;
+       next->on_cpu = 1;
 #endif
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        raw_spin_unlock_irq(&rq->lock);
@@ -915,12 +914,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
 #ifdef CONFIG_SMP
        /*
-        * After ->oncpu is cleared, the task can be moved to a different CPU.
+        * After ->on_cpu is cleared, the task can be moved to a different CPU.
         * We must ensure this doesn't happen until the switch is completely
         * finished.
         */
        smp_wmb();
-       prev->oncpu = 0;
+       prev->on_cpu = 0;
 #endif
 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_enable();
@@ -929,23 +928,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
-       return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
        __acquires(rq->lock)
 {
        struct rq *rq;
 
+       lockdep_assert_held(&p->pi_lock);
+
        for (;;) {
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
@@ -956,22 +947,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 }
 
 /*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
  */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+       __acquires(p->pi_lock)
        __acquires(rq->lock)
 {
        struct rq *rq;
 
        for (;;) {
-               local_irq_save(*flags);
+               raw_spin_lock_irqsave(&p->pi_lock, *flags);
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
                if (likely(rq == task_rq(p)))
                        return rq;
-               raw_spin_unlock_irqrestore(&rq->lock, *flags);
+               raw_spin_unlock(&rq->lock);
+               raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
        }
 }
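The pattern above - take p->pi_lock, then the rq lock, and revalidate task_rq(p) because a CPU holding only the old rq lock may still migrate the task - is the central locking change of this patch. A minimal standalone sketch of the same scheme, with invented names (not the kernel API):

#include <pthread.h>

/* Sketch only: mirrors task_rq_lock()'s pi_lock + rq->lock ordering. */
struct container {
        pthread_mutex_t lock;
};

struct object {
        pthread_mutex_t pi_lock;        /* held across wakeup/placement decisions */
        struct container *where;        /* changed only under pi_lock or where->lock */
};

/* Returns with obj->pi_lock and the lock of obj's current container held. */
static struct container *object_container_lock(struct object *obj)
{
        struct container *c;

        pthread_mutex_lock(&obj->pi_lock);
        for (;;) {
                c = obj->where;                 /* snapshot, may be stale */
                pthread_mutex_lock(&c->lock);
                if (c == obj->where)            /* still the right container? */
                        return c;               /* holding both locks pins it */
                pthread_mutex_unlock(&c->lock); /* raced with a move, retry */
        }
}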
 
@@ -981,10 +972,13 @@ static void __task_rq_unlock(struct rq *rq)
        raw_spin_unlock(&rq->lock);
 }
 
-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
        __releases(rq->lock)
+       __releases(p->pi_lock)
 {
-       raw_spin_unlock_irqrestore(&rq->lock, *flags);
+       raw_spin_unlock(&rq->lock);
+       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 
 /*
@@ -1360,6 +1354,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
        lw->inv_weight = 0;
 }
 
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+       lw->weight = w;
+       lw->inv_weight = 0;
+}
+
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1548,101 +1548,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-                                   unsigned long sd_shares,
-                                   unsigned long sd_rq_weight,
-                                   unsigned long *usd_rq_weight)
-{
-       unsigned long shares, rq_weight;
-       int boost = 0;
-
-       rq_weight = usd_rq_weight[cpu];
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-
-       /*
-        *             \Sum_j shares_j * rq_weight_i
-        * shares_i =  -----------------------------
-        *                  \Sum_j rq_weight_j
-        */
-       shares = (sd_shares * rq_weight) / sd_rq_weight;
-       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-       if (abs(shares - tg->se[cpu]->load.weight) >
-                       sysctl_sched_shares_thresh) {
-               struct rq *rq = cpu_rq(cpu);
-               unsigned long flags;
-
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               __set_se_shares(tg->se[cpu], shares);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-       }
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
-       unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-       unsigned long *usd_rq_weight;
-       struct sched_domain *sd = data;
-       unsigned long flags;
-       int i;
-
-       if (!tg->se[0])
-               return 0;
-
-       local_irq_save(flags);
-       usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-       for_each_cpu(i, sched_domain_span(sd)) {
-               weight = tg->cfs_rq[i]->load.weight;
-               usd_rq_weight[i] = weight;
-
-               rq_weight += weight;
-               /*
-                * If there are currently no tasks on the cpu pretend there
-                * is one of average load so that when a new task gets to
-                * run here it will not get delayed by group starvation.
-                */
-               if (!weight)
-                       weight = NICE_0_LOAD;
-
-               sum_weight += weight;
-               shares += tg->cfs_rq[i]->shares;
-       }
-
-       if (!rq_weight)
-               rq_weight = sum_weight;
-
-       if ((!shares && rq_weight) || shares > tg->shares)
-               shares = tg->shares;
-
-       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-               shares = tg->shares;
-
-       for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
-       local_irq_restore(flags);
-
-       return 0;
-}
-
 /*
  * Compute the cpu's hierarchical load factor for each task group.
  * This needs to be done in a top-down fashion because the load of a child
@@ -1657,7 +1562,7 @@ static int tg_load_down(struct task_group *tg, void *data)
                load = cpu_rq(cpu)->load.weight;
        } else {
                load = tg->parent->cfs_rq[cpu]->h_load;
-               load *= tg->cfs_rq[cpu]->shares;
+               load *= tg->se[cpu]->load.weight;
                load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
        }
 
@@ -1666,34 +1571,11 @@ static int tg_load_down(struct task_group *tg, void *data)
        return 0;
 }
 
-static void update_shares(struct sched_domain *sd)
-{
-       s64 elapsed;
-       u64 now;
-
-       if (root_task_group_empty())
-               return;
-
-       now = local_clock();
-       elapsed = now - sd->last_update;
-
-       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-               sd->last_update = now;
-               walk_tg_tree(tg_nop, tg_shares_up, sd);
-       }
-}
-
 static void update_h_load(long cpu)
 {
        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
 #endif
 
 #ifdef CONFIG_PREEMPT
@@ -1813,15 +1695,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
                __release(rq2->lock);
 }
 
-#endif
+#else /* CONFIG_SMP */
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+       __acquires(rq1->lock)
+       __acquires(rq2->lock)
 {
-#ifdef CONFIG_SMP
-       cfs_rq->shares = shares;
-#endif
+       BUG_ON(!irqs_disabled());
+       BUG_ON(rq1 != rq2);
+       raw_spin_lock(&rq1->lock);
+       __acquire(rq2->lock);   /* Fake it out ;) */
 }
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+       __releases(rq1->lock)
+       __releases(rq2->lock)
+{
+       BUG_ON(rq1 != rq2);
+       raw_spin_unlock(&rq1->lock);
+       __release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
@@ -1881,7 +1787,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
        update_rq_clock(rq);
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, flags);
-       p->se.on_rq = 1;
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1889,7 +1794,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
        update_rq_clock(rq);
        sched_info_dequeued(p);
        p->sched_class->dequeue_task(rq, p, flags);
-       p->se.on_rq = 0;
 }
 
 /*
@@ -1925,10 +1829,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  * They are read and saved off onto struct rq in update_rq_clock().
  * This may result in other CPU reading this CPU's irq time and can
  * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
  */
 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
 static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1946,19 +1849,58 @@ void disable_sched_clock_irqtime(void)
        sched_clock_irqtime = 0;
 }
 
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
 {
-       if (!sched_clock_irqtime)
-               return 0;
+       __this_cpu_inc(irq_time_seq.sequence);
+       smp_wmb();
+}
 
+static inline void irq_time_write_end(void)
+{
+       smp_wmb();
+       __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+       u64 irq_time;
+       unsigned seq;
+
+       do {
+               seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+               irq_time = per_cpu(cpu_softirq_time, cpu) +
+                          per_cpu(cpu_hardirq_time, cpu);
+       } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+       return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
        return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
 }
+#endif /* CONFIG_64BIT */
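On !CONFIG_64BIT the two 64-bit counters cannot be read atomically, so the writer brackets each update with sequence increments and readers retry if they observe a concurrent update. A minimal sketch of the same publication scheme using the stock seqcount helpers; example_seq/example_val are invented names, not part of this patch:

/* Sketch only: seqcount-protected 64-bit per-cpu value on 32-bit. */
static DEFINE_PER_CPU(seqcount_t, example_seq);
static DEFINE_PER_CPU(u64, example_val);

/* Writer: local CPU only, interrupts disabled (like account_system_vtime()). */
static void example_val_update(u64 val)
{
        seqcount_t *seq = this_cpu_ptr(&example_seq);

        write_seqcount_begin(seq);
        __this_cpu_write(example_val, val);
        write_seqcount_end(seq);
}

/* Reader: any CPU; loops only if it raced with the writer above. */
static u64 example_val_read(int cpu)
{
        unsigned int seq;
        u64 val;

        do {
                seq = read_seqcount_begin(&per_cpu(example_seq, cpu));
                val = per_cpu(example_val, cpu);
        } while (read_seqcount_retry(&per_cpu(example_seq, cpu), seq));

        return val;
}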
 
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
 void account_system_vtime(struct task_struct *curr)
 {
        unsigned long flags;
+       s64 delta;
        int cpu;
-       u64 now, delta;
 
        if (!sched_clock_irqtime)
                return;
@@ -1966,9 +1908,10 @@ void account_system_vtime(struct task_struct *curr)
        local_irq_save(flags);
 
        cpu = smp_processor_id();
-       now = sched_clock_cpu(cpu);
-       delta = now - per_cpu(irq_start_time, cpu);
-       per_cpu(irq_start_time, cpu) = now;
+       delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+       __this_cpu_add(irq_start_time, delta);
+
+       irq_time_write_begin();
        /*
         * We do not account for softirq time from ksoftirqd here.
         * We want to continue accounting softirq time to ksoftirqd thread
@@ -1976,37 +1919,92 @@ void account_system_vtime(struct task_struct *curr)
         * that do not consume any time, but still wants to run.
         */
        if (hardirq_count())
-               per_cpu(cpu_hardirq_time, cpu) += delta;
-       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
-               per_cpu(cpu_softirq_time, cpu) += delta;
+               __this_cpu_add(cpu_hardirq_time, delta);
+       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+               __this_cpu_add(cpu_softirq_time, delta);
 
+       irq_time_write_end();
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
-               u64 delta_irq = curr_irq_time - rq->prev_irq_time;
-               rq->prev_irq_time = curr_irq_time;
-               sched_rt_avg_update(rq, delta_irq);
-       }
+       s64 irq_delta;
+
+       irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+       /*
+        * Since irq_time is only updated on {soft,}irq_exit, we might run into
+        * this case when a previous update_rq_clock() happened inside a
+        * {soft,}irq region.
+        *
+        * When this happens, we stop ->clock_task and only update the
+        * prev_irq_time stamp to account for the part that fit, so that a next
+        * update will consume the rest. This ensures ->clock_task is
+        * monotonic.
+        *
+        * It does however cause some slight mis-attribution of {soft,}irq
+        * time; a more accurate solution would be to update the irq_time using
+        * the current rq->clock timestamp, except that would require using
+        * atomic ops.
+        */
+       if (irq_delta > delta)
+               irq_delta = delta;
+
+       rq->prev_irq_time += irq_delta;
+       delta -= irq_delta;
+       rq->clock_task += delta;
+
+       if (irq_delta && sched_feat(NONIRQ_POWER))
+               sched_rt_avg_update(rq, irq_delta);
 }
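A quick worked example of the clamp above, with made-up numbers:

/*
 * Suppose rq->clock advanced by delta = 3ms since the last update, but 5ms
 * of irq time was logged in the meantime, so irq_delta = 5ms.  The clamp
 * charges only 3ms now: prev_irq_time advances by 3ms, clock_task does not
 * move, and the remaining 2ms is consumed by the next update_rq_clock_task()
 * call.  clock_task therefore never goes backwards.
 */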
 
-#else
+static int irqtime_account_hi_update(void)
+{
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
 
-static u64 irq_time_cpu(int cpu)
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_hardirq_time);
+       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
+static int irqtime_account_si_update(void)
 {
-       return 0;
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
+
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_softirq_time);
+       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
 }
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
-#endif
+#define sched_clock_irqtime    (0)
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+       rq->clock_task += delta;
+}
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_autogroup.c"
 #include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
@@ -2099,14 +2097,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                       const struct sched_class *prev_class,
-                                      int oldprio, int running)
+                                      int oldprio)
 {
        if (prev_class != p->sched_class) {
                if (prev_class->switched_from)
-                       prev_class->switched_from(rq, p, running);
-               p->sched_class->switched_to(rq, p, running);
-       } else
-               p->sched_class->prio_changed(rq, p, oldprio, running);
+                       prev_class->switched_from(rq, p);
+               p->sched_class->switched_to(rq, p);
+       } else if (oldprio != p->prio)
+               p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2130,7 +2128,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
         * A queue event has occurred, and we're going to schedule.  In
         * this case, we can save a useless back to back clock update.
         */
-       if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+       if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
                rq->skip_clock_update = 1;
 }
 
@@ -2176,6 +2174,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         */
        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
                        !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+       WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+                                     lockdep_is_held(&task_rq(p)->lock)));
+#endif
 #endif
 
        trace_sched_migrate_task(p, new_cpu);
@@ -2199,15 +2202,15 @@ static int migration_cpu_stop(void *data);
  * The task's runqueue lock must be held.
  * Returns true if you have to wait for migration thread.
  */
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool need_migrate_task(struct task_struct *p)
 {
-       struct rq *rq = task_rq(p);
-
        /*
         * If the task is not on a runqueue (and not running), then
         * the next wake-up will properly place the task.
         */
-       return p->se.on_rq || task_running(rq, p);
+       bool running = p->on_rq || p->on_cpu;
+       smp_rmb(); /* finish_lock_switch() */
+       return running;
 }
 
 /*
@@ -2267,11 +2270,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                rq = task_rq_lock(p, &flags);
                trace_sched_wait_task(p);
                running = task_running(rq, p);
-               on_rq = p->se.on_rq;
+               on_rq = p->on_rq;
                ncsw = 0;
                if (!match_state || p->state == match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-               task_rq_unlock(rq, &flags);
+               task_rq_unlock(rq, p, &flags);
 
                /*
                 * If it changed from the expected state, bail out now.
@@ -2300,7 +2303,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                 * yield - it could be a while.
                 */
                if (unlikely(on_rq)) {
-                       schedule_timeout_uninterruptible(1);
+                       ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_hrtimeout(&to, HRTIMER_MODE_REL);
                        continue;
                }
 
@@ -2322,7 +2328,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
  * Cause a process which is running on another CPU to enter
  * kernel-mode, without any delay. (to get signals handled.)
  *
- * NOTE: this function doesnt have to take the runqueue lock,
+ * NOTE: this function doesn't have to take the runqueue lock,
  * because all it wants to ensure is that the remote task enters
  * the kernel. If the IPI races and the task has been migrated
  * to another CPU then no harm is done and the purpose has been
@@ -2341,30 +2347,9 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p:         the task to evaluate
- * @func:      the function to be called
- * @info:      the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
-                             void (*func) (void *info), void *info)
-{
-       int cpu;
-
-       preempt_disable();
-       cpu = task_cpu(p);
-       if (task_curr(p))
-               smp_call_function_single(cpu, func, info, 1);
-       preempt_enable();
-}
-
 #ifdef CONFIG_SMP
 /*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
@@ -2382,30 +2367,27 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                return dest_cpu;
 
        /* No more Mr. Nice Guy. */
-       if (unlikely(dest_cpu >= nr_cpu_ids)) {
-               dest_cpu = cpuset_cpus_allowed_fallback(p);
-               /*
-                * Don't tell them about moving exiting tasks or
-                * kernel threads (both mm NULL), since they never
-                * leave kernel.
-                */
-               if (p->mm && printk_ratelimit()) {
-                       printk(KERN_INFO "process %d (%s) no "
-                              "longer affine to cpu%d\n",
-                              task_pid_nr(p), p->comm, cpu);
-               }
+       dest_cpu = cpuset_cpus_allowed_fallback(p);
+       /*
+        * Don't tell them about moving exiting tasks or
+        * kernel threads (both mm NULL), since they never
+        * leave kernel.
+        */
+       if (p->mm && printk_ratelimit()) {
+               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+                               task_pid_nr(p), p->comm, cpu);
        }
 
        return dest_cpu;
 }
 
 /*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 {
-       int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
 
        /*
         * In order not to call set_task_cpu() on a blocking task we need
@@ -2431,27 +2413,60 @@ static void update_avg(u64 *avg, u64 sample)
 }
 #endif
 
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
-                                bool is_sync, bool is_migrate, bool is_local,
-                                unsigned long en_flags)
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
+#ifdef CONFIG_SCHEDSTATS
+       struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+       int this_cpu = smp_processor_id();
+
+       if (cpu == this_cpu) {
+               schedstat_inc(rq, ttwu_local);
+               schedstat_inc(p, se.statistics.nr_wakeups_local);
+       } else {
+               struct sched_domain *sd;
+
+               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+               for_each_domain(this_cpu, sd) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+                               schedstat_inc(sd, ttwu_wake_remote);
+                               break;
+                       }
+               }
+       }
+#endif /* CONFIG_SMP */
+
+       schedstat_inc(rq, ttwu_count);
        schedstat_inc(p, se.statistics.nr_wakeups);
-       if (is_sync)
+
+       if (wake_flags & WF_SYNC)
                schedstat_inc(p, se.statistics.nr_wakeups_sync);
-       if (is_migrate)
+
+       if (cpu != task_cpu(p))
                schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-       if (is_local)
-               schedstat_inc(p, se.statistics.nr_wakeups_local);
-       else
-               schedstat_inc(p, se.statistics.nr_wakeups_remote);
 
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
        activate_task(rq, p, en_flags);
+       p->on_rq = 1;
+
+       /* if a worker is waking up, notify workqueue */
+       if (p->flags & PF_WQ_WORKER)
+               wq_worker_waking_up(p, cpu_of(rq));
 }
 
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
-                                       int wake_flags, bool success)
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
-       trace_sched_wakeup(p, success);
+       trace_sched_wakeup(p, true);
        check_preempt_curr(rq, p, wake_flags);
 
        p->state = TASK_RUNNING;
@@ -2470,9 +2485,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
                rq->idle_stamp = 0;
        }
 #endif
-       /* if a worker is waking up, notify workqueue */
-       if ((p->flags & PF_WQ_WORKER) && success)
-               wq_worker_waking_up(p, cpu_of(rq));
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+       if (p->sched_contributes_to_load)
+               rq->nr_uninterruptible--;
+#endif
+
+       ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+       ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue;
+ * in this case we must do a remote wakeup. It's a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, as the
+ * task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+       struct rq *rq;
+       int ret = 0;
+
+       rq = __task_rq_lock(p);
+       if (p->on_rq) {
+               ttwu_do_wakeup(rq, p, wake_flags);
+               ret = 1;
+       }
+       __task_rq_unlock(rq);
+
+       return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_pending(void)
+{
+       struct rq *rq = this_rq();
+       struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+       if (!list)
+               return;
+
+       raw_spin_lock(&rq->lock);
+
+       while (list) {
+               struct task_struct *p = list;
+               list = list->wake_entry;
+               ttwu_do_activate(rq, p, 0);
+       }
+
+       raw_spin_unlock(&rq->lock);
+}
+
+void scheduler_ipi(void)
+{
+       sched_ttwu_pending();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct task_struct *next = rq->wake_list;
+
+       for (;;) {
+               struct task_struct *old = next;
+
+               p->wake_entry = next;
+               next = cmpxchg(&rq->wake_list, old, p);
+               if (next == old)
+                       break;
+       }
+
+       if (!next)
+               smp_send_reschedule(cpu);
+}
+#endif
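ttwu_queue_remote() above pushes the task onto rq->wake_list with a lock-free cmpxchg loop and sends the reschedule IPI only when the list was previously empty; scheduler_ipi() then drains the whole list with a single xchg in sched_ttwu_pending(). The same push/drain idiom in a portable C11 sketch (types and names invented for the example):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

/* Sketch only: lock-free single-linked wake list, as in ttwu_queue_remote(). */
struct wake_node {
        struct wake_node *next;
};

/* Push one node; returns true if the list was empty, i.e. the caller
 * should send the IPI (compare the !next check in ttwu_queue_remote()). */
static bool wake_list_push(_Atomic(struct wake_node *) *head, struct wake_node *n)
{
        struct wake_node *old = atomic_load_explicit(head, memory_order_relaxed);

        do {
                n->next = old;  /* link before publishing the node */
        } while (!atomic_compare_exchange_weak_explicit(head, &old, n,
                        memory_order_release, memory_order_relaxed));

        return old == NULL;
}

/* Drain everything at once, like xchg(&rq->wake_list, NULL) above. */
static struct wake_node *wake_list_drain(_Atomic(struct wake_node *) *head)
{
        return atomic_exchange_explicit(head, NULL, memory_order_acquire);
}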
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+       if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+               ttwu_queue_remote(p, cpu);
+               return;
+       }
+#endif
+
+       raw_spin_lock(&rq->lock);
+       ttwu_do_activate(rq, p, 0);
+       raw_spin_unlock(&rq->lock);
 }
 
 /**
@@ -2490,92 +2595,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
  * Returns %true if @p was woken up, %false if it was already running
  * or @state didn't match @p's state.
  */
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
-                         int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
-       int cpu, orig_cpu, this_cpu, success = 0;
        unsigned long flags;
-       unsigned long en_flags = ENQUEUE_WAKEUP;
-       struct rq *rq;
-
-       this_cpu = get_cpu();
+       int cpu, success = 0;
 
        smp_wmb();
-       rq = task_rq_lock(p, &flags);
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
        if (!(p->state & state))
                goto out;
 
-       if (p->se.on_rq)
-               goto out_running;
-
+       success = 1; /* we're going to change ->state */
        cpu = task_cpu(p);
-       orig_cpu = cpu;
 
-#ifdef CONFIG_SMP
-       if (unlikely(task_running(rq, p)))
-               goto out_activate;
+       if (p->on_rq && ttwu_remote(p, wake_flags))
+               goto stat;
 
+#ifdef CONFIG_SMP
        /*
-        * In order to handle concurrent wakeups and release the rq->lock
-        * we put the task in TASK_WAKING state.
-        *
-        * First fix up the nr_uninterruptible count:
+        * If the owning (remote) cpu is still in the middle of schedule() with
+        * this task as prev, wait until it's done referencing the task.
         */
-       if (task_contributes_to_load(p)) {
-               if (likely(cpu_online(orig_cpu)))
-                       rq->nr_uninterruptible--;
-               else
-                       this_rq()->nr_uninterruptible--;
-       }
-       p->state = TASK_WAKING;
-
-       if (p->sched_class->task_waking) {
-               p->sched_class->task_waking(rq, p);
-               en_flags |= ENQUEUE_WAKING;
+       while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+               /*
+                * If called from interrupt context we could have landed in the
+                * middle of schedule(), in this case we should take care not
+                * to spin on ->on_cpu if p is current, since that would
+                * deadlock.
+                */
+               if (p == current) {
+                       ttwu_queue(p, cpu);
+                       goto stat;
+               }
+#endif
+               cpu_relax();
        }
-
-       cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
-       if (cpu != orig_cpu)
-               set_task_cpu(p, cpu);
-       __task_rq_unlock(rq);
-
-       rq = cpu_rq(cpu);
-       raw_spin_lock(&rq->lock);
-
        /*
-        * We migrated the task without holding either rq->lock, however
-        * since the task is not on the task list itself, nobody else
-        * will try and migrate the task, hence the rq should match the
-        * cpu we just moved it to.
+        * Pairs with the smp_wmb() in finish_lock_switch().
         */
-       WARN_ON(task_cpu(p) != cpu);
-       WARN_ON(p->state != TASK_WAKING);
+       smp_rmb();
 
-#ifdef CONFIG_SCHEDSTATS
-       schedstat_inc(rq, ttwu_count);
-       if (cpu == this_cpu)
-               schedstat_inc(rq, ttwu_local);
-       else {
-               struct sched_domain *sd;
-               for_each_domain(this_cpu, sd) {
-                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd, ttwu_wake_remote);
-                               break;
-                       }
-               }
-       }
-#endif /* CONFIG_SCHEDSTATS */
+       p->sched_contributes_to_load = !!task_contributes_to_load(p);
+       p->state = TASK_WAKING;
 
-out_activate:
+       if (p->sched_class->task_waking)
+               p->sched_class->task_waking(p);
+
+       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       if (task_cpu(p) != cpu)
+               set_task_cpu(p, cpu);
 #endif /* CONFIG_SMP */
-       ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
-                     cpu == this_cpu, en_flags);
-       success = 1;
-out_running:
-       ttwu_post_activation(p, rq, wake_flags, success);
+
+       ttwu_queue(p, cpu);
+stat:
+       ttwu_stat(p, cpu, wake_flags);
 out:
-       task_rq_unlock(rq, &flags);
-       put_cpu();
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
        return success;
 }
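The busy-wait on p->on_cpu followed by smp_rmb() pairs with the smp_wmb() before prev->on_cpu = 0 in finish_lock_switch(): once the waker observes on_cpu == 0, everything the previous CPU wrote while switching the task out is visible. The same handshake expressed with C11 acquire/release, as an illustrative sketch (names invented; the kernel uses explicit barriers instead):

#include <stdatomic.h>
#include <stdbool.h>

/* Sketch only: the ->on_cpu publish/consume handshake. */
struct task_like {
        atomic_bool on_cpu;
        int switched_out_state; /* stands in for state written during switch-out */
};

/* Old CPU, end of the context switch (finish_lock_switch()). */
static void publish_descheduled(struct task_like *t, int state)
{
        t->switched_out_state = state;
        /* release: prior stores are visible to anyone who sees on_cpu == 0 */
        atomic_store_explicit(&t->on_cpu, false, memory_order_release);
}

/* Waking CPU (try_to_wake_up()). */
static int wait_for_descheduled(struct task_like *t)
{
        /* acquire: after reading 0, the store above is guaranteed visible */
        while (atomic_load_explicit(&t->on_cpu, memory_order_acquire))
                ;       /* cpu_relax() in the kernel loop */

        return t->switched_out_state;
}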
@@ -2584,31 +2661,34 @@ out:
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
  *
- * Put @p on the run-queue if it's not alredy there.  The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.  this_rq() stays locked over invocation.
+ * the current task.
  */
 static void try_to_wake_up_local(struct task_struct *p)
 {
        struct rq *rq = task_rq(p);
-       bool success = false;
 
        BUG_ON(rq != this_rq());
        BUG_ON(p == current);
        lockdep_assert_held(&rq->lock);
 
+       if (!raw_spin_trylock(&p->pi_lock)) {
+               raw_spin_unlock(&rq->lock);
+               raw_spin_lock(&p->pi_lock);
+               raw_spin_lock(&rq->lock);
+       }
+
        if (!(p->state & TASK_NORMAL))
-               return;
+               goto out;
 
-       if (!p->se.on_rq) {
-               if (likely(!task_running(rq, p))) {
-                       schedstat_inc(rq, ttwu_count);
-                       schedstat_inc(rq, ttwu_local);
-               }
-               ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
-               success = true;
-       }
-       ttwu_post_activation(p, rq, 0, success);
+       if (!p->on_rq)
+               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+       ttwu_do_wakeup(rq, p, 0);
+       ttwu_stat(p, smp_processor_id(), 0);
+out:
+       raw_spin_unlock(&p->pi_lock);
 }
 
 /**
@@ -2641,18 +2721,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
  */
 static void __sched_fork(struct task_struct *p)
 {
+       p->on_rq                        = 0;
+
+       p->se.on_rq                     = 0;
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
+       p->se.vruntime                  = 0;
+       INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
        INIT_LIST_HEAD(&p->rt.run_list);
-       p->se.on_rq = 0;
-       INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2664,6 +2747,7 @@ static void __sched_fork(struct task_struct *p)
  */
 void sched_fork(struct task_struct *p, int clone_flags)
 {
+       unsigned long flags;
        int cpu = get_cpu();
 
        __sched_fork(p);
@@ -2714,22 +2798,24 @@ void sched_fork(struct task_struct *p, int clone_flags)
         *
         * Silence PROVE_RCU.
         */
-       rcu_read_lock();
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
        set_task_cpu(p, cpu);
-       rcu_read_unlock();
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        if (likely(sched_info_on()))
                memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-       p->oncpu = 0;
+#if defined(CONFIG_SMP)
+       p->on_cpu = 0;
 #endif
 #ifdef CONFIG_PREEMPT
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
 #endif
+#ifdef CONFIG_SMP
        plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
 
        put_cpu();
 }
@@ -2745,37 +2831,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
        unsigned long flags;
        struct rq *rq;
-       int cpu __maybe_unused = get_cpu();
 
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
-       rq = task_rq_lock(p, &flags);
-       p->state = TASK_WAKING;
-
        /*
         * Fork balancing, do it here and not earlier because:
         *  - cpus_allowed can change in the fork path
         *  - any previously selected cpu might disappear through hotplug
-        *
-        * We set TASK_WAKING so that select_task_rq() can drop rq->lock
-        * without people poking at ->cpus_allowed.
         */
-       cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
-       set_task_cpu(p, cpu);
-
-       p->state = TASK_RUNNING;
-       task_rq_unlock(rq, &flags);
+       set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
 #endif
 
-       rq = task_rq_lock(p, &flags);
+       rq = __task_rq_lock(p);
        activate_task(rq, p, 0);
-       trace_sched_wakeup_new(p, 1);
+       p->on_rq = 1;
+       trace_sched_wakeup_new(p, true);
        check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_woken)
                p->sched_class->task_woken(rq, p);
 #endif
-       task_rq_unlock(rq, &flags);
-       put_cpu();
+       task_rq_unlock(rq, p, &flags);
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2853,9 +2929,12 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
+       sched_info_switch(prev, next);
+       perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
+       trace_sched_switch(prev, next);
 }
 
 /**
@@ -2988,7 +3067,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
        struct mm_struct *mm, *oldmm;
 
        prepare_task_switch(rq, prev, next);
-       trace_sched_switch(prev, next);
+
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@ -3481,27 +3560,22 @@ void sched_exec(void)
 {
        struct task_struct *p = current;
        unsigned long flags;
-       struct rq *rq;
        int dest_cpu;
 
-       rq = task_rq_lock(p, &flags);
-       dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
        if (dest_cpu == smp_processor_id())
                goto unlock;
 
-       /*
-        * select_task_rq() can race against ->cpus_allowed
-        */
-       if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-           likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+       if (likely(cpu_active(dest_cpu))) {
                struct migration_arg arg = { p, dest_cpu };
 
-               task_rq_unlock(rq, &flags);
-               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
                return;
        }
 unlock:
-       task_rq_unlock(rq, &flags);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 
 #endif
@@ -3538,7 +3612,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
 
        rq = task_rq_lock(p, &flags);
        ns = do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
 
        return ns;
 }
@@ -3556,7 +3630,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
        rq = task_rq_lock(p, &flags);
        ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
 
        return ns;
 }
@@ -3580,7 +3654,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
        rq = task_rq_lock(p, &flags);
        thread_group_cputime(p, &totals);
        ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
 
        return ns;
 }
@@ -3645,6 +3719,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 }
 
 /*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+                       cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+       cputime64_t tmp = cputime_to_cputime64(cputime);
+
+       /* Add system time to process. */
+       p->stime = cputime_add(p->stime, cputime);
+       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+       account_group_system_time(p, cputime);
+
+       /* Add system time to cpustat. */
+       *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+       /* Account for system time used */
+       acct_update_integrals(p);
+}
+
+/*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3655,36 +3755,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                         cputime_t cputime, cputime_t cputime_scaled)
 {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp;
+       cputime64_t *target_cputime64;
 
        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                account_guest_time(p, cputime, cputime_scaled);
                return;
        }
 
-       /* Add system time to process. */
-       p->stime = cputime_add(p->stime, cputime);
-       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-       account_group_system_time(p, cputime);
-
-       /* Add system time to cpustat. */
-       tmp = cputime_to_cputime64(cputime);
        if (hardirq_count() - hardirq_offset)
-               cpustat->irq = cputime64_add(cpustat->irq, tmp);
+               target_cputime64 = &cpustat->irq;
        else if (in_serving_softirq())
-               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+               target_cputime64 = &cpustat->softirq;
        else
-               cpustat->system = cputime64_add(cpustat->system, tmp);
+               target_cputime64 = &cpustat->system;
 
-       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-       /* Account for system time used */
-       acct_update_integrals(p);
+       __account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
 /*
  * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
  */
 void account_steal_time(cputime_t cputime)
 {
@@ -3712,6 +3802,73 @@ void account_idle_time(cputime_t cputime)
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq)
+{
+       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+       if (irqtime_account_hi_update()) {
+               cpustat->irq = cputime64_add(cpustat->irq, tmp);
+       } else if (irqtime_account_si_update()) {
+               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+       } else if (this_cpu_ksoftirqd() == p) {
+               /*
+                * ksoftirqd time does not get accounted in cpu_softirq_time.
+                * So, we have to handle it separately here.
+                * Also, p->stime needs to be updated for ksoftirqd.
+                */
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       &cpustat->softirq);
+       } else if (user_tick) {
+               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else if (p == rq->idle) {
+               account_idle_time(cputime_one_jiffy);
+       } else if (p->flags & PF_VCPU) { /* System time or guest time */
+               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else {
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       &cpustat->system);
+       }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+       int i;
+       struct rq *rq = this_rq();
+
+       for (i = 0; i < ticks; i++)
+               irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -3722,6 +3879,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
        cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
        struct rq *rq = this_rq();
 
+       if (sched_clock_irqtime) {
+               irqtime_account_process_tick(p, user_tick, rq);
+               return;
+       }
+
        if (user_tick)
                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3747,6 +3909,12 @@ void account_steal_ticks(unsigned long ticks)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+
+       if (sched_clock_irqtime) {
+               irqtime_account_idle_ticks(ticks);
+               return;
+       }
+
        account_idle_time(jiffies_to_cputime(ticks));
 }
 
@@ -3964,7 +4132,7 @@ static inline void schedule_debug(struct task_struct *prev)
        schedstat_inc(this_rq(), sched_count);
 #ifdef CONFIG_SCHEDSTATS
        if (unlikely(prev->lock_depth >= 0)) {
-               schedstat_inc(this_rq(), bkl_count);
+               schedstat_inc(this_rq(), rq_sched_info.bkl_count);
                schedstat_inc(prev, sched_info.bkl_count);
        }
 #endif
@@ -3972,7 +4140,7 @@ static inline void schedule_debug(struct task_struct *prev)
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-       if (prev->se.on_rq)
+       if (prev->on_rq)
                update_rq_clock(rq);
        prev->sched_class->put_prev_task(rq, prev);
 }
@@ -4022,9 +4190,6 @@ need_resched:
        rcu_note_context_switch(cpu);
        prev = rq->curr;
 
-       release_kernel_lock(prev);
-need_resched_nonpreemptible:
-
        schedule_debug(prev);
 
        if (sched_feat(HRTICK))
@@ -4037,11 +4202,13 @@ need_resched_nonpreemptible:
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
+                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+                       prev->on_rq = 0;
+
                        /*
-                        * If a worker is going to sleep, notify and
-                        * ask workqueue whether it wants to wake up a
-                        * task to maintain concurrency.  If so, wake
-                        * up the task.
+                        * If a worker went to sleep, notify and ask workqueue
+                        * whether it wants to wake up a task to maintain
+                        * concurrency.
                         */
                        if (prev->flags & PF_WQ_WORKER) {
                                struct task_struct *to_wakeup;
@@ -4050,7 +4217,16 @@ need_resched_nonpreemptible:
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
-                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+
+                       /*
+                        * If we are going to sleep and we have plugged IO
+                        * queued, make sure to submit it to avoid deadlocks.
+                        */
+                       if (blk_needs_flush_plug(prev)) {
+                               raw_spin_unlock(&rq->lock);
+                               blk_schedule_flush_plug(prev);
+                               raw_spin_lock(&rq->lock);
+                       }
                }
                switch_count = &prev->nvcsw;
        }
@@ -4066,13 +4242,9 @@ need_resched_nonpreemptible:
        rq->skip_clock_update = 0;
 
        if (likely(prev != next)) {
-               sched_info_switch(prev, next);
-               perf_event_task_sched_out(prev, next);
-
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;
-               WARN_ON_ONCE(test_tsk_need_resched(next));
 
                context_switch(rq, prev, next); /* unlocks the rq */
                /*
@@ -4088,9 +4260,6 @@ need_resched_nonpreemptible:
 
        post_schedule(rq);
 
-       if (unlikely(reacquire_kernel_lock(prev)))
-               goto need_resched_nonpreemptible;
-
        preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;
@@ -4098,70 +4267,53 @@ need_resched_nonpreemptible:
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+       bool ret = false;
+
+       rcu_read_lock();
+       if (lock->owner != owner)
+               goto fail;
+
+       /*
+        * Ensure we emit the owner->on_cpu dereference _after_ checking that
+        * lock->owner still matches owner.  If that fails, owner might point
+        * to free()d memory; if it still matches, the rcu_read_lock() ensures
+        * the memory stays valid.
+        */
+       barrier();
+
+       ret = owner->on_cpu;
+fail:
+       rcu_read_unlock();
+
+       return ret;
+}
+
 /*
  * Look out! "owner" is an entirely speculative pointer
  * access and not reliable.
  */
-int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 {
-       unsigned int cpu;
-       struct rq *rq;
-
        if (!sched_feat(OWNER_SPIN))
                return 0;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       /*
-        * Need to access the cpu field knowing that
-        * DEBUG_PAGEALLOC could have unmapped it if
-        * the mutex owner just released it and exited.
-        */
-       if (probe_kernel_address(&owner->cpu, cpu))
-               return 0;
-#else
-       cpu = owner->cpu;
-#endif
+       while (owner_running(lock, owner)) {
+               if (need_resched())
+                       return 0;
 
-       /*
-        * Even if the access succeeded (likely case),
-        * the cpu field may no longer be valid.
-        */
-       if (cpu >= nr_cpumask_bits)
-               return 0;
+               arch_mutex_cpu_relax();
+       }
 
        /*
-        * We need to validate that we can do a
-        * get_cpu() and that we have the percpu area.
+        * If the owner changed to another task (rather than releasing the
+        * lock), there is likely heavy contention; stop spinning.
         */
-       if (!cpu_online(cpu))
+       if (lock->owner)
                return 0;
 
-       rq = cpu_rq(cpu);
-
-       for (;;) {
-               /*
-                * Owner changed, break to re-assess state.
-                */
-               if (lock->owner != owner) {
-                       /*
-                        * If the lock has switched to a different owner,
-                        * we likely have heavy contention. Return 0 to quit
-                        * optimistic spinning and not contend further:
-                        */
-                       if (lock->owner)
-                               return 0;
-                       break;
-               }
-
-               /*
-                * Is that owner really running on that cpu?
-                */
-               if (task_thread_info(rq->curr) != owner || need_resched())
-                       return 0;
-
-               cpu_relax();
-       }
-
        return 1;
 }
 #endif
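For context, a simplified sketch of how an optimistic-spinning acquirer consumes the helper above; the real slowpath lives in kernel/mutex.c and carries lockdep and waiter bookkeeping omitted here, so try_spin_acquire() is illustrative only:

#include <linux/compiler.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <asm/mutex.h>

/* Illustrative spinner: not the actual kernel/mutex.c slowpath. */
static bool try_spin_acquire(struct mutex *lock)
{
	for (;;) {
		struct task_struct *owner = ACCESS_ONCE(lock->owner);

		/* Owner on a CPU: spin until it releases the lock or sleeps. */
		if (owner && !mutex_spin_on_owner(lock, owner))
			return false;	/* owner slept or changed: go block */

		/* Lock looks free: try to take it. */
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
			return true;

		/* No recorded owner yet the lock is held: don't burn the CPU. */
		if (!owner && need_resched())
			return false;

		arch_mutex_cpu_relax();
	}
}
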
@@ -4291,6 +4443,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 {
        __wake_up_common(q, mode, 1, 0, key);
 }
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4471,7 +4624,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
  * This waits for either a completion of a specific task to be signaled or for a
  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
  */
-unsigned long __sched
+long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
                                          unsigned long timeout)
 {
@@ -4504,7 +4657,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
  * signaled or for a specified timeout to expire. It can be
  * interrupted by a kill signal. The timeout is in jiffies.
  */
-unsigned long __sched
+long __sched
 wait_for_completion_killable_timeout(struct completion *x,
                                     unsigned long timeout)
 {
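The switch to a signed return type matters to callers: these helpers return -ERESTARTSYS when interrupted, 0 on timeout, and the remaining jiffies otherwise, so the result must not be stored in an unsigned long. A small illustrative caller (the wait_for_done() wrapper is made up for the example):

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

/* Illustrative wrapper: wait up to one second for @done to complete. */
static int wait_for_done(struct completion *done)
{
	long ret = wait_for_completion_interruptible_timeout(done, HZ);

	if (ret < 0)		/* interrupted by a signal (-ERESTARTSYS) */
		return ret;
	if (ret == 0)		/* the full second elapsed */
		return -ETIMEDOUT;
	return 0;		/* completed; ret held the jiffies left over */
}
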
@@ -4620,19 +4773,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-       unsigned long flags;
        int oldprio, on_rq, running;
        struct rq *rq;
        const struct sched_class *prev_class;
 
        BUG_ON(prio < 0 || prio > MAX_PRIO);
 
-       rq = task_rq_lock(p, &flags);
+       rq = __task_rq_lock(p);
 
        trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
        prev_class = p->sched_class;
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
        running = task_current(rq, p);
        if (on_rq)
                dequeue_task(rq, p, 0);
@@ -4648,12 +4800,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        if (running)
                p->sched_class->set_curr_task(rq);
-       if (on_rq) {
+       if (on_rq)
                enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
-               check_class_changed(rq, p, prev_class, oldprio, running);
-       }
-       task_rq_unlock(rq, &flags);
+       check_class_changed(rq, p, prev_class, oldprio);
+       __task_rq_unlock(rq);
 }
 
 #endif
@@ -4681,7 +4832,7 @@ void set_user_nice(struct task_struct *p, long nice)
                p->static_prio = NICE_TO_PRIO(nice);
                goto out_unlock;
        }
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
        if (on_rq)
                dequeue_task(rq, p, 0);
 
@@ -4701,7 +4852,7 @@ void set_user_nice(struct task_struct *p, long nice)
                        resched_task(rq->curr);
        }
 out_unlock:
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
 }
 EXPORT_SYMBOL(set_user_nice);
 
@@ -4815,8 +4966,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 static void
 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 {
-       BUG_ON(p->se.on_rq);
-
        p->policy = policy;
        p->rt_priority = prio;
        p->normal_prio = normal_prio(p);
@@ -4839,14 +4988,17 @@ static bool check_same_owner(struct task_struct *p)
 
        rcu_read_lock();
        pcred = __task_cred(p);
-       match = (cred->euid == pcred->euid ||
-                cred->euid == pcred->uid);
+       if (cred->user->user_ns == pcred->user->user_ns)
+               match = (cred->euid == pcred->euid ||
+                        cred->euid == pcred->uid);
+       else
+               match = false;
        rcu_read_unlock();
        return match;
 }
 
 static int __sched_setscheduler(struct task_struct *p, int policy,
-                               struct sched_param *param, bool user)
+                               const struct sched_param *param, bool user)
 {
        int retval, oldprio, oldpolicy = -1, on_rq, running;
        unsigned long flags;
@@ -4900,12 +5052,15 @@ recheck:
                            param->sched_priority > rlim_rtprio)
                                return -EPERM;
                }
+
                /*
-                * Like positive nice levels, dont allow tasks to
-                * move out of SCHED_IDLE either:
+                * Treat SCHED_IDLE as nice 20. Only allow a switch to
+                * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
-                       return -EPERM;
+               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+                       if (!can_nice(p, TASK_NICE(p)))
+                               return -EPERM;
+               }
 
                /* can't change other user's priorities */
                if (!check_same_owner(p))
@@ -4925,21 +5080,29 @@ recheck:
        /*
         * make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
-        */
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-       /*
-        * To be able to change p->policy safely, the apropriate
+        *
+        * To be able to change p->policy safely, the appropriate
         * runqueue lock must be held.
         */
-       rq = __task_rq_lock(p);
+       rq = task_rq_lock(p, &flags);
 
        /*
         * Changing the policy of the stop threads is a very bad idea
         */
        if (p == rq->stop) {
+               task_rq_unlock(rq, p, &flags);
+               return -EINVAL;
+       }
+
+       /*
+        * If not changing anything, there's no need to proceed further:
+        */
+       if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+                       param->sched_priority == p->rt_priority))) {
+
                __task_rq_unlock(rq);
                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-               return -EINVAL;
+               return 0;
        }
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -4949,9 +5112,9 @@ recheck:
                 * assigned.
                 */
                if (rt_bandwidth_enabled() && rt_policy(policy) &&
-                               task_group(p)->rt_bandwidth.rt_runtime == 0) {
-                       __task_rq_unlock(rq);
-                       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+                               task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+                               !task_group_is_autogroup(task_group(p))) {
+                       task_rq_unlock(rq, p, &flags);
                        return -EPERM;
                }
        }
@@ -4960,11 +5123,10 @@ recheck:
        /* recheck policy now with rq lock held */
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
-               __task_rq_unlock(rq);
-               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               task_rq_unlock(rq, p, &flags);
                goto recheck;
        }
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
        running = task_current(rq, p);
        if (on_rq)
                deactivate_task(rq, p, 0);
@@ -4979,13 +5141,11 @@ recheck:
 
        if (running)
                p->sched_class->set_curr_task(rq);
-       if (on_rq) {
+       if (on_rq)
                activate_task(rq, p, 0);
 
-               check_class_changed(rq, p, prev_class, oldprio, running);
-       }
-       __task_rq_unlock(rq);
-       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+       check_class_changed(rq, p, prev_class, oldprio);
+       task_rq_unlock(rq, p, &flags);
 
        rt_mutex_adjust_pi(p);
 
@@ -5001,7 +5161,7 @@ recheck:
  * NOTE that the task may be already dead.
  */
 int sched_setscheduler(struct task_struct *p, int policy,
-                      struct sched_param *param)
+                      const struct sched_param *param)
 {
        return __sched_setscheduler(p, policy, param, true);
 }
@@ -5019,7 +5179,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
  * but our caller might not have that capability.
  */
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
-                              struct sched_param *param)
+                              const struct sched_param *param)
 {
        return __sched_setscheduler(p, policy, param, false);
 }
@@ -5165,7 +5325,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
                goto out_free_cpus_allowed;
        }
        retval = -EPERM;
-       if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+       if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
                goto out_unlock;
 
        retval = security_task_setscheduler(p);
@@ -5236,7 +5396,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
        struct task_struct *p;
        unsigned long flags;
-       struct rq *rq;
        int retval;
 
        get_online_cpus();
@@ -5251,9 +5410,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
        if (retval)
                goto out_unlock;
 
-       rq = task_rq_lock(p, &flags);
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
        cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
-       task_rq_unlock(rq, &flags);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
        rcu_read_unlock();
@@ -5400,6 +5559,67 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);
 
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+       struct task_struct *curr = current;
+       struct rq *rq, *p_rq;
+       unsigned long flags;
+       bool yielded = 0;
+
+       local_irq_save(flags);
+       rq = this_rq();
+
+again:
+       p_rq = task_rq(p);
+       double_rq_lock(rq, p_rq);
+       while (task_rq(p) != p_rq) {
+               double_rq_unlock(rq, p_rq);
+               goto again;
+       }
+
+       if (!curr->sched_class->yield_to_task)
+               goto out;
+
+       if (curr->sched_class != p->sched_class)
+               goto out;
+
+       if (task_running(p_rq, p) || p->state)
+               goto out;
+
+       yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+       if (yielded) {
+               schedstat_inc(rq, yld_count);
+               /*
+                * Make p's CPU reschedule; pick_next_entity takes care of
+                * fairness.
+                */
+               if (preempt && rq != p_rq)
+                       resched_task(p_rq->curr);
+       }
+
+out:
+       double_rq_unlock(rq, p_rq);
+       local_irq_restore(flags);
+
+       if (yielded)
+               schedule();
+
+       return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
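An illustrative consumer of yield_to(), loosely modelled on a virtualization host boosting whichever vCPU thread currently holds a guest lock; the candidates array and the boost_lock_holder() name are assumptions for the sketch, and the caller must keep the task structs alive as the kerneldoc above requires:

#include <linux/sched.h>

/* Illustrative only: try to boost one of @nr candidate threads. */
static void boost_lock_holder(struct task_struct **candidates, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* preempt=true: kick the target's CPU so it runs soon. */
		if (yield_to(candidates[i], true))
			break;	/* boosted somebody; we also yielded */
	}
}
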
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
@@ -5410,6 +5630,7 @@ void __sched io_schedule(void)
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       blk_flush_plug(current);
        current->in_iowait = 1;
        schedule();
        current->in_iowait = 0;
@@ -5425,6 +5646,7 @@ long __sched io_schedule_timeout(long timeout)
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       blk_flush_plug(current);
        current->in_iowait = 1;
        ret = schedule_timeout(timeout);
        current->in_iowait = 0;
@@ -5515,7 +5737,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 
        rq = task_rq_lock(p, &flags);
        time_slice = p->sched_class->get_rr_interval(rq, p);
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
 
        rcu_read_unlock();
        jiffies_to_timespec(time_slice, &t);
@@ -5535,7 +5757,7 @@ void sched_show_task(struct task_struct *p)
        unsigned state;
 
        state = p->state ? __ffs(p->state) + 1 : 0;
-       printk(KERN_INFO "%-13.13s %c", p->comm,
+       printk(KERN_INFO "%-15.15s %c", p->comm,
                state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
        if (state == TASK_RUNNING)
@@ -5573,7 +5795,7 @@ void show_state_filter(unsigned long state_filter)
        do_each_thread(g, p) {
                /*
                 * reset the NMI-timeout, listing all files on a slow
-                * console might take alot of time:
+                * console might take a lot of time:
                 */
                touch_nmi_watchdog();
                if (!state_filter || (p->state & state_filter))
@@ -5633,8 +5855,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        rcu_read_unlock();
 
        rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-       idle->oncpu = 1;
+#if defined(CONFIG_SMP)
+       idle->on_cpu = 1;
 #endif
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -5648,7 +5870,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
-       ftrace_graph_init_task(idle);
+       ftrace_graph_init_idle_task(idle, cpu);
 }
 
 /*
@@ -5699,7 +5921,6 @@ static void update_sysctl(void)
        SET_SYSCTL(sched_min_granularity);
        SET_SYSCTL(sched_latency);
        SET_SYSCTL(sched_wakeup_granularity);
-       SET_SYSCTL(sched_shares_ratelimit);
 #undef SET_SYSCTL
 }
 
@@ -5739,18 +5960,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        unsigned int dest_cpu;
        int ret = 0;
 
-       /*
-        * Serialize against TASK_WAKING so that ttwu() and wunt() can
-        * drop the rq->lock and still rely on ->cpus_allowed.
-        */
-again:
-       while (task_is_waking(p))
-               cpu_relax();
        rq = task_rq_lock(p, &flags);
-       if (task_is_waking(p)) {
-               task_rq_unlock(rq, &flags);
-               goto again;
-       }
 
        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
@@ -5775,16 +5985,16 @@ again:
                goto out;
 
        dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (migrate_task(p, dest_cpu)) {
+       if (need_migrate_task(p)) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
-               task_rq_unlock(rq, &flags);
+               task_rq_unlock(rq, p, &flags);
                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                tlb_migrate_finish(p->mm);
                return 0;
        }
 out:
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
 
        return ret;
 }
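Typical in-kernel usage of set_cpus_allowed_ptr() for reference; pin_to_cpu2() is a made-up helper, while cpumask_of() and the -EINVAL return for a mask with no active CPU follow from the code above:

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Illustrative: restrict @tsk to CPU 2, e.g. for a per-CPU helper thread. */
static int pin_to_cpu2(struct task_struct *tsk)
{
	/* Fails with -EINVAL if CPU 2 is not in cpu_active_mask. */
	return set_cpus_allowed_ptr(tsk, cpumask_of(2));
}
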
@@ -5812,6 +6022,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
        rq_src = cpu_rq(src_cpu);
        rq_dest = cpu_rq(dest_cpu);
 
+       raw_spin_lock(&p->pi_lock);
        double_rq_lock(rq_src, rq_dest);
        /* Already moved. */
        if (task_cpu(p) != src_cpu)
@@ -5824,7 +6035,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         * If we're not on a rq, the next wake-up will ensure we're
         * placed properly.
         */
-       if (p->se.on_rq) {
+       if (p->on_rq) {
                deactivate_task(rq_src, p, 0);
                set_task_cpu(p, dest_cpu);
                activate_task(rq_dest, p, 0);
@@ -5834,6 +6045,7 @@ done:
        ret = 1;
 fail:
        double_rq_unlock(rq_src, rq_dest);
+       raw_spin_unlock(&p->pi_lock);
        return ret;
 }
 
@@ -5857,29 +6069,20 @@ static int migration_cpu_stop(void *data)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
 /*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
  */
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
 {
-       struct rq *rq = cpu_rq(dead_cpu);
-       int needs_cpu, uninitialized_var(dest_cpu);
-       unsigned long flags;
+       struct mm_struct *mm = current->active_mm;
 
-       local_irq_save(flags);
+       BUG_ON(cpu_online(smp_processor_id()));
 
-       raw_spin_lock(&rq->lock);
-       needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
-       if (needs_cpu)
-               dest_cpu = select_fallback_rq(dead_cpu, p);
-       raw_spin_unlock(&rq->lock);
-       /*
-        * It can only fail if we race with set_cpus_allowed(),
-        * in the racer should migrate the task anyway.
-        */
-       if (needs_cpu)
-               __migrate_task(p, dead_cpu, dest_cpu);
-       local_irq_restore(flags);
+       if (mm != &init_mm)
+               switch_mm(mm, &init_mm, current);
+       mmdrop(mm);
 }
 
 /*
@@ -5892,128 +6095,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-       unsigned long flags;
 
-       local_irq_save(flags);
-       double_rq_lock(rq_src, rq_dest);
        rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
        rq_src->nr_uninterruptible = 0;
-       double_rq_unlock(rq_src, rq_dest);
-       local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
-       struct task_struct *p, *t;
-
-       read_lock(&tasklist_lock);
-
-       do_each_thread(t, p) {
-               if (p == current)
-                       continue;
-
-               if (task_cpu(p) == src_cpu)
-                       move_task_off_dead_cpu(src_cpu, p);
-       } while_each_thread(t, p);
-
-       read_unlock(&tasklist_lock);
 }
 
 /*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
  */
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
 {
-       int this_cpu = smp_processor_id();
-       struct rq *rq = cpu_rq(this_cpu);
-       struct task_struct *p = rq->idle;
-       unsigned long flags;
-
-       /* cpu has to be offline */
-       BUG_ON(cpu_online(this_cpu));
-
-       /*
-        * Strictly not necessary since rest of the CPUs are stopped by now
-        * and interrupts disabled on the current cpu.
-        */
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
-       activate_task(rq, p, 0);
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
 }
 
 /*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq; sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we're in stop_machine() and
+ * there's no concurrency possible; we hold the required locks anyway
+ * because of lock validation efforts.
  */
-void idle_task_exit(void)
-{
-       struct mm_struct *mm = current->active_mm;
-
-       BUG_ON(cpu_online(smp_processor_id()));
-
-       if (mm != &init_mm)
-               switch_mm(mm, &init_mm, current);
-       mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
 {
        struct rq *rq = cpu_rq(dead_cpu);
-
-       /* Must be exiting, otherwise would be on tasklist. */
-       BUG_ON(!p->exit_state);
-
-       /* Cannot have done final schedule yet: would have vanished. */
-       BUG_ON(p->state == TASK_DEAD);
-
-       get_task_struct(p);
+       struct task_struct *next, *stop = rq->stop;
+       int dest_cpu;
 
        /*
-        * Drop lock around migration; if someone else moves it,
-        * that's OK. No task can be added to this CPU, so iteration is
-        * fine.
+        * Fudge the rq selection such that the below task selection loop
+        * doesn't get stuck on the currently eligible stop task.
+        *
+        * We're currently inside stop_machine() and the rq is either stuck
+        * in the stop_machine_cpu_stop() loop, or we're executing this code;
+        * either way we should never end up calling schedule() until we're
+        * done here.
         */
-       raw_spin_unlock_irq(&rq->lock);
-       move_task_off_dead_cpu(dead_cpu, p);
-       raw_spin_lock_irq(&rq->lock);
-
-       put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
-       struct rq *rq = cpu_rq(dead_cpu);
-       struct task_struct *next;
+       rq->stop = NULL;
 
        for ( ; ; ) {
-               if (!rq->nr_running)
+               /*
+                * There's this thread running; bail when that's the only
+                * remaining thread.
+                */
+               if (rq->nr_running == 1)
                        break;
+
                next = pick_next_task(rq);
-               if (!next)
-                       break;
+               BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
-               migrate_dead(dead_cpu, next);
 
+               /* Find suitable destination for @next, with force if needed. */
+               dest_cpu = select_fallback_rq(dead_cpu, next);
+               raw_spin_unlock(&rq->lock);
+
+               __migrate_task(next, dead_cpu, dest_cpu);
+
+               raw_spin_lock(&rq->lock);
        }
-}
 
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
-       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-       rq->calc_load_active = 0;
+       rq->stop = stop;
 }
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6223,15 +6367,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
        unsigned long flags;
        struct rq *rq = cpu_rq(cpu);
 
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
 
        case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
                rq->calc_load_update = calc_load_update;
                break;
 
        case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
                /* Update our root-domain */
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
@@ -6243,33 +6385,26 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               migrate_live_tasks(cpu);
-               /* Idle task back to normal (off runqueue, low prio) */
-               raw_spin_lock_irq(&rq->lock);
-               deactivate_task(rq, rq->idle, 0);
-               __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
-               rq->idle->sched_class = &idle_sched_class;
-               migrate_dead_tasks(cpu);
-               raw_spin_unlock_irq(&rq->lock);
-               migrate_nr_uninterruptible(rq);
-               BUG_ON(rq->nr_running != 0);
-               calc_global_load_remove(rq);
-               break;
-
        case CPU_DYING:
-       case CPU_DYING_FROZEN:
+               sched_ttwu_pending();
                /* Update our root-domain */
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
                }
+               migrate_tasks(cpu);
+               BUG_ON(rq->nr_running != 1); /* the migration thread */
                raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+               migrate_nr_uninterruptible(rq);
+               calc_global_load_remove(rq);
                break;
 #endif
        }
+
+       update_max_interval();
+
        return NOTIFY_OK;
 }
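The new switch on `action & ~CPU_TASKS_FROZEN` works because, per the CPU notifier definitions, every *_FROZEN value is simply the plain event with CPU_TASKS_FROZEN OR-ed in; a minimal sketch of the idiom, with handles_online() as an invented name:

#include <linux/cpu.h>

/* One case now covers both CPU_ONLINE and CPU_ONLINE_FROZEN. */
static int handles_online(unsigned long action)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:	/* CPU_ONLINE_FROZEN == CPU_ONLINE | CPU_TASKS_FROZEN */
		return 1;
	default:
		return 0;
	}
}
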
 
@@ -7955,6 +8090,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
        INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
        cfs_rq->rq = rq;
+       /* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+       cfs_rq->load_stamp = 1;
+#endif
 #endif
        cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
@@ -7997,18 +8136,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-                               struct sched_entity *se, int cpu, int add,
+                               struct sched_entity *se, int cpu,
                                struct sched_entity *parent)
 {
        struct rq *rq = cpu_rq(cpu);
        tg->cfs_rq[cpu] = cfs_rq;
        init_cfs_rq(cfs_rq, rq);
        cfs_rq->tg = tg;
-       if (add)
-               list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 
        tg->se[cpu] = se;
-       /* se could be NULL for init_task_group */
+       /* se could be NULL for root_task_group */
        if (!se)
                return;
 
@@ -8018,15 +8155,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                se->cfs_rq = parent->my_q;
 
        se->my_q = cfs_rq;
-       se->load.weight = tg->shares;
-       se->load.inv_weight = 0;
+       update_load_set(&se->load, 0);
        se->parent = parent;
 }
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-               struct sched_rt_entity *rt_se, int cpu, int add,
+               struct sched_rt_entity *rt_se, int cpu,
                struct sched_rt_entity *parent)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -8035,8 +8171,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
        init_rt_rq(rt_rq, rq);
        rt_rq->tg = tg;
        rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-       if (add)
-               list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 
        tg->rt_se[cpu] = rt_se;
        if (!rt_se)
@@ -8071,18 +8205,18 @@ void __init sched_init(void)
                ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-               init_task_group.se = (struct sched_entity **)ptr;
+               root_task_group.se = (struct sched_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
 
-               init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+               root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
-               init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+               root_task_group.rt_se = (struct sched_rt_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
 
-               init_task_group.rt_rq = (struct rt_rq **)ptr;
+               root_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
 
 #endif /* CONFIG_RT_GROUP_SCHED */
@@ -8102,20 +8236,16 @@ void __init sched_init(void)
                        global_rt_period(), global_rt_runtime());
 
 #ifdef CONFIG_RT_GROUP_SCHED
-       init_rt_bandwidth(&init_task_group.rt_bandwidth,
+       init_rt_bandwidth(&root_task_group.rt_bandwidth,
                        global_rt_period(), global_rt_runtime());
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
-       list_add(&init_task_group.list, &task_groups);
-       INIT_LIST_HEAD(&init_task_group.children);
-
+       list_add(&root_task_group.list, &task_groups);
+       INIT_LIST_HEAD(&root_task_group.children);
+       autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-       update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-                                           __alignof__(unsigned long));
-#endif
        for_each_possible_cpu(i) {
                struct rq *rq;
 
@@ -8127,38 +8257,34 @@ void __init sched_init(void)
                init_cfs_rq(&rq->cfs, rq);
                init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-               init_task_group.shares = init_task_group_load;
+               root_task_group.shares = root_task_group_load;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
                /*
-                * How much cpu bandwidth does init_task_group get?
+                * How much cpu bandwidth does root_task_group get?
                 *
                 * In case of task-groups formed thr' the cgroup filesystem, it
                 * gets 100% of the cpu resources in the system. This overall
                 * system cpu resource is divided among the tasks of
-                * init_task_group and its child task-groups in a fair manner,
+                * root_task_group and its child task-groups in a fair manner,
                 * based on each entity's (task or task-group's) weight
                 * (se->load.weight).
                 *
-                * In other words, if init_task_group has 10 tasks of weight
+                * In other words, if root_task_group has 10 tasks (of weight
                 * 1024) and two child groups A0 and A1 (of weight 1024 each),
                 * then A0's share of the cpu resource is:
                 *
                 *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                 *
-                * We achieve this by letting init_task_group's tasks sit
-                * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+                * We achieve this by letting root_task_group's tasks sit
+                * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                 */
-               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+               init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
                rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
-               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+               init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
 
                for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8238,8 +8364,6 @@ void __init sched_init(void)
                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
-       perf_event_init();
-
        scheduler_running = 1;
 }
 
@@ -8248,7 +8372,7 @@ static inline int preempt_count_equals(int preempt_offset)
 {
        int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
 
-       return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+       return (nested == preempt_offset);
 }
 
 void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8283,9 +8407,11 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+       const struct sched_class *prev_class = p->sched_class;
+       int old_prio = p->prio;
        int on_rq;
 
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
        if (on_rq)
                deactivate_task(rq, p, 0);
        __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8293,6 +8419,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
                activate_task(rq, p, 0);
                resched_task(rq->curr);
        }
+
+       check_class_changed(rq, p, prev_class, old_prio);
 }
 
 void normalize_rt_tasks(void)
@@ -8408,7 +8536,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se;
-       struct rq *rq;
        int i;
 
        tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8421,8 +8548,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        tg->shares = NICE_0_LOAD;
 
        for_each_possible_cpu(i) {
-               rq = cpu_rq(i);
-
                cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                      GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
@@ -8433,7 +8558,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                if (!se)
                        goto err_free_rq;
 
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
        }
 
        return 1;
@@ -8444,15 +8569,21 @@ err:
        return 0;
 }
 
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
-                       &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
-       list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       /*
+        * Only empty task groups can be destroyed, so we can speculatively
+        * check on_list without danger of it being re-added.
+        */
+       if (!tg->cfs_rq[cpu]->on_list)
+               return;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 #else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
@@ -8465,10 +8596,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        return 1;
 }
 
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 }
@@ -8523,7 +8650,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                if (!rt_se)
                        goto err_free_rq;
 
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
        }
 
        return 1;
@@ -8533,17 +8660,6 @@ err_free_rq:
 err:
        return 0;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
-                       &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
 #else /* !CONFIG_RT_GROUP_SCHED */
 static inline void free_rt_sched_group(struct task_group *tg)
 {
@@ -8554,14 +8670,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
        return 1;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
@@ -8569,6 +8677,7 @@ static void free_sched_group(struct task_group *tg)
 {
        free_fair_sched_group(tg);
        free_rt_sched_group(tg);
+       autogroup_free(tg);
        kfree(tg);
 }
 
@@ -8577,7 +8686,6 @@ struct task_group *sched_create_group(struct task_group *parent)
 {
        struct task_group *tg;
        unsigned long flags;
-       int i;
 
        tg = kzalloc(sizeof(*tg), GFP_KERNEL);
        if (!tg)
@@ -8590,10 +8698,6 @@ struct task_group *sched_create_group(struct task_group *parent)
                goto err;
 
        spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
-               register_fair_sched_group(tg, i);
-               register_rt_sched_group(tg, i);
-       }
        list_add_rcu(&tg->list, &task_groups);
 
        WARN_ON(!parent); /* root should already exist */
@@ -8623,11 +8727,11 @@ void sched_destroy_group(struct task_group *tg)
        unsigned long flags;
        int i;
 
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
+       /* end participation in shares distribution */
+       for_each_possible_cpu(i)
                unregister_fair_sched_group(tg, i);
-               unregister_rt_sched_group(tg, i);
-       }
+
+       spin_lock_irqsave(&task_group_lock, flags);
        list_del_rcu(&tg->list);
        list_del_rcu(&tg->siblings);
        spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8650,7 +8754,7 @@ void sched_move_task(struct task_struct *tsk)
        rq = task_rq_lock(tsk, &flags);
 
        running = task_current(rq, tsk);
-       on_rq = tsk->se.on_rq;
+       on_rq = tsk->on_rq;
 
        if (on_rq)
                dequeue_task(rq, tsk, 0);
@@ -8669,38 +8773,11 @@ void sched_move_task(struct task_struct *tsk)
        if (on_rq)
                enqueue_task(rq, tsk, 0);
 
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, tsk, &flags);
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       int on_rq;
-
-       on_rq = se->on_rq;
-       if (on_rq)
-               dequeue_entity(cfs_rq, se, 0);
-
-       se->load.weight = shares;
-       se->load.inv_weight = 0;
-
-       if (on_rq)
-               enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       struct rq *rq = cfs_rq->rq;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       __set_se_shares(se, shares);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
 static DEFINE_MUTEX(shares_mutex);
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8723,37 +8800,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
        if (tg->shares == shares)
                goto done;
 
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
-       list_del_rcu(&tg->siblings);
-       spin_unlock_irqrestore(&task_group_lock, flags);
-
-       /* wait for any ongoing reference to this group to finish */
-       synchronize_sched();
-
-       /*
-        * Now we are free to modify the group's share on each cpu
-        * w/o tripping rebalance_share or load_balance_fair.
-        */
        tg->shares = shares;
        for_each_possible_cpu(i) {
-               /*
-                * force a rebalance
-                */
-               cfs_rq_set_shares(tg->cfs_rq[i], 0);
-               set_se_shares(tg->se[i], shares);
+               struct rq *rq = cpu_rq(i);
+               struct sched_entity *se;
+
+               se = tg->se[i];
+               /* Propagate contribution to hierarchy */
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               for_each_sched_entity(se)
+                       update_cfs_shares(group_cfs_rq(se));
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
        }
 
-       /*
-        * Enable load balance activity on this group, by inserting it back on
-        * each cpu's rq->leaf_cfs_rq_list.
-        */
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               register_fair_sched_group(tg, i);
-       list_add_rcu(&tg->siblings, &tg->parent->children);
-       spin_unlock_irqrestore(&task_group_lock, flags);
 done:
        mutex_unlock(&shares_mutex);
        return 0;
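For scale, NICE_0_LOAD is 1024, so a group's shares are directly comparable to nice-0 task weights. An illustrative in-kernel caller (make_heavy_group() is invented; sched_create_group() and sched_group_set_shares() are the interfaces shown in this file):

#include <linux/err.h>
#include <linux/sched.h>

/* Illustrative: create a child group weighted like two nice-0 tasks. */
static struct task_group *make_heavy_group(struct task_group *parent)
{
	struct task_group *tg = sched_create_group(parent);

	if (!IS_ERR(tg))
		sched_group_set_shares(tg, 2 * 1024);

	return tg;
}
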
@@ -9052,7 +9111,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
        if (!cgrp->parent) {
                /* This is early initialization for the top cgroup */
-               return &init_task_group.css;
+               return &root_task_group.css;
        }
 
        parent = cgroup_tg(cgrp->parent);
@@ -9123,6 +9182,21 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
        }
 }
 
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+               struct cgroup *old_cgrp, struct task_struct *task)
+{
+       /*
+        * cgroup_exit() is called in the copy_process() failure path.
+        * Ignore this case since the task hasn't run yet; this avoids
+        * trying to poke a half-freed task state from generic code.
+        */
+       if (!(task->flags & PF_EXITING))
+               return;
+
+       sched_move_task(task);
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
                                u64 shareval)
@@ -9195,6 +9269,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
        .destroy        = cpu_cgroup_destroy,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
+       .exit           = cpu_cgroup_exit,
        .populate       = cpu_cgroup_populate,
        .subsys_id      = cpu_cgroup_subsys_id,
        .early_init     = 1,
@@ -9479,72 +9554,3 @@ struct cgroup_subsys cpuacct_subsys = {
 };
 #endif /* CONFIG_CGROUP_CPUACCT */
 
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
-       barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
-       /*
-        * There must be a full memory barrier on each affected CPU
-        * between the time that try_stop_cpus() is called and the
-        * time that it returns.
-        *
-        * In the current initial implementation of cpu_stop, the
-        * above condition is already met when the control reaches
-        * this point and the following smp_mb() is not strictly
-        * necessary.  Do smp_mb() anyway for documentation and
-        * robustness against future implementation changes.
-        */
-       smp_mb(); /* See above comment block. */
-       return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly.  This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier.  Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
-       int snap, trycount = 0;
-
-       smp_mb();  /* ensure prior mod happens before capturing snap. */
-       snap = atomic_read(&synchronize_sched_expedited_count) + 1;
-       get_online_cpus();
-       while (try_stop_cpus(cpu_online_mask,
-                            synchronize_sched_expedited_cpu_stop,
-                            NULL) == -EAGAIN) {
-               put_online_cpus();
-               if (trycount++ < 10)
-                       udelay(trycount * num_online_cpus());
-               else {
-                       synchronize_sched();
-                       return;
-               }
-               if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
-                       smp_mb(); /* ensure test happens before caller kfree */
-                       return;
-               }
-               get_online_cpus();
-       }
-       atomic_inc(&synchronize_sched_expedited_count);
-       smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
-       put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */