Merge commit 'v2.6.34-rc6' into sched/core
Ingo Molnar [Fri, 7 May 2010 09:27:54 +0000 (11:27 +0200)]
19 files changed:
Documentation/scheduler/sched-design-CFS.txt
Documentation/scheduler/sched-rt-group.txt
include/linux/cpuset.h
include/linux/sched.h
init/Kconfig
kernel/capability.c
kernel/cpu.c
kernel/cpuset.c
kernel/cred-internals.h [deleted file]
kernel/cred.c
kernel/exit.c
kernel/sched.c
kernel/sched_debug.c
kernel/sched_fair.c
kernel/sched_features.h
kernel/sched_idletask.c
kernel/sched_rt.c
kernel/time/tick-sched.c
kernel/user.c

index 6f33593..8239ebb 100644 (file)
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group.  For example, it may be
 desirable to first provide fair CPU time to each user on the system and then to
 each task belonging to a user.
 
-CONFIG_GROUP_SCHED strives to achieve exactly that.  It lets tasks to be
+CONFIG_CGROUP_SCHED strives to achieve exactly that.  It lets tasks to be
 grouped and divides CPU time fairly among such groups.
 
 CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
@@ -220,38 +220,11 @@ SCHED_RR) tasks.
 CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
 SCHED_BATCH) tasks.
 
-At present, there are two (mutually exclusive) mechanisms to group tasks for
-CPU bandwidth control purposes:
-
- - Based on user id (CONFIG_USER_SCHED)
-
-   With this option, tasks are grouped according to their user id.
-
- - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
-
-   This options needs CONFIG_CGROUPS to be defined, and lets the administrator
+   These options need CONFIG_CGROUPS to be defined, and let the administrator
    create arbitrary groups of tasks, using the "cgroup" pseudo filesystem.  See
    Documentation/cgroups/cgroups.txt for more information about this filesystem.
 
-Only one of these options to group tasks can be chosen and not both.
-
-When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
-user and a "cpu_share" file is added in that directory.
-
-       # cd /sys/kernel/uids
-       # cat 512/cpu_share             # Display user 512's CPU share
-       1024
-       # echo 2048 > 512/cpu_share     # Modify user 512's CPU share
-       # cat 512/cpu_share             # Display user 512's CPU share
-       2048
-       #
-
-CPU bandwidth between two users is divided in the ratio of their CPU shares.
-For example: if you would like user "root" to get twice the bandwidth of user
-"guest," then set the cpu_share for both the users such that "root"'s cpu_share
-is twice "guest"'s cpu_share.
-
-When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
+When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
 group created using the pseudo filesystem.  See example steps below to create
 task groups and modify their CPU share using the "cgroups" pseudo filesystem.
 
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem.
 
        # #Launch gmplayer (or your favourite movie player)
        # echo <movie_player_pid> > multimedia/tasks
-
-8. Implementation note: user namespaces
-
-User namespaces are intended to be hierarchical.  But they are currently
-only partially implemented.  Each of those has ramifications for CFS.
-
-First, since user namespaces are hierarchical, the /sys/kernel/uids
-presentation is inadequate.  Eventually we will likely want to use sysfs
-tagging to provide private views of /sys/kernel/uids within each user
-namespace.
-
-Second, the hierarchical nature is intended to support completely
-unprivileged use of user namespaces.  So if using user groups, then
-we want the users in a user namespace to be children of the user
-who created it.
-
-That is currently unimplemented.  So instead, every user in a new
-user namespace will receive 1024 shares just like any user in the
-initial user namespace.  Note that at the moment creation of a new
-user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and
-CAP_SETGID.
index 86eabe6..605b0d4 100644 (file)
@@ -126,23 +126,12 @@ priority!
 2.3 Basis for grouping tasks
 ----------------------------
 
-There are two compile-time settings for allocating CPU bandwidth. These are
-configured using the "Basis for grouping tasks" multiple choice menu under
-General setup > Group CPU Scheduler:
-
-a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" =  "user id")
-
-This lets you use the virtual files under
-"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for
-each user .
-
-The other option is:
-
-.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
+Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
+CPU bandwidth to task groups.
 
 This uses the /cgroup virtual file system and
 "/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each
-control group instead.
+control group.
 
 For more information on working with control groups, you should read
 Documentation/cgroups/cgroups.txt as well.
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans):
 ===============
 
 There is work in progress to make the scheduling period for each group
-("/sys/kernel/uids/<uid>/cpu_rt_period_us" or
-"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
+("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well.
 
 The constraint on the period is that a subgroup must have a smaller or
 equal period to its parent. But realistically its not very useful _yet_
index a5740fc..a73454a 100644 (file)
@@ -21,8 +21,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
-extern void cpuset_cpus_allowed_locked(struct task_struct *p,
-                                      struct cpumask *mask);
+extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -69,9 +68,6 @@ struct seq_file;
 extern void cpuset_task_status_allowed(struct seq_file *m,
                                        struct task_struct *task);
 
-extern void cpuset_lock(void);
-extern void cpuset_unlock(void);
-
 extern int cpuset_mem_spread_node(void);
 
 static inline int cpuset_do_page_mem_spread(void)
@@ -105,10 +101,11 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
 {
        cpumask_copy(mask, cpu_possible_mask);
 }
-static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
-                                             struct cpumask *mask)
+
+static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
 {
-       cpumask_copy(mask, cpu_possible_mask);
+       cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+       return cpumask_any(cpu_active_mask);
 }
 
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
@@ -157,9 +154,6 @@ static inline void cpuset_task_status_allowed(struct seq_file *m,
 {
 }
 
-static inline void cpuset_lock(void) {}
-static inline void cpuset_unlock(void) {}
-
 static inline int cpuset_mem_spread_node(void)
 {
        return 0;
index dad7f66..dfea405 100644 (file)
@@ -275,11 +275,17 @@ extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 extern int get_nohz_load_balancer(void);
+extern int nohz_ratelimit(int cpu);
 #else
 static inline int select_nohz_load_balancer(int cpu)
 {
        return 0;
 }
+
+static inline int nohz_ratelimit(int cpu)
+{
+       return 0;
+}
 #endif
 
 /*
@@ -954,6 +960,7 @@ struct sched_domain {
        char *name;
 #endif
 
+       unsigned int span_weight;
        /*
         * Span of all CPUs in this domain.
         *
@@ -1026,12 +1033,17 @@ struct sched_domain;
 #define WF_SYNC                0x01            /* waker goes to sleep after wakup */
 #define WF_FORK                0x02            /* child wakeup after fork */
 
+#define ENQUEUE_WAKEUP         1
+#define ENQUEUE_WAKING         2
+#define ENQUEUE_HEAD           4
+
+#define DEQUEUE_SLEEP          1
+
 struct sched_class {
        const struct sched_class *next;
 
-       void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
-                             bool head);
-       void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+       void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+       void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
        void (*yield_task) (struct rq *rq);
 
        void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
@@ -1040,7 +1052,8 @@ struct sched_class {
        void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-       int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+       int  (*select_task_rq)(struct rq *rq, struct task_struct *p,
+                              int sd_flag, int flags);
 
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
        void (*post_schedule) (struct rq *this_rq);
@@ -1077,36 +1090,8 @@ struct load_weight {
        unsigned long weight, inv_weight;
 };
 
-/*
- * CFS stats for a schedulable entity (task, task-group etc)
- *
- * Current field usage histogram:
- *
- *     4 se->block_start
- *     4 se->run_node
- *     4 se->sleep_start
- *     6 se->load.weight
- */
-struct sched_entity {
-       struct load_weight      load;           /* for load-balancing */
-       struct rb_node          run_node;
-       struct list_head        group_node;
-       unsigned int            on_rq;
-
-       u64                     exec_start;
-       u64                     sum_exec_runtime;
-       u64                     vruntime;
-       u64                     prev_sum_exec_runtime;
-
-       u64                     last_wakeup;
-       u64                     avg_overlap;
-
-       u64                     nr_migrations;
-
-       u64                     start_runtime;
-       u64                     avg_wakeup;
-
 #ifdef CONFIG_SCHEDSTATS
+struct sched_statistics {
        u64                     wait_start;
        u64                     wait_max;
        u64                     wait_count;
@@ -1138,6 +1123,24 @@ struct sched_entity {
        u64                     nr_wakeups_affine_attempts;
        u64                     nr_wakeups_passive;
        u64                     nr_wakeups_idle;
+};
+#endif
+
+struct sched_entity {
+       struct load_weight      load;           /* for load-balancing */
+       struct rb_node          run_node;
+       struct list_head        group_node;
+       unsigned int            on_rq;
+
+       u64                     exec_start;
+       u64                     sum_exec_runtime;
+       u64                     vruntime;
+       u64                     prev_sum_exec_runtime;
+
+       u64                     nr_migrations;
+
+#ifdef CONFIG_SCHEDSTATS
+       struct sched_statistics statistics;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1847,6 +1850,7 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
 extern void idle_task_exit(void);
 #else
 static inline void idle_task_exit(void) {}
index eb77e8c..5fe94b8 100644 (file)
@@ -604,8 +604,7 @@ config RT_GROUP_SCHED
        default n
        help
          This feature lets you explicitly allocate real CPU bandwidth
-         to users or control groups (depending on the "Basis for grouping tasks"
-         setting below. If enabled, it will also make it impossible to
+         to task groups. If enabled, it will also make it impossible to
          schedule realtime tasks for non-root users until you allocate
          realtime bandwidth for them.
          See Documentation/scheduler/sched-rt-group.txt for more information.
index 9e4697e..2f05303 100644 (file)
@@ -15,7 +15,6 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
index 25bba73..914aedc 100644 (file)
@@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
+       struct task_struct *caller;
        unsigned long mod;
        void *hcpu;
 };
@@ -172,6 +173,7 @@ struct take_cpu_down_param {
 static int __ref take_cpu_down(void *_param)
 {
        struct take_cpu_down_param *param = _param;
+       unsigned int cpu = (unsigned long)param->hcpu;
        int err;
 
        /* Ensure this CPU doesn't handle any more interrupts. */
@@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param)
        raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
                                param->hcpu);
 
+       if (task_cpu(param->caller) == cpu)
+               move_task_off_dead_cpu(cpu, param->caller);
        /* Force idle task to run as soon as we yield: it should
           immediately notice cpu is offline and die quickly. */
        sched_idle_next();
@@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param)
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
        int err, nr_calls = 0;
-       cpumask_var_t old_allowed;
        void *hcpu = (void *)(long)cpu;
        unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
        struct take_cpu_down_param tcd_param = {
+               .caller = current,
                .mod = mod,
                .hcpu = hcpu,
        };
@@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        if (!cpu_online(cpu))
                return -EINVAL;
 
-       if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
-               return -ENOMEM;
-
        cpu_hotplug_begin();
        set_cpu_active(cpu, false);
        err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
                goto out_release;
        }
 
-       /* Ensure that we are not runnable on dying cpu */
-       cpumask_copy(old_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpu_active_mask);
-
        err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
        if (err) {
                set_cpu_active(cpu, true);
@@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
                                            hcpu) == NOTIFY_BAD)
                        BUG();
 
-               goto out_allowed;
+               goto out_release;
        }
        BUG_ON(cpu_online(cpu));
 
@@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
        check_for_tasks(cpu);
 
-out_allowed:
-       set_cpus_allowed_ptr(current, old_allowed);
 out_release:
        cpu_hotplug_done();
        if (!err) {
@@ -264,7 +259,6 @@ out_release:
                                            hcpu) == NOTIFY_BAD)
                        BUG();
        }
-       free_cpumask_var(old_allowed);
        return err;
 }
 
index d109467..9a50c5f 100644 (file)
@@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void)
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
        mutex_lock(&callback_mutex);
-       cpuset_cpus_allowed_locked(tsk, pmask);
+       task_lock(tsk);
+       guarantee_online_cpus(task_cs(tsk), pmask);
+       task_unlock(tsk);
        mutex_unlock(&callback_mutex);
 }
 
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
-       task_lock(tsk);
-       guarantee_online_cpus(task_cs(tsk), pmask);
-       task_unlock(tsk);
+       const struct cpuset *cs;
+       int cpu;
+
+       rcu_read_lock();
+       cs = task_cs(tsk);
+       if (cs)
+               cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+       rcu_read_unlock();
+
+       /*
+        * We own tsk->cpus_allowed, nobody can change it under us.
+        *
+        * But we used cs && cs->cpus_allowed lockless and thus can
+        * race with cgroup_attach_task() or update_cpumask() and get
+        * the wrong tsk->cpus_allowed. However, both cases imply the
+        * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+        * which takes task_rq_lock().
+        *
+        * If we are called after it dropped the lock we must see all
+        * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
+        * set any mask even if it is not right from task_cs() pov,
+        * the pending set_cpus_allowed_ptr() will fix things.
+        */
+
+       cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+       if (cpu >= nr_cpu_ids) {
+               /*
+                * Either tsk->cpus_allowed is wrong (see above) or it
+                * is actually empty. The latter case is only possible
+                * if we are racing with remove_tasks_in_empty_cpuset().
+                * Like above we can temporary set any mask and rely on
+                * set_cpus_allowed_ptr() as synchronization point.
+                */
+               cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+               cpu = cpumask_any(cpu_active_mask);
+       }
+
+       return cpu;
 }
 
 void cpuset_init_current_mems_allowed(void)
@@ -2383,22 +2416,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 }
 
 /**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset.  Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list.  The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
-       mutex_lock(&callback_mutex);
-}
-
-/**
  * cpuset_unlock - release lock on cpuset changes
  *
  * Undo the lock taken in a previous cpuset_lock() call.
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644 (file)
index 2dc4fc2..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Internal credentials stuff
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-/*
- * user.c
- */
-static inline void sched_switch_user(struct task_struct *p)
-{
-#ifdef CONFIG_USER_SCHED
-       sched_move_task(p);
-#endif /* CONFIG_USER_SCHED */
-}
-
index 62af181..8f3672a 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/init_task.h>
 #include <linux/security.h>
 #include <linux/cn_proc.h>
-#include "cred-internals.h"
 
 #if 0
 #define kdebug(FMT, ...) \
@@ -560,8 +559,6 @@ int commit_creds(struct cred *new)
                atomic_dec(&old->user->processes);
        alter_cred_subscribers(old, -2);
 
-       sched_switch_user(task);
-
        /* send notifications */
        if (new->uid   != old->uid  ||
            new->euid  != old->euid ||
index 7f2683a..eabca5a 100644 (file)
@@ -55,7 +55,6 @@
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
-#include "cred-internals.h"
 
 static void exit_mm(struct task_struct * tsk);
 
index 6af210a..4956ed0 100644 (file)
@@ -493,8 +493,11 @@ struct rq {
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 #ifdef CONFIG_NO_HZ
+       u64 nohz_stamp;
        unsigned char in_nohz_recently;
 #endif
+       unsigned int skip_clock_update;
+
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
        unsigned long nr_load_updates;
@@ -592,6 +595,13 @@ static inline
 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
        rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+
+       /*
+        * A queue event has occurred, and we're going to schedule.  In
+        * this case, we can save a useless back to back clock update.
+        */
+       if (test_tsk_need_resched(p))
+               rq->skip_clock_update = 1;
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -626,7 +636,8 @@ static inline int cpu_of(struct rq *rq)
 
 inline void update_rq_clock(struct rq *rq)
 {
-       rq->clock = sched_clock_cpu(cpu_of(rq));
+       if (!rq->skip_clock_update)
+               rq->clock = sched_clock_cpu(cpu_of(rq));
 }
 
 /*
@@ -904,16 +915,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
- * Check whether the task is waking, we use this to synchronize against
- * ttwu() so that task_cpu() reports a stable number.
- *
- * We need to make an exception for PF_STARTING tasks because the fork
- * path might require task_rq_lock() to work, eg. it can call
- * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ * Check whether the task is waking, we use this to synchronize ->cpus_allowed
+ * against ttwu().
  */
 static inline int task_is_waking(struct task_struct *p)
 {
-       return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+       return unlikely(p->state == TASK_WAKING);
 }
 
 /*
@@ -926,11 +933,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
        struct rq *rq;
 
        for (;;) {
-               while (task_is_waking(p))
-                       cpu_relax();
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_is_waking(p)))
+               if (likely(rq == task_rq(p)))
                        return rq;
                raw_spin_unlock(&rq->lock);
        }
@@ -947,12 +952,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        struct rq *rq;
 
        for (;;) {
-               while (task_is_waking(p))
-                       cpu_relax();
                local_irq_save(*flags);
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_is_waking(p)))
+               if (likely(rq == task_rq(p)))
                        return rq;
                raw_spin_unlock_irqrestore(&rq->lock, *flags);
        }
@@ -1229,6 +1232,17 @@ void wake_up_idle_cpu(int cpu)
        if (!tsk_is_polling(rq->idle))
                smp_send_reschedule(cpu);
 }
+
+int nohz_ratelimit(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       u64 diff = rq->clock - rq->nohz_stamp;
+
+       rq->nohz_stamp = rq->clock;
+
+       return diff < (NSEC_PER_SEC / HZ) >> 1;
+}
+
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
@@ -1771,8 +1785,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
                        raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
                }
        }
-       update_rq_clock(rq1);
-       update_rq_clock(rq2);
 }
 
 /*
@@ -1803,7 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
-static void calc_load_account_active(struct rq *this_rq);
+static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
 
@@ -1860,62 +1872,43 @@ static void set_load_weight(struct task_struct *p)
        p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 }
 
-static void update_avg(u64 *avg, u64 sample)
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
-       s64 diff = sample - *avg;
-       *avg += diff >> 3;
-}
-
-static void
-enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
-{
-       if (wakeup)
-               p->se.start_runtime = p->se.sum_exec_runtime;
-
+       update_rq_clock(rq);
        sched_info_queued(p);
-       p->sched_class->enqueue_task(rq, p, wakeup, head);
+       p->sched_class->enqueue_task(rq, p, flags);
        p->se.on_rq = 1;
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
-       if (sleep) {
-               if (p->se.last_wakeup) {
-                       update_avg(&p->se.avg_overlap,
-                               p->se.sum_exec_runtime - p->se.last_wakeup);
-                       p->se.last_wakeup = 0;
-               } else {
-                       update_avg(&p->se.avg_wakeup,
-                               sysctl_sched_wakeup_granularity);
-               }
-       }
-
+       update_rq_clock(rq);
        sched_info_dequeued(p);
-       p->sched_class->dequeue_task(rq, p, sleep);
+       p->sched_class->dequeue_task(rq, p, flags);
        p->se.on_rq = 0;
 }
 
 /*
  * activate_task - move a task to the runqueue.
  */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible--;
 
-       enqueue_task(rq, p, wakeup, false);
+       enqueue_task(rq, p, flags);
        inc_nr_running(rq);
 }
 
 /*
  * deactivate_task - remove a task from the runqueue.
  */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible++;
 
-       dequeue_task(rq, p, sleep);
+       dequeue_task(rq, p, flags);
        dec_nr_running(rq);
 }
 
@@ -2273,6 +2266,9 @@ void task_oncpu_function_call(struct task_struct *p,
 }
 
 #ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
        int dest_cpu;
@@ -2289,12 +2285,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                return dest_cpu;
 
        /* No more Mr. Nice Guy. */
-       if (dest_cpu >= nr_cpu_ids) {
-               rcu_read_lock();
-               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-               rcu_read_unlock();
-               dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
-
+       if (unlikely(dest_cpu >= nr_cpu_ids)) {
+               dest_cpu = cpuset_cpus_allowed_fallback(p);
                /*
                 * Don't tell them about moving exiting tasks or
                 * kernel threads (both mm NULL), since they never
@@ -2311,17 +2303,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * Gets called from 3 sites (exec, fork, wakeup), since it is called without
- * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
- * by:
- *
- *  exec:           is unstable, retry loop
- *  fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
+ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
 {
-       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+       int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
 
        /*
         * In order not to call set_task_cpu() on a blocking task we need
@@ -2339,6 +2326,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 
        return cpu;
 }
+
+static void update_avg(u64 *avg, u64 sample)
+{
+       s64 diff = sample - *avg;
+       *avg += diff >> 3;
+}
 #endif
 
 /***
@@ -2360,16 +2353,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 {
        int cpu, orig_cpu, this_cpu, success = 0;
        unsigned long flags;
+       unsigned long en_flags = ENQUEUE_WAKEUP;
        struct rq *rq;
 
-       if (!sched_feat(SYNC_WAKEUPS))
-               wake_flags &= ~WF_SYNC;
-
        this_cpu = get_cpu();
 
        smp_wmb();
        rq = task_rq_lock(p, &flags);
-       update_rq_clock(rq);
        if (!(p->state & state))
                goto out;
 
@@ -2389,28 +2379,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         *
         * First fix up the nr_uninterruptible count:
         */
-       if (task_contributes_to_load(p))
-               rq->nr_uninterruptible--;
+       if (task_contributes_to_load(p)) {
+               if (likely(cpu_online(orig_cpu)))
+                       rq->nr_uninterruptible--;
+               else
+                       this_rq()->nr_uninterruptible--;
+       }
        p->state = TASK_WAKING;
 
-       if (p->sched_class->task_waking)
+       if (p->sched_class->task_waking) {
                p->sched_class->task_waking(rq, p);
+               en_flags |= ENQUEUE_WAKING;
+       }
 
-       __task_rq_unlock(rq);
-
-       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-       if (cpu != orig_cpu) {
-               /*
-                * Since we migrate the task without holding any rq->lock,
-                * we need to be careful with task_rq_lock(), since that
-                * might end up locking an invalid rq.
-                */
+       cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+       if (cpu != orig_cpu)
                set_task_cpu(p, cpu);
-       }
+       __task_rq_unlock(rq);
 
        rq = cpu_rq(cpu);
        raw_spin_lock(&rq->lock);
-       update_rq_clock(rq);
 
        /*
         * We migrated the task without holding either rq->lock, however
@@ -2438,34 +2426,18 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 
 out_activate:
 #endif /* CONFIG_SMP */
-       schedstat_inc(p, se.nr_wakeups);
+       schedstat_inc(p, se.statistics.nr_wakeups);
        if (wake_flags & WF_SYNC)
-               schedstat_inc(p, se.nr_wakeups_sync);
+               schedstat_inc(p, se.statistics.nr_wakeups_sync);
        if (orig_cpu != cpu)
-               schedstat_inc(p, se.nr_wakeups_migrate);
+               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
        if (cpu == this_cpu)
-               schedstat_inc(p, se.nr_wakeups_local);
+               schedstat_inc(p, se.statistics.nr_wakeups_local);
        else
-               schedstat_inc(p, se.nr_wakeups_remote);
-       activate_task(rq, p, 1);
+               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+       activate_task(rq, p, en_flags);
        success = 1;
 
-       /*
-        * Only attribute actual wakeups done by this task.
-        */
-       if (!in_interrupt()) {
-               struct sched_entity *se = &current->se;
-               u64 sample = se->sum_exec_runtime;
-
-               if (se->last_wakeup)
-                       sample -= se->last_wakeup;
-               else
-                       sample -= se->start_runtime;
-               update_avg(&se->avg_wakeup, sample);
-
-               se->last_wakeup = se->sum_exec_runtime;
-       }
-
 out_running:
        trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, wake_flags);
@@ -2527,42 +2499,9 @@ static void __sched_fork(struct task_struct *p)
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
-       p->se.last_wakeup               = 0;
-       p->se.avg_overlap               = 0;
-       p->se.start_runtime             = 0;
-       p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
-       p->se.wait_start                        = 0;
-       p->se.wait_max                          = 0;
-       p->se.wait_count                        = 0;
-       p->se.wait_sum                          = 0;
-
-       p->se.sleep_start                       = 0;
-       p->se.sleep_max                         = 0;
-       p->se.sum_sleep_runtime                 = 0;
-
-       p->se.block_start                       = 0;
-       p->se.block_max                         = 0;
-       p->se.exec_max                          = 0;
-       p->se.slice_max                         = 0;
-
-       p->se.nr_migrations_cold                = 0;
-       p->se.nr_failed_migrations_affine       = 0;
-       p->se.nr_failed_migrations_running      = 0;
-       p->se.nr_failed_migrations_hot          = 0;
-       p->se.nr_forced_migrations              = 0;
-
-       p->se.nr_wakeups                        = 0;
-       p->se.nr_wakeups_sync                   = 0;
-       p->se.nr_wakeups_migrate                = 0;
-       p->se.nr_wakeups_local                  = 0;
-       p->se.nr_wakeups_remote                 = 0;
-       p->se.nr_wakeups_affine                 = 0;
-       p->se.nr_wakeups_affine_attempts        = 0;
-       p->se.nr_wakeups_passive                = 0;
-       p->se.nr_wakeups_idle                   = 0;
-
+       memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
        INIT_LIST_HEAD(&p->rt.run_list);
@@ -2583,11 +2522,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
        __sched_fork(p);
        /*
-        * We mark the process as waking here. This guarantees that
+        * We mark the process as running here. This guarantees that
         * nobody will actually run it, and a signal or other external
         * event cannot wake it up and insert it on the runqueue either.
         */
-       p->state = TASK_WAKING;
+       p->state = TASK_RUNNING;
 
        /*
         * Revert to default priority/policy on fork if requested.
@@ -2654,29 +2593,25 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
        int cpu __maybe_unused = get_cpu();
 
 #ifdef CONFIG_SMP
+       rq = task_rq_lock(p, &flags);
+       p->state = TASK_WAKING;
+
        /*
         * Fork balancing, do it here and not earlier because:
         *  - cpus_allowed can change in the fork path
         *  - any previously selected cpu might disappear through hotplug
         *
-        * We still have TASK_WAKING but PF_STARTING is gone now, meaning
-        * ->cpus_allowed is stable, we have preemption disabled, meaning
-        * cpu_online_mask is stable.
+        * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+        * without people poking at ->cpus_allowed.
         */
-       cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+       cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
        set_task_cpu(p, cpu);
-#endif
 
-       /*
-        * Since the task is not on the rq and we still have TASK_WAKING set
-        * nobody else will migrate this task.
-        */
-       rq = cpu_rq(cpu);
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       BUG_ON(p->state != TASK_WAKING);
        p->state = TASK_RUNNING;
-       update_rq_clock(rq);
+       task_rq_unlock(rq, &flags);
+#endif
+
+       rq = task_rq_lock(p, &flags);
        activate_task(rq, p, 0);
        trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, WF_FORK);
@@ -3015,6 +2950,61 @@ static unsigned long calc_load_update;
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun);
 
+static long calc_load_fold_active(struct rq *this_rq)
+{
+       long nr_active, delta = 0;
+
+       nr_active = this_rq->nr_running;
+       nr_active += (long) this_rq->nr_uninterruptible;
+
+       if (nr_active != this_rq->calc_load_active) {
+               delta = nr_active - this_rq->calc_load_active;
+               this_rq->calc_load_active = nr_active;
+       }
+
+       return delta;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+
+static void calc_load_account_idle(struct rq *this_rq)
+{
+       long delta;
+
+       delta = calc_load_fold_active(this_rq);
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks_idle);
+}
+
+static long calc_load_fold_idle(void)
+{
+       long delta = 0;
+
+       /*
+        * Its got a race, we don't care...
+        */
+       if (atomic_long_read(&calc_load_tasks_idle))
+               delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+       return delta;
+}
+#else
+static void calc_load_account_idle(struct rq *this_rq)
+{
+}
+
+static inline long calc_load_fold_idle(void)
+{
+       return 0;
+}
+#endif
+
 /**
  * get_avenrun - get the load average array
  * @loads:     pointer to dest load array
@@ -3061,20 +3051,22 @@ void calc_global_load(void)
 }
 
 /*
- * Either called from update_cpu_load() or from a cpu going idle
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
  */
 static void calc_load_account_active(struct rq *this_rq)
 {
-       long nr_active, delta;
+       long delta;
 
-       nr_active = this_rq->nr_running;
-       nr_active += (long) this_rq->nr_uninterruptible;
+       if (time_before(jiffies, this_rq->calc_load_update))
+               return;
 
-       if (nr_active != this_rq->calc_load_active) {
-               delta = nr_active - this_rq->calc_load_active;
-               this_rq->calc_load_active = nr_active;
+       delta  = calc_load_fold_active(this_rq);
+       delta += calc_load_fold_idle();
+       if (delta)
                atomic_long_add(delta, &calc_load_tasks);
-       }
+
+       this_rq->calc_load_update += LOAD_FREQ;
 }
 
 /*
@@ -3106,10 +3098,7 @@ static void update_cpu_load(struct rq *this_rq)
                this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
        }
 
-       if (time_after_eq(jiffies, this_rq->calc_load_update)) {
-               this_rq->calc_load_update += LOAD_FREQ;
-               calc_load_account_active(this_rq);
-       }
+       calc_load_account_active(this_rq);
 }
 
 #ifdef CONFIG_SMP
@@ -3122,32 +3111,21 @@ void sched_exec(void)
 {
        struct task_struct *p = current;
        struct migration_req req;
-       int dest_cpu, this_cpu;
        unsigned long flags;
        struct rq *rq;
-
-again:
-       this_cpu = get_cpu();
-       dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
-       if (dest_cpu == this_cpu) {
-               put_cpu();
-               return;
-       }
+       int dest_cpu;
 
        rq = task_rq_lock(p, &flags);
-       put_cpu();
+       dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+       if (dest_cpu == smp_processor_id())
+               goto unlock;
 
        /*
         * select_task_rq() can race against ->cpus_allowed
         */
-       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
-           || unlikely(!cpu_active(dest_cpu))) {
-               task_rq_unlock(rq, &flags);
-               goto again;
-       }
-
-       /* force the process onto the specified CPU */
-       if (migrate_task(p, dest_cpu, &req)) {
+       if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
+           likely(cpu_active(dest_cpu)) &&
+           migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
                struct task_struct *mt = rq->migration_thread;
 
@@ -3159,6 +3137,7 @@ again:
 
                return;
        }
+unlock:
        task_rq_unlock(rq, &flags);
 }
 
@@ -3630,23 +3609,9 @@ static inline void schedule_debug(struct task_struct *prev)
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-       if (prev->state == TASK_RUNNING) {
-               u64 runtime = prev->se.sum_exec_runtime;
-
-               runtime -= prev->se.prev_sum_exec_runtime;
-               runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-
-               /*
-                * In order to avoid avg_overlap growing stale when we are
-                * indeed overlapping and hence not getting put to sleep, grow
-                * the avg_overlap on preemption.
-                *
-                * We use the average preemption runtime because that
-                * correlates to the amount of cache footprint a task can
-                * build up.
-                */
-               update_avg(&prev->se.avg_overlap, runtime);
-       }
+       if (prev->se.on_rq)
+               update_rq_clock(rq);
+       rq->skip_clock_update = 0;
        prev->sched_class->put_prev_task(rq, prev);
 }
 
@@ -3709,14 +3674,13 @@ need_resched_nonpreemptible:
                hrtick_clear(rq);
 
        raw_spin_lock_irq(&rq->lock);
-       update_rq_clock(rq);
        clear_tsk_need_resched(prev);
 
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely(signal_pending_state(prev->state, prev)))
                        prev->state = TASK_RUNNING;
                else
-                       deactivate_task(rq, prev, 1);
+                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
                switch_count = &prev->nvcsw;
        }
 
@@ -4266,7 +4230,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        BUG_ON(prio < 0 || prio > MAX_PRIO);
 
        rq = task_rq_lock(p, &flags);
-       update_rq_clock(rq);
 
        oldprio = p->prio;
        prev_class = p->sched_class;
@@ -4287,7 +4250,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (on_rq) {
-               enqueue_task(rq, p, 0, oldprio < prio);
+               enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
                check_class_changed(rq, p, prev_class, oldprio, running);
        }
@@ -4309,7 +4272,6 @@ void set_user_nice(struct task_struct *p, long nice)
         * the task might be in the middle of scheduling on another CPU.
         */
        rq = task_rq_lock(p, &flags);
-       update_rq_clock(rq);
        /*
         * The RT priorities are set via sched_setscheduler(), but we still
         * allow the 'normal' nice value to be set - but as expected
@@ -4331,7 +4293,7 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
 
        if (on_rq) {
-               enqueue_task(rq, p, 0, false);
+               enqueue_task(rq, p, 0);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -4592,7 +4554,6 @@ recheck:
                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
                goto recheck;
        }
-       update_rq_clock(rq);
        on_rq = p->se.on_rq;
        running = task_current(rq, p);
        if (on_rq)
@@ -5358,7 +5319,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        struct rq *rq;
        int ret = 0;
 
+       /*
+        * Serialize against TASK_WAKING so that ttwu() and wunt() can
+        * drop the rq->lock and still rely on ->cpus_allowed.
+        */
+again:
+       while (task_is_waking(p))
+               cpu_relax();
        rq = task_rq_lock(p, &flags);
+       if (task_is_waking(p)) {
+               task_rq_unlock(rq, &flags);
+               goto again;
+       }
 
        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
@@ -5516,30 +5488,29 @@ static int migration_thread(void *data)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-
-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-       int ret;
-
-       local_irq_disable();
-       ret = __migrate_task(p, src_cpu, dest_cpu);
-       local_irq_enable();
-       return ret;
-}
-
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
  */
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
-       int dest_cpu;
+       struct rq *rq = cpu_rq(dead_cpu);
+       int needs_cpu, uninitialized_var(dest_cpu);
+       unsigned long flags;
 
-again:
-       dest_cpu = select_fallback_rq(dead_cpu, p);
+       local_irq_save(flags);
 
-       /* It can have affinity changed while we were choosing. */
-       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
-               goto again;
+       raw_spin_lock(&rq->lock);
+       needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
+       if (needs_cpu)
+               dest_cpu = select_fallback_rq(dead_cpu, p);
+       raw_spin_unlock(&rq->lock);
+       /*
+        * It can only fail if we race with set_cpus_allowed(),
+        * in the racer should migrate the task anyway.
+        */
+       if (needs_cpu)
+               __migrate_task(p, dead_cpu, dest_cpu);
+       local_irq_restore(flags);
 }
 
 /*
@@ -5603,7 +5574,6 @@ void sched_idle_next(void)
 
        __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
 
-       update_rq_clock(rq);
        activate_task(rq, p, 0);
 
        raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5658,7 +5628,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
        for ( ; ; ) {
                if (!rq->nr_running)
                        break;
-               update_rq_clock(rq);
                next = pick_next_task(rq);
                if (!next)
                        break;
@@ -5934,7 +5903,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-               cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
                migrate_live_tasks(cpu);
                rq = cpu_rq(cpu);
                kthread_stop(rq->migration_thread);
@@ -5942,13 +5910,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                rq->migration_thread = NULL;
                /* Idle task back to normal (off runqueue, low prio) */
                raw_spin_lock_irq(&rq->lock);
-               update_rq_clock(rq);
                deactivate_task(rq, rq->idle, 0);
                __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
                rq->idle->sched_class = &idle_sched_class;
                migrate_dead_tasks(cpu);
                raw_spin_unlock_irq(&rq->lock);
-               cpuset_unlock();
                migrate_nr_uninterruptible(rq);
                BUG_ON(rq->nr_running != 0);
                calc_global_load_remove(rq);
@@ -6305,6 +6271,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        struct rq *rq = cpu_rq(cpu);
        struct sched_domain *tmp;
 
+       for (tmp = sd; tmp; tmp = tmp->parent)
+               tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
+
        /* Remove the sched domains which do not contribute to scheduling. */
        for (tmp = sd; tmp; ) {
                struct sched_domain *parent = tmp->parent;
@@ -7892,7 +7861,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 {
        int on_rq;
 
-       update_rq_clock(rq);
        on_rq = p->se.on_rq;
        if (on_rq)
                deactivate_task(rq, p, 0);
@@ -7919,9 +7887,9 @@ void normalize_rt_tasks(void)
 
                p->se.exec_start                = 0;
 #ifdef CONFIG_SCHEDSTATS
-               p->se.wait_start                = 0;
-               p->se.sleep_start               = 0;
-               p->se.block_start               = 0;
+               p->se.statistics.wait_start     = 0;
+               p->se.statistics.sleep_start    = 0;
+               p->se.statistics.block_start    = 0;
 #endif
 
                if (!rt_task(p)) {
@@ -8254,8 +8222,6 @@ void sched_move_task(struct task_struct *tsk)
 
        rq = task_rq_lock(tsk, &flags);
 
-       update_rq_clock(rq);
-
        running = task_current(rq, tsk);
        on_rq = tsk->se.on_rq;
 
@@ -8274,7 +8240,7 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (on_rq)
-               enqueue_task(rq, tsk, 0, false);
+               enqueue_task(rq, tsk, 0);
 
        task_rq_unlock(rq, &flags);
 }
index 9b49db1..9cf1baf 100644 (file)
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
        PN(se->vruntime);
        PN(se->sum_exec_runtime);
 #ifdef CONFIG_SCHEDSTATS
-       PN(se->wait_start);
-       PN(se->sleep_start);
-       PN(se->block_start);
-       PN(se->sleep_max);
-       PN(se->block_max);
-       PN(se->exec_max);
-       PN(se->slice_max);
-       PN(se->wait_max);
-       PN(se->wait_sum);
-       P(se->wait_count);
+       PN(se->statistics.wait_start);
+       PN(se->statistics.sleep_start);
+       PN(se->statistics.block_start);
+       PN(se->statistics.sleep_max);
+       PN(se->statistics.block_max);
+       PN(se->statistics.exec_max);
+       PN(se->statistics.slice_max);
+       PN(se->statistics.wait_max);
+       PN(se->statistics.wait_sum);
+       P(se->statistics.wait_count);
 #endif
        P(se->load.weight);
 #undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
        SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
                SPLIT_NS(p->se.vruntime),
                SPLIT_NS(p->se.sum_exec_runtime),
-               SPLIT_NS(p->se.sum_sleep_runtime));
+               SPLIT_NS(p->se.statistics.sum_sleep_runtime));
 #else
        SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
                0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -173,11 +173,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
        task_group_path(tg, path, sizeof(path));
 
        SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-       {
-               uid_t uid = cfs_rq->tg->uid;
-               SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
-       }
 #else
        SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
@@ -407,40 +402,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        PN(se.exec_start);
        PN(se.vruntime);
        PN(se.sum_exec_runtime);
-       PN(se.avg_overlap);
-       PN(se.avg_wakeup);
 
        nr_switches = p->nvcsw + p->nivcsw;
 
 #ifdef CONFIG_SCHEDSTATS
-       PN(se.wait_start);
-       PN(se.sleep_start);
-       PN(se.block_start);
-       PN(se.sleep_max);
-       PN(se.block_max);
-       PN(se.exec_max);
-       PN(se.slice_max);
-       PN(se.wait_max);
-       PN(se.wait_sum);
-       P(se.wait_count);
-       PN(se.iowait_sum);
-       P(se.iowait_count);
+       PN(se.statistics.wait_start);
+       PN(se.statistics.sleep_start);
+       PN(se.statistics.block_start);
+       PN(se.statistics.sleep_max);
+       PN(se.statistics.block_max);
+       PN(se.statistics.exec_max);
+       PN(se.statistics.slice_max);
+       PN(se.statistics.wait_max);
+       PN(se.statistics.wait_sum);
+       P(se.statistics.wait_count);
+       PN(se.statistics.iowait_sum);
+       P(se.statistics.iowait_count);
        P(sched_info.bkl_count);
        P(se.nr_migrations);
-       P(se.nr_migrations_cold);
-       P(se.nr_failed_migrations_affine);
-       P(se.nr_failed_migrations_running);
-       P(se.nr_failed_migrations_hot);
-       P(se.nr_forced_migrations);
-       P(se.nr_wakeups);
-       P(se.nr_wakeups_sync);
-       P(se.nr_wakeups_migrate);
-       P(se.nr_wakeups_local);
-       P(se.nr_wakeups_remote);
-       P(se.nr_wakeups_affine);
-       P(se.nr_wakeups_affine_attempts);
-       P(se.nr_wakeups_passive);
-       P(se.nr_wakeups_idle);
+       P(se.statistics.nr_migrations_cold);
+       P(se.statistics.nr_failed_migrations_affine);
+       P(se.statistics.nr_failed_migrations_running);
+       P(se.statistics.nr_failed_migrations_hot);
+       P(se.statistics.nr_forced_migrations);
+       P(se.statistics.nr_wakeups);
+       P(se.statistics.nr_wakeups_sync);
+       P(se.statistics.nr_wakeups_migrate);
+       P(se.statistics.nr_wakeups_local);
+       P(se.statistics.nr_wakeups_remote);
+       P(se.statistics.nr_wakeups_affine);
+       P(se.statistics.nr_wakeups_affine_attempts);
+       P(se.statistics.nr_wakeups_passive);
+       P(se.statistics.nr_wakeups_idle);
 
        {
                u64 avg_atom, avg_per_cpu;
@@ -491,31 +484,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 void proc_sched_set_task(struct task_struct *p)
 {
 #ifdef CONFIG_SCHEDSTATS
-       p->se.wait_max                          = 0;
-       p->se.wait_sum                          = 0;
-       p->se.wait_count                        = 0;
-       p->se.iowait_sum                        = 0;
-       p->se.iowait_count                      = 0;
-       p->se.sleep_max                         = 0;
-       p->se.sum_sleep_runtime                 = 0;
-       p->se.block_max                         = 0;
-       p->se.exec_max                          = 0;
-       p->se.slice_max                         = 0;
-       p->se.nr_migrations                     = 0;
-       p->se.nr_migrations_cold                = 0;
-       p->se.nr_failed_migrations_affine       = 0;
-       p->se.nr_failed_migrations_running      = 0;
-       p->se.nr_failed_migrations_hot          = 0;
-       p->se.nr_forced_migrations              = 0;
-       p->se.nr_wakeups                        = 0;
-       p->se.nr_wakeups_sync                   = 0;
-       p->se.nr_wakeups_migrate                = 0;
-       p->se.nr_wakeups_local                  = 0;
-       p->se.nr_wakeups_remote                 = 0;
-       p->se.nr_wakeups_affine                 = 0;
-       p->se.nr_wakeups_affine_attempts        = 0;
-       p->se.nr_wakeups_passive                = 0;
-       p->se.nr_wakeups_idle                   = 0;
-       p->sched_info.bkl_count                 = 0;
+       memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 }
index 5a5ea2c..cbd8b8a 100644 (file)
@@ -35,8 +35,8 @@
  * (to see the precise effective timeslice length of your workload,
  *  run vmstat and monitor the context-switches (cs) field)
  */
-unsigned int sysctl_sched_latency = 5000000ULL;
-unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 
 /*
  * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 1000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
+unsigned int sysctl_sched_min_granularity = 2000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  */
-static unsigned int sched_nr_latency = 5;
+static unsigned int sched_nr_latency = 3;
 
 /*
  * After fork, child runs first. If set to 0 (default) then
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 {
        unsigned long delta_exec_weighted;
 
-       schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+       schedstat_set(curr->statistics.exec_max,
+                     max((u64)delta_exec, curr->statistics.exec_max));
 
        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+       schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
 }
 
 /*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       schedstat_set(se->wait_max, max(se->wait_max,
-                       rq_of(cfs_rq)->clock - se->wait_start));
-       schedstat_set(se->wait_count, se->wait_count + 1);
-       schedstat_set(se->wait_sum, se->wait_sum +
-                       rq_of(cfs_rq)->clock - se->wait_start);
+       schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+                       rq_of(cfs_rq)->clock - se->statistics.wait_start));
+       schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+       schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
 #ifdef CONFIG_SCHEDSTATS
        if (entity_is_task(se)) {
                trace_sched_stat_wait(task_of(se),
-                       rq_of(cfs_rq)->clock - se->wait_start);
+                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
        }
 #endif
-       schedstat_set(se->wait_start, 0);
+       schedstat_set(se->statistics.wait_start, 0);
 }
 
 static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
        if (entity_is_task(se))
                tsk = task_of(se);
 
-       if (se->sleep_start) {
-               u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+       if (se->statistics.sleep_start) {
+               u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
 
                if ((s64)delta < 0)
                        delta = 0;
 
-               if (unlikely(delta > se->sleep_max))
-                       se->sleep_max = delta;
+               if (unlikely(delta > se->statistics.sleep_max))
+                       se->statistics.sleep_max = delta;
 
-               se->sleep_start = 0;
-               se->sum_sleep_runtime += delta;
+               se->statistics.sleep_start = 0;
+               se->statistics.sum_sleep_runtime += delta;
 
                if (tsk) {
                        account_scheduler_latency(tsk, delta >> 10, 1);
                        trace_sched_stat_sleep(tsk, delta);
                }
        }
-       if (se->block_start) {
-               u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+       if (se->statistics.block_start) {
+               u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
 
                if ((s64)delta < 0)
                        delta = 0;
 
-               if (unlikely(delta > se->block_max))
-                       se->block_max = delta;
+               if (unlikely(delta > se->statistics.block_max))
+                       se->statistics.block_max = delta;
 
-               se->block_start = 0;
-               se->sum_sleep_runtime += delta;
+               se->statistics.block_start = 0;
+               se->statistics.sum_sleep_runtime += delta;
 
                if (tsk) {
                        if (tsk->in_iowait) {
-                               se->iowait_sum += delta;
-                               se->iowait_count++;
+                               se->statistics.iowait_sum += delta;
+                               se->statistics.iowait_count++;
                                trace_sched_stat_iowait(tsk, delta);
                        }
 
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
                vruntime += sched_vslice(cfs_rq, se);
 
        /* sleeps up to a single latency don't count. */
-       if (!initial && sched_feat(FAIR_SLEEPERS)) {
+       if (!initial) {
                unsigned long thresh = sysctl_sched_latency;
 
                /*
-                * Convert the sleeper threshold into virtual time.
-                * SCHED_IDLE is a special sub-class.  We care about
-                * fairness only relative to other SCHED_IDLE tasks,
-                * all of which have the same weight.
-                */
-               if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
-                                task_of(se)->policy != SCHED_IDLE))
-                       thresh = calc_delta_fair(thresh, se);
-
-               /*
                 * Halve their sleep time's effect, to allow
                 * for a gentler effect of sleepers:
                 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
        se->vruntime = vruntime;
 }
 
-#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_MIGRATE 2
-
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * Update the normalized vruntime before updating min_vruntime
         * through callig update_curr().
         */
-       if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+       if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
                se->vruntime += cfs_rq->min_vruntime;
 
        /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
        /*
         * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
        update_curr(cfs_rq);
 
        update_stats_dequeue(cfs_rq, se);
-       if (sleep) {
+       if (flags & DEQUEUE_SLEEP) {
 #ifdef CONFIG_SCHEDSTATS
                if (entity_is_task(se)) {
                        struct task_struct *tsk = task_of(se);
 
                        if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->sleep_start = rq_of(cfs_rq)->clock;
+                               se->statistics.sleep_start = rq_of(cfs_rq)->clock;
                        if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->block_start = rq_of(cfs_rq)->clock;
+                               se->statistics.block_start = rq_of(cfs_rq)->clock;
                }
 #endif
        }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
         * update can refer to the ->curr item and we need to reflect this
         * movement in our normalized position.
         */
-       if (!sleep)
+       if (!(flags & DEQUEUE_SLEEP))
                se->vruntime -= cfs_rq->min_vruntime;
 }
 
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         * when there are only lesser-weight tasks around):
         */
        if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
-               se->slice_max = max(se->slice_max,
+               se->statistics.slice_max = max(se->statistics.slice_max,
                        se->sum_exec_runtime - se->prev_sum_exec_runtime);
        }
 #endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
  * then put the task into the rbtree:
  */
 static void
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
-       int flags = 0;
-
-       if (wakeup)
-               flags |= ENQUEUE_WAKEUP;
-       if (p->state == TASK_WAKING)
-               flags |= ENQUEUE_MIGRATE;
 
        for_each_sched_entity(se) {
                if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
  * decreased. We remove the task from the rbtree and
  * update the fair scheduling stats:
  */
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
 
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
-               dequeue_entity(cfs_rq, se, sleep);
+               dequeue_entity(cfs_rq, se, flags);
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight)
                        break;
-               sleep = 1;
+               flags |= DEQUEUE_SLEEP;
        }
 
        hrtick_update(rq);
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-       struct task_struct *curr = current;
        unsigned long this_load, load;
        int idx, this_cpu, prev_cpu;
        unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
        load      = source_load(prev_cpu, idx);
        this_load = target_load(this_cpu, idx);
 
-       if (sync) {
-              if (sched_feat(SYNC_LESS) &&
-                  (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-                   p->se.avg_overlap > sysctl_sched_migration_cost))
-                      sync = 0;
-       } else {
-               if (sched_feat(SYNC_MORE) &&
-                   (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-                    p->se.avg_overlap < sysctl_sched_migration_cost))
-                       sync = 1;
-       }
-
        /*
         * If sync wakeup then subtract the (maximum possible)
         * effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
        if (sync && balanced)
                return 1;
 
-       schedstat_inc(p, se.nr_wakeups_affine_attempts);
+       schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
        tl_per_task = cpu_avg_load_per_task(this_cpu);
 
        if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
                 * there is no bad imbalance.
                 */
                schedstat_inc(sd, ttwu_move_affine);
-               schedstat_inc(p, se.nr_wakeups_affine);
+               schedstat_inc(p, se.statistics.nr_wakeups_affine);
 
                return 1;
        }
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int
-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_sibling(struct task_struct *p, int target)
 {
        int cpu = smp_processor_id();
        int prev_cpu = task_cpu(p);
+       struct sched_domain *sd;
        int i;
 
        /*
-        * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
-        * test in select_task_rq_fair) and the prev_cpu is idle then that's
-        * always a better target than the current cpu.
+        * If the task is going to be woken-up on this cpu and if it is
+        * already idle, then it is the right target.
+        */
+       if (target == cpu && idle_cpu(cpu))
+               return cpu;
+
+       /*
+        * If the task is going to be woken-up on the cpu where it previously
+        * ran and if it is currently idle, then it the right target.
         */
-       if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+       if (target == prev_cpu && idle_cpu(prev_cpu))
                return prev_cpu;
 
        /*
-        * Otherwise, iterate the domain and find an elegible idle cpu.
+        * Otherwise, iterate the domains and find an elegible idle cpu.
         */
-       for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
-               if (!cpu_rq(i)->cfs.nr_running) {
-                       target = i;
+       for_each_domain(target, sd) {
+               if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
                        break;
+
+               for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+                       if (idle_cpu(i)) {
+                               target = i;
+                               break;
+                       }
                }
+
+               /*
+                * Lets stop looking for an idle sibling when we reached
+                * the domain that spans the current cpu and prev_cpu.
+                */
+               if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+                       break;
        }
 
        return target;
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+static int
+select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
 {
        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
        int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
        int sync = wake_flags & WF_SYNC;
 
        if (sd_flag & SD_BALANCE_WAKE) {
-               if (sched_feat(AFFINE_WAKEUPS) &&
-                   cpumask_test_cpu(cpu, &p->cpus_allowed))
+               if (cpumask_test_cpu(cpu, &p->cpus_allowed))
                        want_affine = 1;
                new_cpu = prev_cpu;
        }
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                }
 
                /*
-                * While iterating the domains looking for a spanning
-                * WAKE_AFFINE domain, adjust the affine target to any idle cpu
-                * in cache sharing domains along the way.
+                * If both cpu and prev_cpu are part of this domain,
+                * cpu is a valid SD_WAKE_AFFINE target.
                 */
-               if (want_affine) {
-                       int target = -1;
-
-                       /*
-                        * If both cpu and prev_cpu are part of this domain,
-                        * cpu is a valid SD_WAKE_AFFINE target.
-                        */
-                       if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
-                               target = cpu;
-
-                       /*
-                        * If there's an idle sibling in this domain, make that
-                        * the wake_affine target instead of the current cpu.
-                        */
-                       if (tmp->flags & SD_SHARE_PKG_RESOURCES)
-                               target = select_idle_sibling(p, tmp, target);
-
-                       if (target >= 0) {
-                               if (tmp->flags & SD_WAKE_AFFINE) {
-                                       affine_sd = tmp;
-                                       want_affine = 0;
-                               }
-                               cpu = target;
-                       }
+               if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+                       affine_sd = tmp;
+                       want_affine = 0;
                }
 
                if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                        sd = tmp;
        }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
        if (sched_feat(LB_SHARES_UPDATE)) {
                /*
                 * Pick the largest domain to update shares over
                 */
                tmp = sd;
-               if (affine_sd && (!tmp ||
-                                 cpumask_weight(sched_domain_span(affine_sd)) >
-                                 cpumask_weight(sched_domain_span(sd))))
+               if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
                        tmp = affine_sd;
 
-               if (tmp)
+               if (tmp) {
+                       raw_spin_unlock(&rq->lock);
                        update_shares(tmp);
+                       raw_spin_lock(&rq->lock);
+               }
        }
+#endif
 
-       if (affine_sd && wake_affine(affine_sd, p, sync))
-               return cpu;
+       if (affine_sd) {
+               if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+                       return select_idle_sibling(p, cpu);
+               else
+                       return select_idle_sibling(p, prev_cpu);
+       }
 
        while (sd) {
                int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 
                /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
-               weight = cpumask_weight(sched_domain_span(sd));
+               weight = sd->span_weight;
                sd = NULL;
                for_each_domain(cpu, tmp) {
-                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
+                       if (weight <= tmp->span_weight)
                                break;
                        if (tmp->flags & sd_flag)
                                sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 }
 #endif /* CONFIG_SMP */
 
-/*
- * Adaptive granularity
- *
- * se->avg_wakeup gives the average time a task runs until it does a wakeup,
- * with the limit of wakeup_gran -- when it never does a wakeup.
- *
- * So the smaller avg_wakeup is the faster we want this task to preempt,
- * but we don't want to treat the preemptee unfairly and therefore allow it
- * to run for at least the amount of time we'd like to run.
- *
- * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
- *
- * NOTE: we use *nr_running to scale with load, this nicely matches the
- *       degrading latency on load.
- */
-static unsigned long
-adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
-{
-       u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-       u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
-       u64 gran = 0;
-
-       if (this_run < expected_wakeup)
-               gran = expected_wakeup - this_run;
-
-       return min_t(s64, gran, sysctl_sched_wakeup_granularity);
-}
-
 static unsigned long
 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
 {
        unsigned long gran = sysctl_sched_wakeup_granularity;
 
-       if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
-               gran = adaptive_gran(curr, se);
-
        /*
         * Since its curr running now, convert the gran from real-time
         * to virtual-time in his units.
+        *
+        * By using 'se' instead of 'curr' we penalize light tasks, so
+        * they get preempted easier. That is, if 'se' < 'curr' then
+        * the resulting gran will be larger, therefore penalizing the
+        * lighter, if otoh 'se' > 'curr' then the resulting gran will
+        * be smaller, again penalizing the lighter task.
+        *
+        * This is especially important for buddies when the leftmost
+        * task is higher priority than the buddy.
         */
-       if (sched_feat(ASYM_GRAN)) {
-               /*
-                * By using 'se' instead of 'curr' we penalize light tasks, so
-                * they get preempted easier. That is, if 'se' < 'curr' then
-                * the resulting gran will be larger, therefore penalizing the
-                * lighter, if otoh 'se' > 'curr' then the resulting gran will
-                * be smaller, again penalizing the lighter task.
-                *
-                * This is especially important for buddies when the leftmost
-                * task is higher priority than the buddy.
-                */
-               if (unlikely(se->load.weight != NICE_0_LOAD))
-                       gran = calc_delta_fair(gran, se);
-       } else {
-               if (unlikely(curr->load.weight != NICE_0_LOAD))
-                       gran = calc_delta_fair(gran, curr);
-       }
+       if (unlikely(se->load.weight != NICE_0_LOAD))
+               gran = calc_delta_fair(gran, se);
 
        return gran;
 }
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        struct task_struct *curr = rq->curr;
        struct sched_entity *se = &curr->se, *pse = &p->se;
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-       int sync = wake_flags & WF_SYNC;
        int scale = cfs_rq->nr_running >= sched_nr_latency;
 
        if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        if (unlikely(curr->policy == SCHED_IDLE))
                goto preempt;
 
-       if (sched_feat(WAKEUP_SYNC) && sync)
-               goto preempt;
-
-       if (sched_feat(WAKEUP_OVERLAP) &&
-                       se->avg_overlap < sysctl_sched_migration_cost &&
-                       pse->avg_overlap < sysctl_sched_migration_cost)
-               goto preempt;
-
        if (!sched_feat(WAKEUP_PREEMPT))
                return;
 
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         * 3) are cache-hot on their current CPU.
         */
        if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
-               schedstat_inc(p, se.nr_failed_migrations_affine);
+               schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
                return 0;
        }
        *all_pinned = 0;
 
        if (task_running(rq, p)) {
-               schedstat_inc(p, se.nr_failed_migrations_running);
+               schedstat_inc(p, se.statistics.nr_failed_migrations_running);
                return 0;
        }
 
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 #ifdef CONFIG_SCHEDSTATS
                if (tsk_cache_hot) {
                        schedstat_inc(sd, lb_hot_gained[idle]);
-                       schedstat_inc(p, se.nr_forced_migrations);
+                       schedstat_inc(p, se.statistics.nr_forced_migrations);
                }
 #endif
                return 1;
        }
 
        if (tsk_cache_hot) {
-               schedstat_inc(p, se.nr_failed_migrations_hot);
+               schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
                return 0;
        }
        return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
 
 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
 {
-       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long weight = sd->span_weight;
        unsigned long smt_gain = sd->smt_gain;
 
        smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
 
 static void update_cpu_power(struct sched_domain *sd, int cpu)
 {
-       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long weight = sd->span_weight;
        unsigned long power = SCHED_LOAD_SCALE;
        struct sched_group *sdg = sd->groups;
 
@@ -3112,8 +3040,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 
        /* move a task from busiest_rq to target_rq */
        double_lock_balance(busiest_rq, target_rq);
-       update_rq_clock(busiest_rq);
-       update_rq_clock(target_rq);
 
        /* Search for an sd spanning us and the target CPU. */
        for_each_domain(target_cpu, sd) {
index d5059fd..83c66e8 100644 (file)
@@ -1,11 +1,4 @@
 /*
- * Disregards a certain amount of sleep time (sched_latency_ns) and
- * considers the task to be running during that period. This gives it
- * a service deficit on wakeup, allowing it to run sooner.
- */
-SCHED_FEAT(FAIR_SLEEPERS, 1)
-
-/*
  * Only give sleepers 50% of their service deficit. This allows
  * them to run sooner, but does not allow tons of sleepers to
  * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
 SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
 
 /*
- * By not normalizing the sleep time, heavy tasks get an effective
- * longer period, and lighter task an effective shorter period they
- * are considered running.
- */
-SCHED_FEAT(NORMALIZED_SLEEPER, 0)
-
-/*
  * Place new tasks ahead so that they do not starve already running
  * tasks
  */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 
 /*
- * Compute wakeup_gran based on task behaviour, clipped to
- *  [0, sched_wakeup_gran_ns]
- */
-SCHED_FEAT(ADAPTIVE_GRAN, 1)
-
-/*
- * When converting the wakeup granularity to virtual time, do it such
- * that heavier tasks preempting a lighter task have an edge.
- */
-SCHED_FEAT(ASYM_GRAN, 1)
-
-/*
- * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
- */
-SCHED_FEAT(WAKEUP_SYNC, 0)
-
-/*
- * Wakeup preempt based on task behaviour. Tasks that do not overlap
- * don't get preempted.
- */
-SCHED_FEAT(WAKEUP_OVERLAP, 0)
-
-/*
- * Use the SYNC wakeup hint, pipes and the likes use this to indicate
- * the remote end is likely to consume the data we just wrote, and
- * therefore has cache benefit from being placed on the same cpu, see
- * also AFFINE_WAKEUPS.
- */
-SCHED_FEAT(SYNC_WAKEUPS, 1)
-
-/*
  * Based on load and program behaviour, see if it makes sense to place
  * a newly woken task on the same cpu as the task that woke it --
  * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
 
 /*
- * Weaken SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_LESS, 1)
-
-/*
- * Add SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_MORE, 0)
-
-/*
  * Prefer to schedule the task we woke last (assuming it failed
  * wakeup-preemption), since its likely going to consume data we
  * touched, increases cache locality.
index a8a6d8a..9fa0f40 100644 (file)
@@ -6,7 +6,8 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
        return task_cpu(p); /* IDLE tasks as never migrated */
 }
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
        schedstat_inc(rq, sched_goidle);
-       /* adjust the active tasks as we might go into a long sleep */
-       calc_load_account_active(rq);
+       calc_load_account_idle(rq);
        return rq->idle;
 }
 
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
  * message if some code attempts to do it:
  */
 static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
 {
        raw_spin_unlock_irq(&rq->lock);
        printk(KERN_ERR "bad: scheduling from the idle thread!\n");
index b5b920a..8afb953 100644 (file)
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
        if (unlikely((s64)delta_exec < 0))
                delta_exec = 0;
 
-       schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
+       schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
 
        curr->se.sum_exec_runtime += delta_exec;
        account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * Adding/removing a task to/from a priority array:
  */
 static void
-enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
        struct sched_rt_entity *rt_se = &p->rt;
 
-       if (wakeup)
+       if (flags & ENQUEUE_WAKEUP)
                rt_se->timeout = 0;
 
-       enqueue_rt_entity(rt_se, head);
+       enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 
        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
 }
 
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
        struct sched_rt_entity *rt_se = &p->rt;
 
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
-       struct rq *rq = task_rq(p);
-
        if (sd_flag != SD_BALANCE_WAKE)
                return smp_processor_id();
 
index f992762..f25735a 100644 (file)
@@ -262,6 +262,9 @@ void tick_nohz_stop_sched_tick(int inidle)
                goto end;
        }
 
+       if (nohz_ratelimit(cpu))
+               goto end;
+
        ts->idle_calls++;
        /* Read jiffies and the time when jiffies were updated last */
        do {
index 766467b..8e1c8c0 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/user_namespace.h>
-#include "cred-internals.h"
 
 struct user_namespace init_user_ns = {
        .kref = {
@@ -137,9 +136,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
        struct hlist_head *hashent = uidhashentry(ns, uid);
        struct user_struct *up, *new;
 
-       /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
-        * atomic.
-        */
+       /* Make uid_hash_find() + uid_hash_insert() atomic. */
        spin_lock_irq(&uidhash_lock);
        up = uid_hash_find(uid, hashent);
        spin_unlock_irq(&uidhash_lock);
@@ -161,11 +158,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
                spin_lock_irq(&uidhash_lock);
                up = uid_hash_find(uid, hashent);
                if (up) {
-                       /* This case is not possible when CONFIG_USER_SCHED
-                        * is defined, since we serialize alloc_uid() using
-                        * uids_mutex. Hence no need to call
-                        * sched_destroy_user() or remove_user_sysfs_dir().
-                        */
                        key_put(new->uid_keyring);
                        key_put(new->session_keyring);
                        kmem_cache_free(uid_cachep, new);
@@ -178,8 +170,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 
        return up;
 
-       put_user_ns(new->user_ns);
-       kmem_cache_free(uid_cachep, new);
 out_unlock:
        return NULL;
 }