#include <linux/latencytop.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
/*
* Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 2000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
-static unsigned int sched_nr_latency = 3;
+static unsigned int sched_nr_latency = 8;
/*
* After fork, child runs first. If set to 0 (default) then
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
-/*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
static const struct sched_class fair_sched_class;
/**************************************************************
return cfs_rq->tg->cfs_rq[this_cpu];
}
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_rq->on_list) {
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases.
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ } else {
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ }
+
+ cfs_rq->on_list = 1;
+ }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->on_list) {
+ list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+ cfs_rq->on_list = 0;
+ }
+}
+
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
return &cpu_rq(this_cpu)->cfs;
}
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
}
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
}
/*
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
return rb_entry(left, struct sched_entity, run_node);
}
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+ struct rb_node *next = rb_next(&se->run_node);
+
+ if (!next)
+ return NULL;
+
+ return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
* Scheduling class statistics methods:
*/
-#ifdef CONFIG_SCHED_DEBUG
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
- WRT_SYSCTL(sched_shares_ratelimit);
#undef WRT_SYSCTL
return 0;
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock;
+ u64 now = rq_of(cfs_rq)->clock_task;
unsigned long delta_exec;
if (unlikely(!curr))
/*
* We are starting a new run period:
*/
- se->exec_start = rq_of(cfs_rq)->clock;
+ se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
list_add(&se->group_node, &cfs_rq->tasks);
}
cfs_rq->nr_running++;
- se->on_rq = 1;
}
static void
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
- se->on_rq = 0;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+ int global_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long load_avg;
+
+ load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+ load_avg -= cfs_rq->load_contribution;
+
+ if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+ atomic_add(load_avg, &tg->load_weight);
+ cfs_rq->load_contribution += load_avg;
+ }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+ u64 period = sysctl_sched_shares_window;
+ u64 now, delta;
+ unsigned long load = cfs_rq->load.weight;
+
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = rq_of(cfs_rq)->clock_task;
+ delta = now - cfs_rq->load_stamp;
+
+ /* truncate load history at 4 idle periods */
+ if (cfs_rq->load_stamp > cfs_rq->load_last &&
+ now - cfs_rq->load_last > 4 * period) {
+ cfs_rq->load_period = 0;
+ cfs_rq->load_avg = 0;
+ delta = period - 1;
+ }
+
+ cfs_rq->load_stamp = now;
+ cfs_rq->load_unacc_exec_time = 0;
+ cfs_rq->load_period += delta;
+ if (load) {
+ cfs_rq->load_last = now;
+ cfs_rq->load_avg += delta * load;
+ }
+
+ /* consider updating load contribution on each fold or truncate */
+ if (global_update || cfs_rq->load_period > period
+ || !cfs_rq->load_period)
+ update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+ while (cfs_rq->load_period > period) {
+ /*
+ * Inline assembly required to prevent the compiler
+ * optimising this loop into a divmod call.
+ * See __iter_div_u64_rem() for another example of this.
+ */
+ asm("" : "+rm" (cfs_rq->load_period));
+ cfs_rq->load_period /= 2;
+ cfs_rq->load_avg /= 2;
+ }
+
+ if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+ list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+ long load_weight, load, shares;
+
+ load = cfs_rq->load.weight;
+
+ load_weight = atomic_read(&tg->load_weight);
+ load_weight += load;
+ load_weight -= cfs_rq->load_contribution;
+
+ shares = (tg->shares * load);
+ if (load_weight)
+ shares /= load_weight;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ return shares;
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
+ }
+}
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+ return tg->shares;
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
+ account_entity_dequeue(cfs_rq, se);
+ }
+
+ update_load_set(&se->load, weight);
+
+ if (se->on_rq)
+ account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg;
+ struct sched_entity *se;
+ long shares;
+
+ tg = cfs_rq->tg;
+ se = tg->se[cpu_of(rq_of(cfs_rq))];
+ if (!se)
+ return;
+#ifndef CONFIG_SMP
+ if (likely(se->load.weight == tg->shares))
+ return;
+#endif
+ shares = calc_cfs_shares(cfs_rq, tg);
+
+ reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
se->vruntime = vruntime;
}
-#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_MIGRATE 2
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_cfs_load(cfs_rq, 0);
account_entity_enqueue(cfs_rq, se);
+ update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
+ se->on_rq = 1;
+
+ if (cfs_rq->nr_running == 1)
+ list_add_leaf_cfs_rq(cfs_rq);
+}
+
+static void __clear_buddies_last(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->last == se)
+ cfs_rq->last = NULL;
+ else
+ break;
+ }
}
-static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies_next(struct sched_entity *se)
{
- if (!se || cfs_rq->last == se)
- cfs_rq->last = NULL;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+ else
+ break;
+ }
+}
- if (!se || cfs_rq->next == se)
- cfs_rq->next = NULL;
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->skip == se)
+ cfs_rq->skip = NULL;
+ else
+ break;
+ }
}
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- for_each_sched_entity(se)
- __clear_buddies(cfs_rq_of(se), se);
+ if (cfs_rq->last == se)
+ __clear_buddies_last(se);
+
+ if (cfs_rq->next == se)
+ __clear_buddies_next(se);
+
+ if (cfs_rq->skip == se)
+ __clear_buddies_skip(se);
}
static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
update_curr(cfs_rq);
update_stats_dequeue(cfs_rq, se);
- if (sleep) {
+ if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
+ se->on_rq = 0;
+ update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
+ update_cfs_shares(cfs_rq);
/*
* Normalize the entity after updating the min_vruntime because the
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
- if (!sleep)
+ if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
}
return;
if (cfs_rq->nr_running > 1) {
- struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
s64 delta = curr->vruntime - se->vruntime;
+ if (delta < 0)
+ return;
+
if (delta > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr);
}
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *left = se;
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
- se = cfs_rq->next;
+ /*
+ * Avoid running the skip buddy, if running something else can
+ * be done without getting too unfair.
+ */
+ if (cfs_rq->skip == se) {
+ struct sched_entity *second = __pick_next_entity(se);
+ if (second && wakeup_preempt_entity(second, left) < 1)
+ se = second;
+ }
/*
* Prefer last buddy, try to return the CPU to a preempted task.
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ se = cfs_rq->next;
+
clear_buddies(cfs_rq, se);
return se;
*/
update_curr(cfs_rq);
+ /*
+ * Update share accounting for long-running entities.
+ */
+ update_entity_shares_tick(cfs_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* then put the task into the rbtree:
*/
static void
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
- int flags = 0;
-
- if (wakeup)
- flags |= ENQUEUE_WAKEUP;
- if (p->state == TASK_WAKING)
- flags |= ENQUEUE_MIGRATE;
for_each_sched_entity(se) {
if (se->on_rq)
flags = ENQUEUE_WAKEUP;
}
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
+ }
+
hrtick_update(rq);
}
+static void set_next_buddy(struct sched_entity *se);
+
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int task_sleep = flags & DEQUEUE_SLEEP;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, sleep);
+ dequeue_entity(cfs_rq, se, flags);
+
/* Don't dequeue parent if it has other entities besides us */
- if (cfs_rq->load.weight)
+ if (cfs_rq->load.weight) {
+ /*
+ * Bias pick_next to pick a task from this cfs_rq, as
+ * p is sleeping when it is within its sched_slice.
+ */
+ if (task_sleep && parent_entity(se))
+ set_next_buddy(parent_entity(se));
break;
- sleep = 1;
+ }
+ flags |= DEQUEUE_SLEEP;
}
- hrtick_update(rq);
-}
-
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
- struct task_struct *curr = rq->curr;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- struct sched_entity *rightmost, *se = &curr->se;
-
- /*
- * Are we the only task in the tree?
- */
- if (unlikely(cfs_rq->nr_running == 1))
- return;
-
- clear_buddies(cfs_rq, se);
-
- if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
- return;
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
}
- /*
- * Find the rightmost entry in the rbtree:
- */
- rightmost = __pick_last_entity(cfs_rq);
- /*
- * Already in the rightmost position?
- */
- if (unlikely(!rightmost || entity_before(rightmost, se)))
- return;
- /*
- * Minimally necessary key value to be last in the tree:
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- se->vruntime = rightmost->vruntime + 1;
+ hrtick_update(rq);
}
#ifdef CONFIG_SMP
-static void task_waking_fair(struct rq *rq, struct task_struct *p)
+static void task_waking_fair(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 min_vruntime;
- se->vruntime -= cfs_rq->min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+
+ do {
+ min_vruntime_copy = cfs_rq->min_vruntime_copy;
+ smp_rmb();
+ min_vruntime = cfs_rq->min_vruntime;
+ } while (min_vruntime != min_vruntime_copy);
+#else
+ min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+ se->vruntime -= min_vruntime;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
*/
-static long effective_load(struct task_group *tg, int cpu,
- long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
if (!tg->parent)
return wl;
- /*
- * By not taking the decrease of shares on the other cpu into
- * account our error leans towards reducing the affine wakeups.
- */
- if (!wl && sched_feat(ASYM_EFF_LOAD))
- return wl;
-
for_each_sched_entity(se) {
- long S, rw, s, a, b;
- long more_w;
-
- /*
- * Instead of using this increment, also add the difference
- * between when the shares were last updated and now.
- */
- more_w = se->my_q->load.weight - se->my_q->rq_weight;
- wl += more_w;
- wg += more_w;
+ long lw, w;
- S = se->my_q->tg->shares;
- s = se->my_q->shares;
- rw = se->my_q->rq_weight;
+ tg = se->my_q->tg;
+ w = se->my_q->load.weight;
- a = S*(rw + wl);
- b = S*rw + s*wg;
+ /* use this cpu's instantaneous contribution */
+ lw = atomic_read(&tg->load_weight);
+ lw -= se->my_q->load_contribution;
+ lw += w + wg;
- wl = s*(a-b);
+ wl += w;
- if (likely(b))
- wl /= b;
+ if (lw > 0 && wl < lw)
+ wl = (wl * tg->shares) / lw;
+ else
+ wl = tg->shares;
- /*
- * Assume the group is already running and will
- * thus already be accounted for in the weight.
- *
- * That is, moving shares between CPUs, does not
- * alter the group weight.
- */
+ /* zero point is MIN_SHARES */
+ if (wl < MIN_SHARES)
+ wl = MIN_SHARES;
+ wl -= se->load.weight;
wg = 0;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- unsigned long this_load, load;
+ s64 this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
- unsigned int imbalance;
struct task_group *tg;
unsigned long weight;
int balanced;
* effect of the currently running task from the load
* of the current CPU:
*/
+ rcu_read_lock();
if (sync) {
tg = task_group(current);
weight = current->se.load.weight;
tg = task_group(p);
weight = p->se.load.weight;
- imbalance = 100 + (sd->imbalance_pct - 100) / 2;
-
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped this_load to 0, we'll
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- balanced = !this_load ||
- 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
- imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
+ if (this_load > 0) {
+ s64 this_eff_load, prev_eff_load;
+
+ this_eff_load = 100;
+ this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= this_load +
+ effective_load(tg, this_cpu, weight, weight);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+
+ balanced = this_eff_load <= prev_eff_load;
+ } else
+ balanced = true;
+ rcu_read_unlock();
/*
* If the currently running task will sleep within
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int load_idx)
{
- struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+ struct sched_group *idlest = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
if (local_group) {
this_load = avg_load;
- this = group;
} else if (avg_load < min_load) {
min_load = avg_load;
idlest = group;
/*
* Try and locate an idle CPU in the sched_domain.
*/
-static int
-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_sibling(struct task_struct *p, int target)
{
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
+ struct sched_domain *sd;
int i;
/*
- * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
- * test in select_task_rq_fair) and the prev_cpu is idle then that's
- * always a better target than the current cpu.
+ * If the task is going to be woken-up on this cpu and if it is
+ * already idle, then it is the right target.
+ */
+ if (target == cpu && idle_cpu(cpu))
+ return cpu;
+
+ /*
+ * If the task is going to be woken-up on the cpu where it previously
+ * ran and if it is currently idle, then it the right target.
*/
- if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+ if (target == prev_cpu && idle_cpu(prev_cpu))
return prev_cpu;
/*
- * Otherwise, iterate the domain and find an elegible idle cpu.
+ * Otherwise, iterate the domains and find an elegible idle cpu.
*/
- for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
- if (!cpu_rq(i)->cfs.nr_running) {
- target = i;
+ rcu_read_lock();
+ for_each_domain(target, sd) {
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
+
+ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+ if (idle_cpu(i)) {
+ target = i;
+ break;
+ }
}
+
+ /*
+ * Lets stop looking for an idle sibling when we reached
+ * the domain that spans the current cpu and prev_cpu.
+ */
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+ break;
}
+ rcu_read_unlock();
return target;
}
*
* preempt must be disabled.
*/
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+static int
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
int new_cpu = cpu;
- int want_affine = 0, cpu_idle = !current->pid;
+ int want_affine = 0;
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE) {
- if (sched_feat(AFFINE_WAKEUPS) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed))
want_affine = 1;
new_cpu = prev_cpu;
}
+ rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
continue;
nr_running += cpu_rq(i)->cfs.nr_running;
}
- capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+ capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
nr_running /= 2;
}
/*
- * While iterating the domains looking for a spanning
- * WAKE_AFFINE domain, adjust the affine target to any idle cpu
- * in cache sharing domains along the way.
+ * If both cpu and prev_cpu are part of this domain,
+ * cpu is a valid SD_WAKE_AFFINE target.
*/
- if (want_affine) {
- int target = -1;
-
- /*
- * If both cpu and prev_cpu are part of this domain,
- * cpu is a valid SD_WAKE_AFFINE target.
- */
- if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
- target = cpu;
-
- /*
- * If there's an idle sibling in this domain, make that
- * the wake_affine target instead of the current cpu.
- */
- if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES)
- target = select_idle_sibling(p, tmp, target);
-
- if (target >= 0) {
- if (tmp->flags & SD_WAKE_AFFINE) {
- affine_sd = tmp;
- want_affine = 0;
- if (target != cpu)
- cpu_idle = 1;
- }
- cpu = target;
- }
+ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+ affine_sd = tmp;
+ want_affine = 0;
}
if (!want_sd && !want_affine)
sd = tmp;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (sched_feat(LB_SHARES_UPDATE)) {
- /*
- * Pick the largest domain to update shares over
- */
- tmp = sd;
- if (affine_sd && (!tmp ||
- cpumask_weight(sched_domain_span(affine_sd)) >
- cpumask_weight(sched_domain_span(sd))))
- tmp = affine_sd;
-
- if (tmp)
- update_shares(tmp);
- }
-#endif
-
if (affine_sd) {
- if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync))
- return cpu;
+ if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+ prev_cpu = cpu;
+
+ new_cpu = select_idle_sibling(p, prev_cpu);
+ goto unlock;
}
while (sd) {
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
- weight = cpumask_weight(sched_domain_span(sd));
+ weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
- if (weight <= cpumask_weight(sched_domain_span(tmp)))
+ if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
+unlock:
+ rcu_read_unlock();
return new_cpu;
}
/*
* Since its curr running now, convert the gran from real-time
* to virtual-time in his units.
+ *
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
*/
- if (sched_feat(ASYM_GRAN)) {
- /*
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- if (unlikely(se->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, se);
- } else {
- if (unlikely(curr->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, curr);
- }
-
- return gran;
+ return calc_delta_fair(gran, se);
}
/*
static void set_last_buddy(struct sched_entity *se)
{
- if (likely(task_of(se)->policy != SCHED_IDLE)) {
- for_each_sched_entity(se)
- cfs_rq_of(se)->last = se;
- }
+ if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ return;
+
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->last = se;
}
static void set_next_buddy(struct sched_entity *se)
{
- if (likely(task_of(se)->policy != SCHED_IDLE)) {
- for_each_sched_entity(se)
- cfs_rq_of(se)->next = se;
- }
+ if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ return;
+
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->next = se;
+}
+
+static void set_skip_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->skip = se;
}
/*
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
-
- if (unlikely(rt_prio(p->prio)))
- goto preempt;
-
- if (unlikely(p->sched_class != &fair_sched_class))
- return;
+ int next_buddy_marked = 0;
if (unlikely(se == pse))
return;
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
+ if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
+ next_buddy_marked = 1;
+ }
/*
* We can come here with TIF_NEED_RESCHED already set from new task
if (test_tsk_need_resched(curr))
return;
+ /* Idle tasks are by definition preempted by non-idle tasks. */
+ if (unlikely(curr->policy == SCHED_IDLE) &&
+ likely(p->policy != SCHED_IDLE))
+ goto preempt;
+
/*
- * Batch and idle tasks do not preempt (their preemption is driven by
- * the tick):
+ * Batch and idle tasks do not preempt non-idle tasks (their preemption
+ * is driven by the tick):
*/
if (unlikely(p->policy != SCHED_NORMAL))
return;
- /* Idle tasks are by definition preempted by everybody. */
- if (unlikely(curr->policy == SCHED_IDLE))
- goto preempt;
if (!sched_feat(WAKEUP_PREEMPT))
return;
update_curr(cfs_rq);
find_matching_se(&se, &pse);
BUG_ON(!pse);
- if (wakeup_preempt_entity(se, pse) == 1)
+ if (wakeup_preempt_entity(se, pse) == 1) {
+ /*
+ * Bias pick_next to pick the sched entity that is
+ * triggering this preemption.
+ */
+ if (!next_buddy_marked)
+ set_next_buddy(pse);
goto preempt;
+ }
return;
}
}
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct sched_entity *se = &curr->se;
+
+ /*
+ * Are we the only task in the tree?
+ */
+ if (unlikely(rq->nr_running == 1))
+ return;
+
+ clear_buddies(cfs_rq, se);
+
+ if (curr->policy != SCHED_BATCH) {
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ }
+
+ set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+ struct sched_entity *se = &p->se;
+
+ if (!se->on_rq)
+ return false;
+
+ /* Tell the scheduler that we'd really like pse to run next. */
+ set_next_buddy(se);
+
+ yield_task_fair(rq);
+
+ return true;
+}
+
#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods:
* 2) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq->clock, sd);
+ tsk_cache_hot = task_hot(p, rq->clock_task, sd);
if (!tsk_cache_hot ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, struct sched_domain *sd,
enum cpu_idle_type idle, int *all_pinned,
- int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
+ struct cfs_rq *busiest_cfs_rq)
{
- int loops = 0, pulled = 0, pinned = 0;
+ int loops = 0, pulled = 0;
long rem_load_move = max_load_move;
struct task_struct *p, *n;
if (max_load_move == 0)
goto out;
- pinned = 1;
-
list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
if (loops++ > sysctl_sched_nr_migrate)
break;
if ((p->se.load.weight >> 1) > rem_load_move ||
- !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+ !can_migrate_task(p, busiest, this_cpu, sd, idle,
+ all_pinned))
continue;
pull_task(busiest, p, this_rq, this_cpu);
*/
if (rem_load_move <= 0)
break;
-
- if (p->prio < *this_best_prio)
- *this_best_prio = p->prio;
}
out:
/*
*/
schedstat_add(sd, lb_gained[idle], pulled);
- if (all_pinned)
- *all_pinned = pinned;
-
return max_load_move - rem_load_move;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (!tg->se[cpu])
+ return 0;
+
+ rq = cpu_rq(cpu);
+ cfs_rq = tg->cfs_rq[cpu];
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ update_rq_clock(rq);
+ update_cfs_load(cfs_rq, 1);
+
+ /*
+ * We need to update shares after updating tg->load_weight in
+ * order to adjust the weight of groups with long running tasks.
+ */
+ update_cfs_shares(cfs_rq);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return 0;
+}
+
+static void update_shares(int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ struct rq *rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ for_each_leaf_cfs_rq(rq, cfs_rq)
+ update_shares_cpu(cfs_rq->tg, cpu);
+ rcu_read_unlock();
+}
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
+ int *all_pinned)
{
long rem_load_move = max_load_move;
int busiest_cpu = cpu_of(busiest);
rem_load = div_u64(rem_load, busiest_h_load + 1);
moved_load = balance_tasks(this_rq, this_cpu, busiest,
- rem_load, sd, idle, all_pinned, this_best_prio,
+ rem_load, sd, idle, all_pinned,
busiest_cfs_rq);
if (!moved_load)
return max_load_move - rem_load_move;
}
#else
+static inline void update_shares(int cpu)
+{
+}
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
+ int *all_pinned)
{
return balance_tasks(this_rq, this_cpu, busiest,
max_load_move, sd, idle, all_pinned,
- this_best_prio, &busiest->cfs);
+ &busiest->cfs);
}
#endif
int *all_pinned)
{
unsigned long total_load_moved = 0, load_moved;
- int this_best_prio = this_rq->curr->prio;
do {
load_moved = load_balance_fair(this_rq, this_cpu, busiest,
max_load_move - total_load_moved,
- sd, idle, all_pinned, &this_best_prio);
+ sd, idle, all_pinned);
total_load_moved += load_moved;
unsigned long this_load;
unsigned long this_load_per_task;
unsigned long this_nr_running;
+ unsigned long this_has_capacity;
+ unsigned int this_idle_cpus;
/* Statistics of the busiest group */
+ unsigned int busiest_idle_cpus;
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
unsigned long busiest_group_capacity;
+ unsigned long busiest_has_capacity;
+ unsigned int busiest_group_weight;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
+ unsigned long idle_cpus;
+ unsigned long group_weight;
int group_imb; /* Is there an imbalance in the group ? */
+ int group_has_capacity; /* Is there extra capacity in the group? */
};
/**
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
- return SCHED_LOAD_SCALE;
+ return SCHED_POWER_SCALE;
}
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
- unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
smt_gain /= weight;
struct rq *rq = cpu_rq(cpu);
u64 total, available;
- sched_avg_update(rq);
-
total = sched_avg_period() + (rq->clock - rq->age_stamp);
- available = total - rq->rt_avg;
- if (unlikely((s64)total < SCHED_LOAD_SCALE))
- total = SCHED_LOAD_SCALE;
+ if (unlikely(total < rq->rt_avg)) {
+ /* Ensures that power won't end up being negative */
+ available = 0;
+ } else {
+ available = total - rq->rt_avg;
+ }
- total >>= SCHED_LOAD_SHIFT;
+ if (unlikely((s64)total < SCHED_POWER_SCALE))
+ total = SCHED_POWER_SCALE;
+
+ total >>= SCHED_POWER_SHIFT;
return div_u64(available, total);
}
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
- unsigned long weight = cpumask_weight(sched_domain_span(sd));
- unsigned long power = SCHED_LOAD_SCALE;
+ unsigned long weight = sd->span_weight;
+ unsigned long power = SCHED_POWER_SCALE;
struct sched_group *sdg = sd->groups;
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_freq_power(sd, cpu);
- else
- power *= default_scale_freq_power(sd, cpu);
-
- power >>= SCHED_LOAD_SHIFT;
-
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
if (sched_feat(ARCH_POWER))
power *= arch_scale_smt_power(sd, cpu);
else
power *= default_scale_smt_power(sd, cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
}
+ sdg->cpu_power_orig = power;
+
+ if (sched_feat(ARCH_POWER))
+ power *= arch_scale_freq_power(sd, cpu);
+ else
+ power *= default_scale_freq_power(sd, cpu);
+
+ power >>= SCHED_POWER_SHIFT;
+
power *= scale_rt_power(cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
if (!power)
power = 1;
+ cpu_rq(cpu)->cpu_power = power;
sdg->cpu_power = power;
}
sdg->cpu_power = power;
}
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+ /*
+ * Only siblings can have significantly less than SCHED_POWER_SCALE
+ */
+ if (!(sd->flags & SD_SHARE_CPUPOWER))
+ return 0;
+
+ /*
+ * If ~90% of the cpu_power is still there, we're good.
+ */
+ if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+ return 1;
+
+ return 0;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @sd: The sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
* @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
* @local_group: Does group contain this_cpu.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
*/
static inline void update_sg_lb_stats(struct sched_domain *sd,
struct sched_group *group, int this_cpu,
- enum cpu_idle_type idle, int load_idx, int *sd_idle,
+ enum cpu_idle_type idle, int load_idx,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
- unsigned long load, max_cpu_load, min_cpu_load;
+ unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
min_cpu_load = ~0UL;
+ max_nr_running = 0;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
- if (*sd_idle && rq->nr_running)
- *sd_idle = 0;
-
/* Bias balancing toward cpus of our domain */
if (local_group) {
if (idle_cpu(i) && !first_idle_cpu) {
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
- if (load > max_cpu_load)
+ if (load > max_cpu_load) {
max_cpu_load = load;
+ max_nr_running = rq->nr_running;
+ }
if (min_cpu_load > load)
min_cpu_load = load;
}
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
sgs->sum_weighted_load += weighted_cpuload(i);
-
+ if (idle_cpu(i))
+ sgs->idle_cpus++;
}
/*
* domains. In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
- if (idle != CPU_NEWLY_IDLE && local_group &&
- balance_cpu != this_cpu) {
- *balance = 0;
- return;
+ if (idle != CPU_NEWLY_IDLE && local_group) {
+ if (balance_cpu != this_cpu) {
+ *balance = 0;
+ return;
+ }
+ update_group_power(sd, this_cpu);
}
- update_group_power(sd, this_cpu);
-
/* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
/*
* Consider the group unbalanced when the imbalance is larger
- * than the average weight of two tasks.
+ * than the average weight of a task.
*
* APZ: with cgroup the avg task weight can vary wildly and
* might not be a suitable number - should we keep a
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity =
- DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+ SCHED_POWER_SCALE);
+ if (!sgs->group_capacity)
+ sgs->group_capacity = fix_small_capacity(sd, group);
+ sgs->group_weight = group->group_weight;
+
+ if (sgs->group_capacity > sgs->sum_nr_running)
+ sgs->group_has_capacity = 1;
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs,
+ int this_cpu)
+{
+ if (sgs->avg_load <= sds->max_load)
+ return false;
+
+ if (sgs->sum_nr_running > sgs->group_capacity)
+ return true;
+
+ if (sgs->group_imb)
+ return true;
+
+ /*
+ * ASYM_PACKING needs to move all the work to the lowest
+ * numbered CPUs in the group, therefore mark all groups
+ * higher than ourself as busy.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+ this_cpu < group_first_cpu(sg)) {
+ if (!sds->busiest)
+ return true;
+
+ if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+ return true;
+ }
+
+ return false;
}
/**
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
- enum cpu_idle_type idle, int *sd_idle,
- const struct cpumask *cpus, int *balance,
- struct sd_lb_stats *sds)
+ enum cpu_idle_type idle, const struct cpumask *cpus,
+ int *balance, struct sd_lb_stats *sds)
{
struct sched_domain *child = sd->child;
- struct sched_group *group = sd->groups;
+ struct sched_group *sg = sd->groups;
struct sg_lb_stats sgs;
int load_idx, prefer_sibling = 0;
do {
int local_group;
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+ update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
local_group, cpus, balance, &sgs);
if (local_group && !(*balance))
return;
sds->total_load += sgs.group_load;
- sds->total_pwr += group->cpu_power;
+ sds->total_pwr += sg->cpu_power;
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the group capacity to one so that we'll try
- * and move all the excess tasks away.
+ * first, lower the sg capacity to one so that we'll try
+ * and move all the excess tasks away. We lower the capacity
+ * of a group only if the local group has the capacity to fit
+ * these excess tasks, i.e. nr_running < group_capacity. The
+ * extra check prevents the case where you always pull from the
+ * heaviest group when it is already under-utilized (possible
+ * with a large weight task outweighs the tasks on the system).
*/
- if (prefer_sibling)
+ if (prefer_sibling && !local_group && sds->this_has_capacity)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
sds->this_load = sgs.avg_load;
- sds->this = group;
+ sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
- } else if (sgs.avg_load > sds->max_load &&
- (sgs.sum_nr_running > sgs.group_capacity ||
- sgs.group_imb)) {
+ sds->this_has_capacity = sgs.group_has_capacity;
+ sds->this_idle_cpus = sgs.idle_cpus;
+ } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
- sds->busiest = group;
+ sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
+ sds->busiest_idle_cpus = sgs.idle_cpus;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->busiest_has_capacity = sgs.group_has_capacity;
+ sds->busiest_group_weight = sgs.group_weight;
sds->group_imb = sgs.group_imb;
}
- update_sd_power_savings_stats(group, sds, local_group, &sgs);
- group = group->next;
- } while (group != sd->groups);
+ update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+ sg = sg->next;
+ } while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+ return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ * sched doman.
+ *
+ * This is primarily intended to used at the sibling level. Some
+ * cores like POWER7 prefer to use lower numbered SMT threads. In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle. When in lower SMT modes, the threads will
+ * perform better since they share less core resources. Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads. It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on. Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU. The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalanced due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+ struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ int busiest_cpu;
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ return 0;
+
+ if (!sds->busiest)
+ return 0;
+
+ busiest_cpu = group_first_cpu(sds->busiest);
+ if (this_cpu > busiest_cpu)
+ return 0;
+
+ *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+ SCHED_POWER_SCALE);
+ return 1;
}
/**
cpu_avg_load_per_task(this_cpu);
scaled_busy_load_per_task = sds->busiest_load_per_task
- * SCHED_LOAD_SCALE;
+ * SCHED_POWER_SCALE;
scaled_busy_load_per_task /= sds->busiest->cpu_power;
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
min(sds->busiest_load_per_task, sds->max_load);
pwr_now += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load);
- pwr_now /= SCHED_LOAD_SCALE;
+ pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
- tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
sds->busiest->cpu_power;
if (sds->max_load > tmp)
pwr_move += sds->busiest->cpu_power *
/* Amount of load we'd add */
if (sds->max_load * sds->busiest->cpu_power <
- sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+ sds->busiest_load_per_task * SCHED_POWER_SCALE)
tmp = (sds->max_load * sds->busiest->cpu_power) /
sds->this->cpu_power;
else
- tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
sds->this->cpu_power;
pwr_move += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load + tmp);
- pwr_move /= SCHED_LOAD_SCALE;
+ pwr_move /= SCHED_POWER_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
load_above_capacity = (sds->busiest_nr_running -
sds->busiest_group_capacity);
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
load_above_capacity /= sds->busiest->cpu_power;
}
/* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * sds->busiest->cpu_power,
(sds->avg_load - sds->this_load) * sds->this->cpu_power)
- / SCHED_LOAD_SCALE;
+ / SCHED_POWER_SCALE;
/*
* if *imbalance is less than the average load per runnable task
- * there is no gaurantee that any tasks will be moved so we'll have
+ * there is no guarantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
return fix_small_imbalance(sds, this_cpu, imbalance);
}
+
/******* find_busiest_group() helpers end here *********************/
/**
* @imbalance: Variable which stores amount of weighted load which should
* be moved to restore balance/put a group to idle.
* @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
* @cpus: The set of CPUs under consideration for load-balancing.
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
- int *sd_idle, const struct cpumask *cpus, int *balance)
+ const struct cpumask *cpus, int *balance)
{
struct sd_lb_stats sds;
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
- balance, &sds);
+ update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
- /* Cases where imbalance does not exist from POV of this_cpu */
- /* 1) this_cpu is not the appropriate cpu to perform load balancing
- * at this level.
- * 2) There is no busy sibling group to pull from.
- * 3) This group is the busiest group.
- * 4) This group is more busy than the avg busieness at this
- * sched_domain.
- * 5) The imbalance is within the specified limit.
+ /*
+ * this_cpu is not the appropriate cpu to perform load balancing at
+ * this level.
*/
if (!(*balance))
goto ret;
+ if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+ check_asym_packing(sd, &sds, this_cpu, imbalance))
+ return sds.busiest;
+
+ /* There is no busy sibling group to pull tasks from */
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
+ sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+
+ /*
+ * If the busiest group is imbalanced the below checks don't
+ * work because they assumes all things are equal, which typically
+ * isn't true due to cpus_allowed constraints and the like.
+ */
+ if (sds.group_imb)
+ goto force_balance;
+
+ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+ !sds.busiest_has_capacity)
+ goto force_balance;
+
+ /*
+ * If the local group is more busy than the selected busiest group
+ * don't try and pull any tasks.
+ */
if (sds.this_load >= sds.max_load)
goto out_balanced;
- sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-
+ /*
+ * Don't pull any tasks if this group is already above the domain
+ * average load.
+ */
if (sds.this_load >= sds.avg_load)
goto out_balanced;
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
- goto out_balanced;
+ if (idle == CPU_IDLE) {
+ /*
+ * This cpu is idle. If the busiest group load doesn't
+ * have more tasks than the number of available cpu's and
+ * there is no imbalance between this and busiest group
+ * wrt to idle cpu's, it is balanced.
+ */
+ if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
+ sds.busiest_nr_running <= sds.busiest_group_weight)
+ goto out_balanced;
+ } else {
+ /*
+ * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+ * imbalance_pct to be conservative.
+ */
+ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ goto out_balanced;
+ }
+force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
- unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+ enum cpu_idle_type idle, unsigned long imbalance,
+ const struct cpumask *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
for_each_cpu(i, sched_group_cpus(group)) {
unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+ unsigned long capacity = DIV_ROUND_CLOSEST(power,
+ SCHED_POWER_SCALE);
unsigned long wl;
+ if (!capacity)
+ capacity = fix_small_capacity(sd, group);
+
if (!cpumask_test_cpu(i, cpus))
continue;
* the load can be moved away from the cpu that is potentially
* running at a lower capacity.
*/
- wl = (wl * SCHED_LOAD_SCALE) / power;
+ wl = (wl * SCHED_POWER_SCALE) / power;
if (wl > max_load) {
max_load = wl;
/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int idle,
+ int busiest_cpu, int this_cpu)
{
if (idle == CPU_NEWLY_IDLE) {
+
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * higher numbered CPUs in order to pack all tasks in the
+ * lowest numbered CPUs.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+ return 1;
+
/*
* The only task running in a non-idle cpu can be moved to this
* cpu in an attempt to completely freeup the other CPU
* move_tasks() will succeed. ld_moved will be true and this
* active balance code will not be triggered.
*/
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- return 0;
-
if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
return 0;
}
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
+static int active_load_balance_cpu_stop(void *data);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+ int ld_moved, all_pinned = 0, active_balance = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
cpumask_copy(cpus, cpu_active_mask);
- /*
- * When power savings policy is enabled for the parent domain, idle
- * sibling can pick up load irrespective of busy siblings. In this case,
- * let the state of idle sibling percolate up as CPU_IDLE, instead of
- * portraying it as CPU_NOT_IDLE.
- */
- if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- sd_idle = 1;
-
schedstat_inc(sd, lb_count[idle]);
redo:
- update_shares(sd);
- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle,
cpus, balance);
if (*balance == 0)
goto out_balanced;
}
- busiest = find_busiest_queue(group, idle, imbalance, cpus);
+ busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
+ all_pinned = 1;
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
ld_moved = move_tasks(this_rq, this_cpu, busiest,
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
- sd->nr_balance_failed++;
+ /*
+ * Increment the failure counter only on periodic balance.
+ * We do not want newidle balance, which can be very
+ * frequent, pollute the failure counter causing
+ * excessive cache_hot migrations and active balances.
+ */
+ if (idle != CPU_NEWLY_IDLE)
+ sd->nr_balance_failed++;
- if (need_active_balance(sd, sd_idle, idle)) {
+ if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
- /* don't kick the migration_thread, if the curr
- * task on busiest cpu can't be moved to this_cpu
+ /* don't kick the active_load_balance_cpu_stop,
+ * if the curr task on busiest cpu can't be
+ * moved to this_cpu
*/
if (!cpumask_test_cpu(this_cpu,
&busiest->curr->cpus_allowed)) {
goto out_one_pinned;
}
+ /*
+ * ->active_balance synchronizes accesses to
+ * ->active_balance_work. Once set, it's cleared
+ * only after active load balance is finished.
+ */
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
if (active_balance)
- wake_up_process(busiest->migration_thread);
+ stop_one_cpu_nowait(cpu_of(busiest),
+ active_load_balance_cpu_stop, busiest,
+ &busiest->active_balance_work);
/*
* We've kicked active balancing, reset the failure
sd->balance_interval *= 2;
}
- if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
-
goto out;
out_balanced:
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
- else
- ld_moved = 0;
+ ld_moved = 0;
out:
- if (ld_moved)
- update_shares(sd);
return ld_moved;
}
*/
raw_spin_unlock(&this_rq->lock);
+ update_shares(this_cpu);
+ rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
break;
}
}
+ rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
}
/*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
*/
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+static int active_load_balance_cpu_stop(void *data)
{
+ struct rq *busiest_rq = data;
+ int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
- struct rq *target_rq;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance))
+ goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
- return;
-
- target_rq = cpu_rq(target_cpu);
+ goto out_unlock;
/*
* This condition is "impossible", if it occurs
double_lock_balance(busiest_rq, target_rq);
/* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
else
schedstat_inc(sd, alb_failed);
}
+ rcu_read_unlock();
double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
}
#ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+ csd->func = trigger_sched_softirq;
+ csd->info = NULL;
+ csd->flags = 0;
+ csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ * entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ * it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
+ */
static struct {
atomic_t load_balancer;
- cpumask_var_t cpu_mask;
- cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
- .load_balancer = ATOMIC_INIT(-1),
-};
+ atomic_t first_pick_cpu;
+ atomic_t second_pick_cpu;
+ cpumask_var_t idle_cpus_mask;
+ cpumask_var_t grp_idle_mask;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
int get_nohz_load_balancer(void)
{
*/
static inline int is_semi_idle_group(struct sched_group *ilb_group)
{
- cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
sched_group_cpus(ilb_group));
/*
* A sched_group is semi-idle when it has atleast one busy cpu
* and atleast one idle cpu.
*/
- if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ if (cpumask_empty(nohz.grp_idle_mask))
return 0;
- if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
return 0;
return 1;
{
struct sched_domain *sd;
struct sched_group *ilb_group;
+ int ilb = nr_cpu_ids;
/*
* Have idle load balancer selection from semi-idle packages only
* Optimize for the case when we have no idle CPUs or only one
* idle CPU. Don't walk the sched_domain hierarchy in such cases
*/
- if (cpumask_weight(nohz.cpu_mask) < 2)
+ if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done;
+ rcu_read_lock();
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
ilb_group = sd->groups;
do {
- if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.ilb_grp_nohz_mask);
+ if (is_semi_idle_group(ilb_group)) {
+ ilb = cpumask_first(nohz.grp_idle_mask);
+ goto unlock;
+ }
ilb_group = ilb_group->next;
} while (ilb_group != sd->groups);
}
+unlock:
+ rcu_read_unlock();
out_done:
- return cpumask_first(nohz.cpu_mask);
+ return ilb;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
{
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#endif
+/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+ int ilb_cpu;
+
+ nohz.next_balance++;
+
+ ilb_cpu = get_nohz_load_balancer();
+
+ if (ilb_cpu >= nr_cpu_ids) {
+ ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
+ }
+
+ if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+ struct call_single_data *cp;
+
+ cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+ cp = &per_cpu(remote_sched_softirq_cb, cpu);
+ __smp_call_function_single(ilb_cpu, cp, 0);
+ }
+ return;
+}
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
*
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
*
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
*/
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();
if (stop_tick) {
- cpu_rq(cpu)->in_nohz_recently = 1;
-
if (!cpu_active(cpu)) {
if (atomic_read(&nohz.load_balancer) != cpu)
- return 0;
+ return;
/*
* If we are going offline and still the leader,
* give up!
*/
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
- return 0;
+ return;
}
- cpumask_set_cpu(cpu, nohz.cpu_mask);
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- /* time for ilb owner also to sleep */
- if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
- if (atomic_read(&nohz.load_balancer) == cpu)
- atomic_set(&nohz.load_balancer, -1);
- return 0;
- }
+ if (atomic_read(&nohz.first_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+ if (atomic_read(&nohz.second_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
- if (atomic_read(&nohz.load_balancer) == -1) {
- /* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
- return 1;
- } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
int new_ilb;
- if (!(sched_smt_power_savings ||
- sched_mc_power_savings))
- return 1;
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+ cpu) != nr_cpu_ids)
+ return;
+
/*
* Check to see if there is a more power-efficient
* ilb.
*/
new_ilb = find_new_ilb(cpu);
if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, -1);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
resched_cpu(new_ilb);
- return 0;
+ return;
}
- return 1;
+ return;
}
} else {
- if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
- return 0;
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return;
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
}
- return 0;
+ return;
}
#endif
static DEFINE_SPINLOCK(balancing);
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+static void update_max_interval(void)
+{
+ max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
int update_next_balance = 0;
int need_serialize;
+ update_shares(cpu);
+
+ rcu_read_lock();
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
- if (unlikely(!interval))
- interval = 1;
- if (interval > HZ*NR_CPUS/10)
- interval = HZ*NR_CPUS/10;
+ interval = clamp(interval, 1UL, max_load_balance_interval);
need_serialize = sd->flags & SD_SERIALIZE;
if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
- * longer idle, or one of our SMT siblings is
- * not idle.
+ * longer idle.
*/
idle = CPU_NOT_IDLE;
}
if (!balance)
break;
}
+ rcu_read_unlock();
/*
* next_balance will be updated only when there is a need.
rq->next_balance = next_balance;
}
+#ifdef CONFIG_NO_HZ
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+ struct rq *this_rq = cpu_rq(this_cpu);
+ struct rq *rq;
+ int balance_cpu;
+
+ if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+ return;
+
+ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ if (balance_cpu == this_cpu)
+ continue;
+
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched()) {
+ this_rq->nohz_balance_kick = 0;
+ break;
+ }
+
+ raw_spin_lock_irq(&this_rq->lock);
+ update_rq_clock(this_rq);
+ update_cpu_load(this_rq);
+ raw_spin_unlock_irq(&this_rq->lock);
+
+ rebalance_domains(balance_cpu, CPU_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(this_rq->next_balance, rq->next_balance))
+ this_rq->next_balance = rq->next_balance;
+ }
+ nohz.next_balance = this_rq->next_balance;
+ this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ * idle load balancer when it has more than one process active. This
+ * eliminates the need for idle load balancing altogether when we have
+ * only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ * to run for active_load_balance to happen (i.e., two busy CPUs are
+ * SMT or core siblings and can run better if they move to different
+ * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ * which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+ unsigned long now = jiffies;
+ int ret;
+ int first_pick_cpu, second_pick_cpu;
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (rq->idle_at_tick)
+ return 0;
+
+ first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+ second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+ if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+ second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+ return 0;
+
+ ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ if (rq->nr_running > 1)
+ return 1;
+ } else {
+ ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ if (rq->nr_running)
+ return 1;
+ }
+ }
+ return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
rebalance_domains(this_cpu, idle);
-#ifdef CONFIG_NO_HZ
/*
- * If this cpu is the owner for idle load balancing, then do the
+ * If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
- if (this_rq->idle_at_tick &&
- atomic_read(&nohz.load_balancer) == this_cpu) {
- struct rq *rq;
- int balance_cpu;
-
- for_each_cpu(balance_cpu, nohz.cpu_mask) {
- if (balance_cpu == this_cpu)
- continue;
-
- /*
- * If this cpu gets work to do, stop the load balancing
- * work being done for other cpus. Next load
- * balancing owner will pick it up.
- */
- if (need_resched())
- break;
-
- rebalance_domains(balance_cpu, CPU_IDLE);
-
- rq = cpu_rq(balance_cpu);
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
- }
- }
-#endif
+ nohz_idle_balance(this_cpu, idle);
}
static inline int on_null_domain(int cpu)
{
- return !rcu_dereference(cpu_rq(cpu)->sd);
+ return !rcu_dereference_sched(cpu_rq(cpu)->sd);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
*/
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
-#ifdef CONFIG_NO_HZ
- /*
- * If we were in the nohz mode recently and busy at the current
- * scheduler tick, then check if we need to nominate new idle
- * load balancer.
- */
- if (rq->in_nohz_recently && !rq->idle_at_tick) {
- rq->in_nohz_recently = 0;
-
- if (atomic_read(&nohz.load_balancer) == cpu) {
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
- atomic_set(&nohz.load_balancer, -1);
- }
-
- if (atomic_read(&nohz.load_balancer) == -1) {
- int ilb = find_new_ilb(cpu);
-
- if (ilb < nr_cpu_ids)
- resched_cpu(ilb);
- }
- }
-
- /*
- * If this cpu is idle and doing idle load balancing for all the
- * cpus with ticks stopped, is it time for that to stop?
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
- resched_cpu(cpu);
- return;
- }
-
- /*
- * If this cpu is idle and the idle load balancing is done by
- * someone else, then no need raise the SCHED_SOFTIRQ
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- cpumask_test_cpu(cpu, nohz.cpu_mask))
- return;
-#endif
/* Don't need to rebalance while attached to NULL domain */
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+ else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ nohz_balancer_kick(cpu);
+#endif
}
static void rq_online_fair(struct rq *rq)
raw_spin_lock_irqsave(&rq->lock, flags);
- if (unlikely(task_cpu(p) != this_cpu))
+ update_rq_clock(rq);
+
+ if (unlikely(task_cpu(p) != this_cpu)) {
+ rcu_read_lock();
__set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
+ }
update_curr(cfs_rq);
* Priority of the task has changed. Check to see if we preempt
* the current task.
*/
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
+ if (!p->se.on_rq)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (running) {
+ if (rq->curr == p) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p, 0);
}
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Ensure the task's vruntime is normalized, so that when its
+ * switched back to the fair class the enqueue_entity(.flags=0) will
+ * do the right thing.
+ *
+ * If it was on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it was !on_rq, then only when
+ * the task is sleeping will it still have non-normalized vruntime.
+ */
+ if (!se->on_rq && p->state != TASK_RUNNING) {
+ /*
+ * Fix up our vruntime so that the current sleep doesn't
+ * cause 'unlimited' sleep bonus.
+ */
+ place_entity(cfs_rq, se, 0);
+ se->vruntime -= cfs_rq->min_vruntime;
+ }
+}
+
/*
* We switched to the sched_fair class.
*/
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ if (!p->se.on_rq)
+ return;
+
/*
* We were most likely switched from sched_rt, so
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (running)
+ if (rq->curr == p)
resched_task(rq->curr);
else
check_preempt_curr(rq, p, 0);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
- update_curr(cfs_rq);
+ /*
+ * If the task was not on the rq at the time of this cgroup movement
+ * it must have been asleep, sleeping tasks keep their ->vruntime
+ * absolute on their old rq until wakeup (needed for the fair sleeper
+ * bonus in place_entity()).
+ *
+ * If it was on the rq, we've just 'preempted' it, which does convert
+ * ->vruntime to a relative base.
+ *
+ * Make sure both cases convert their relative position when migrating
+ * to another cgroup's rq. This does somewhat interfere with the
+ * fair sleeper stuff for the first placement, but who cares.
+ */
+ if (!on_rq)
+ p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ set_task_rq(p, task_cpu(p));
if (!on_rq)
- place_entity(cfs_rq, &p->se, 1);
+ p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
#endif
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
+ .yield_to_task = yield_to_task_fair,
.check_preempt_curr = check_preempt_wakeup,
.task_fork = task_fork_fair,
.prio_changed = prio_changed_fair,
+ .switched_from = switched_from_fair,
.switched_to = switched_to_fair,
.get_rr_interval = get_rr_interval_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .moved_group = moved_group_fair,
+ .task_move_group = task_move_group_fair,
#endif
};