#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#include "sched_cpupri.h"
#include "workqueue_sched.h"
static inline int rt_policy(int policy)
{
- if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+ if (policy == SCHED_FIFO || policy == SCHED_RR)
return 1;
return 0;
}
#endif
/*
- * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * sched_domains_mutex serializes calls to init_sched_domains,
* detach_destroy_domains and partition_sched_domains.
*/
static DEFINE_MUTEX(sched_domains_mutex);
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
-#define MIN_SHARES 2
+#define MIN_SHARES (1UL << 1)
#define MAX_SHARES (1UL << 18)
static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
*/
struct sched_entity *curr, *next, *last, *skip;
+#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
+#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
*/
struct root_domain {
atomic_t refcount;
+ atomic_t rto_count;
+ struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
* one runnable RT task.
*/
cpumask_var_t rto_mask;
- atomic_t rto_count;
struct cpupri cpupri;
};
u64 nohz_stamp;
unsigned char nohz_balance_kick;
#endif
- unsigned int skip_clock_update;
+ int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif
/* calc_load related fields */
unsigned long calc_load_update;
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
+
+#ifdef CONFIG_SMP
+ struct task_struct *wake_list;
+#endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
- rcu_read_lock_sched_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
- * holds that lock for each task it moves into the cgroup. Therefore
- * by holding that lock, we pin the task to the current cgroup.
+ * We use task_subsys_state_check() and extend the RCU verification with
+ * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * task it moves into the cgroup. Therefore by holding either of those locks,
+ * we pin the task to the current cgroup.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock));
+ lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock));
tg = container_of(css, struct task_group, css);
return autogroup_task_group(p, tg);
{
s64 delta;
- if (rq->skip_clock_update)
+ if (rq->skip_clock_update > 0)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
int i;
struct sched_domain *sd;
+ rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd))
- if (!idle_cpu(i))
- return i;
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (!idle_cpu(i)) {
+ cpu = i;
+ goto unlock;
+ }
+ }
}
+unlock:
+ rcu_read_unlock();
return cpu;
}
/*
{
u64 tmp;
+ /*
+ * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+ * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+ * 2^SCHED_LOAD_RESOLUTION.
+ */
+ if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+ tmp = (u64)delta_exec * scale_load_down(weight);
+ else
+ tmp = (u64)delta_exec;
+
if (!lw->inv_weight) {
- if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+ unsigned long w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
else
- lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
- / (lw->weight+1);
+ lw->inv_weight = WMULT_CONST / w;
}
- tmp = (u64)delta_exec * weight;
/*
* Check whether we'd overflow the 64-bit multiplication:
*/
return rq->avg_load_per_task;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
- unsigned long load;
- long cpu = (long)data;
-
- if (!tg->parent) {
- load = cpu_rq(cpu)->load.weight;
- } else {
- load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->se[cpu]->load.weight;
- load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
- }
-
- tg->cfs_rq[cpu]->h_load = load;
-
- return 0;
-}
-
-static void update_h_load(long cpu)
-{
- walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
#ifdef CONFIG_PREEMPT
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
static void set_load_weight(struct task_struct *p)
{
+ int prio = p->static_prio - MAX_RT_PRIO;
+ struct load_weight *load = &p->se.load;
+
/*
* SCHED_IDLE tasks get minimal weight:
*/
if (p->policy == SCHED_IDLE) {
- p->se.load.weight = WEIGHT_IDLEPRIO;
- p->se.load.inv_weight = WMULT_IDLEPRIO;
+ load->weight = scale_load(WEIGHT_IDLEPRIO);
+ load->inv_weight = WMULT_IDLEPRIO;
return;
}
- p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
- p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+ load->weight = scale_load(prio_to_weight[prio]);
+ load->inv_weight = prio_to_wmult[prio];
}
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
}
EXPORT_SYMBOL_GPL(account_system_vtime);
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
{
- s64 irq_delta;
+ if (unlikely(steal > NSEC_PER_SEC))
+ return div_u64(steal, TICK_NSEC);
+
+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
/*
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_branch((¶virt_steal_rq_enabled))) {
+ u64 st;
+
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ st = steal_ticks(steal);
+ steal = st * TICK_NSEC;
+
+ rq->prev_steal_time_rq += steal;
+
+ delta -= steal;
+ }
+#endif
+
rq->clock_task += delta;
- if (irq_delta && sched_feat(NONIRQ_POWER))
- sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ sched_rt_avg_update(rq, irq_delta + steal);
+#endif
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int irqtime_account_hi_update(void)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
#define sched_clock_irqtime (0)
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
- rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
#include "sched_idletask.c"
#include "sched_fair.c"
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
#ifdef CONFIG_LOCKDEP
+ /*
+ * The caller should hold either p->pi_lock or rq->lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+ *
+ * sched_move_task() holds both and thus holding either pins the cgroup,
+ * see set_task_rq().
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_rq_lock().
+ */
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
if (task_cpu(p) != new_cpu) {
p->se.nr_migrations++;
- perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+ perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
__set_task_cpu(p, new_cpu);
static int migration_cpu_stop(void *data);
-/*
- * The task's runqueue lock must be held.
- * Returns true if you have to wait for migration thread.
- */
-static bool need_migrate_task(struct task_struct *p)
-{
- /*
- * If the task is not on a runqueue (and not running), then
- * the next wake-up will properly place the task.
- */
- bool running = p->on_rq || p->on_cpu;
- smp_rmb(); /* finish_lock_switch() */
- return running;
-}
-
/*
* wait_task_inactive - wait for a thread to unschedule.
*
#endif
static void
-ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags)
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
+ struct rq *rq = this_rq();
+
#ifdef CONFIG_SMP
int this_cpu = smp_processor_id();
struct sched_domain *sd;
schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ rcu_read_lock();
for_each_domain(this_cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
schedstat_inc(sd, ttwu_wake_remote);
break;
}
}
+ rcu_read_unlock();
}
+
+ if (wake_flags & WF_MIGRATED)
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
#endif /* CONFIG_SMP */
schedstat_inc(rq, ttwu_count);
if (wake_flags & WF_SYNC)
schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (cpu != task_cpu(p))
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
#endif /* CONFIG_SCHEDSTATS */
}
wq_worker_waking_up(p, cpu_of(rq));
}
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
static void
-ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags)
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
- if (unlikely(rq->idle_stamp)) {
+ if (rq->idle_stamp) {
u64 delta = rq->clock - rq->idle_stamp;
u64 max = 2*sysctl_sched_migration_cost;
#endif
}
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+#endif
+
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+ ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_rq) {
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_do_pending(struct task_struct *list)
+{
+ struct rq *rq = this_rq();
+
+ raw_spin_lock(&rq->lock);
+
+ while (list) {
+ struct task_struct *p = list;
+ list = list->wake_entry;
+ ttwu_do_activate(rq, p, 0);
+ }
+
+ raw_spin_unlock(&rq->lock);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void scheduler_ipi(void)
+{
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ /*
+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+ * traditionally all their work was done from the interrupt return
+ * path. Now that we actually do some work, we need to make sure
+ * we do call them.
+ *
+ * Some archs already do call them, luckily irq_enter/exit nest
+ * properly.
+ *
+ * Arguably we should visit all archs and update all handlers,
+ * however a fair share of IPIs are still resched only so this would
+ * somewhat pessimize the simple resched case.
+ */
+ irq_enter();
+ sched_ttwu_do_pending(list);
+ irq_exit();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *next = rq->wake_list;
+
+ for (;;) {
+ struct task_struct *old = next;
+
+ p->wake_entry = next;
+ next = cmpxchg(&rq->wake_list, old, p);
+ if (next == old)
+ break;
+ }
+
+ if (!next)
+ smp_send_reschedule(cpu);
+}
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_cpu) {
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+ if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ sched_clock_cpu(cpu); /* sync clocks x-cpu */
+ ttwu_queue_remote(p, cpu);
+ return;
+ }
+#endif
+
+ raw_spin_lock(&rq->lock);
+ ttwu_do_activate(rq, p, 0);
+ raw_spin_unlock(&rq->lock);
+}
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
* Returns %true if @p was woken up, %false if it was already running
* or @state didn't match @p's state.
*/
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
- int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
- unsigned long en_flags = ENQUEUE_WAKEUP;
- struct rq *rq;
-
- this_cpu = get_cpu();
+ int cpu, success = 0;
smp_wmb();
raw_spin_lock_irqsave(&p->pi_lock, flags);
- rq = __task_rq_lock(p);
if (!(p->state & state))
goto out;
+ success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
- if (p->on_rq)
- goto out_running;
+ if (p->on_rq && ttwu_remote(p, wake_flags))
+ goto stat;
- orig_cpu = cpu;
#ifdef CONFIG_SMP
- if (unlikely(task_running(rq, p)))
- goto out_activate;
+ /*
+ * If the owning (remote) cpu is still in the middle of schedule() with
+ * this task as prev, wait until its done referencing the task.
+ */
+ while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ /*
+ * In case the architecture enables interrupts in
+ * context_switch(), we cannot busy wait, since that
+ * would lead to deadlocks when an interrupt hits and
+ * tries to wake up @prev. So bail and do a complete
+ * remote wakeup.
+ */
+ if (ttwu_activate_remote(p, wake_flags))
+ goto stat;
+#else
+ cpu_relax();
+#endif
+ }
+ /*
+ * Pairs with the smp_wmb() in finish_lock_switch().
+ */
+ smp_rmb();
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
- if (p->sched_class->task_waking) {
+ if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
- en_flags |= ENQUEUE_WAKING;
- }
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- if (cpu != orig_cpu)
+ if (task_cpu(p) != cpu) {
+ wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
- __task_rq_unlock(rq);
-
- rq = cpu_rq(cpu);
- raw_spin_lock(&rq->lock);
-
- /*
- * We migrated the task without holding either rq->lock, however
- * since the task is not on the task list itself, nobody else
- * will try and migrate the task, hence the rq should match the
- * cpu we just moved it to.
- */
- WARN_ON(task_cpu(p) != cpu);
- WARN_ON(p->state != TASK_WAKING);
-
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-
-out_activate:
+ }
#endif /* CONFIG_SMP */
- ttwu_activate(rq, p, en_flags);
-out_running:
- ttwu_post_activation(p, rq, wake_flags);
- ttwu_stat(rq, p, cpu, wake_flags);
- success = 1;
+
+ ttwu_queue(p, cpu);
+stat:
+ ttwu_stat(p, cpu, wake_flags);
out:
- __task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- put_cpu();
return success;
}
if (!p->on_rq)
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
- ttwu_post_activation(p, rq, 0);
- ttwu_stat(rq, p, smp_processor_id(), 0);
+ ttwu_do_wakeup(rq, p, 0);
+ ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
}
/*
* fork()/clone()-time setup:
*/
-void sched_fork(struct task_struct *p, int clone_flags)
+void sched_fork(struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+void wake_up_new_task(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
- int cpu __maybe_unused = get_cpu();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
- rq = task_rq_lock(p, &flags);
- p->state = TASK_WAKING;
-
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
- *
- * We set TASK_WAKING so that select_task_rq() can drop rq->lock
- * without people poking at ->cpus_allowed.
*/
- cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
- set_task_cpu(p, cpu);
-
- p->state = TASK_RUNNING;
- task_rq_unlock(rq, p, &flags);
+ set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = 1;
trace_sched_wakeup_new(p, true);
p->sched_class->task_woken(rq, p);
#endif
task_rq_unlock(rq, p, &flags);
- put_cpu();
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_disable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
- perf_event_task_sched_in(current);
+ perf_event_task_sched_in(prev, current);
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
{
struct task_struct *p = current;
unsigned long flags;
- struct rq *rq;
int dest_cpu;
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
- /*
- * select_task_rq() can race against ->cpus_allowed
- */
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- likely(cpu_active(dest_cpu)) && need_migrate_task(p)) {
+ if (likely(cpu_active(dest_cpu))) {
struct migration_arg arg = { p, dest_cpu };
- task_rq_unlock(rq, p, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
return;
}
unlock:
- task_rq_unlock(rq, p, &flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
#endif
return ns;
}
-/*
- * Return sum_exec_runtime for the thread group.
- * In case the task is currently running, return the sum plus current's
- * pending runtime that have not been accounted yet.
- *
- * Note that the thread group might have other running tasks as well,
- * so the return value not includes other pending runtime that other
- * running tasks might have.
- */
-unsigned long long thread_group_sched_runtime(struct task_struct *p)
-{
- struct task_cputime totals;
- unsigned long flags;
- struct rq *rq;
- u64 ns;
-
- rq = task_rq_lock(p, &flags);
- thread_group_cputime(p, &totals);
- ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, p, &flags);
-
- return ns;
-}
-
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
cpustat->idle = cputime64_add(cpustat->idle, cputime64);
}
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_branch(¶virt_steal_enabled)) {
+ u64 steal, st = 0;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ st = steal_ticks(steal);
+ this_rq()->prev_steal_time += st * TICK_NSEC;
+
+ account_steal_time(st);
+ return st;
+ }
+#endif
+ return false;
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ if (steal_account_process_tick())
+ return;
+
if (irqtime_account_hi_update()) {
cpustat->irq = cputime64_add(cpustat->irq, tmp);
} else if (irqtime_account_si_update()) {
return;
}
+ if (steal_account_process_tick())
+ return;
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
*/
void scheduler_tick(void)
{
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq(), sched_count);
-#ifdef CONFIG_SCHEDSTATS
- if (unlikely(prev->lock_depth >= 0)) {
- schedstat_inc(this_rq(), rq_sched_info.bkl_count);
- schedstat_inc(prev, sched_info.bkl_count);
- }
-#endif
}
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->on_rq)
+ if (prev->on_rq || rq->skip_clock_update < 0)
update_rq_clock(rq);
prev->sched_class->put_prev_task(rq, prev);
}
}
/*
- * schedule() is the main scheduler function.
+ * __schedule() is the main scheduler function.
*/
-asmlinkage void __sched schedule(void)
+static void __sched __schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
if (to_wakeup)
try_to_wake_up_local(to_wakeup);
}
-
- /*
- * If we are going to sleep and we have plugged IO
- * queued, make sure to submit it to avoid deadlocks.
- */
- if (blk_needs_flush_plug(prev)) {
- raw_spin_unlock(&rq->lock);
- blk_flush_plug(prev);
- raw_spin_lock(&rq->lock);
- }
}
switch_count = &prev->nvcsw;
}
if (need_resched())
goto need_resched;
}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+ if (!tsk->state)
+ return;
+ /*
+ * If we are going to sleep and we have plugged IO queued,
+ * make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(tsk))
+ blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+ sched_submit_work(tsk);
+ __schedule();
+}
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
- bool ret = false;
-
- rcu_read_lock();
if (lock->owner != owner)
- goto fail;
+ return false;
/*
* Ensure we emit the owner->on_cpu, dereference _after_ checking
*/
barrier();
- ret = owner->on_cpu;
-fail:
- rcu_read_unlock();
-
- return ret;
+ return owner->on_cpu;
}
/*
if (!sched_feat(OWNER_SPIN))
return 0;
+ rcu_read_lock();
while (owner_running(lock, owner)) {
if (need_resched())
- return 0;
+ break;
arch_mutex_cpu_relax();
}
+ rcu_read_unlock();
/*
- * If the owner changed to another task there is likely
- * heavy contention, stop spinning.
+ * We break out the loop above on need_resched() and when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
*/
- if (lock->owner)
- return 0;
-
- return 1;
+ return lock->owner == NULL;
}
#endif
do {
add_preempt_count_notrace(PREEMPT_ACTIVE);
- schedule();
+ __schedule();
sub_preempt_count_notrace(PREEMPT_ACTIVE);
/*
do {
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
- schedule();
+ __schedule();
local_irq_disable();
sub_preempt_count(PREEMPT_ACTIVE);
static void __cond_resched(void)
{
add_preempt_count(PREEMPT_ACTIVE);
- schedule();
+ __schedule();
sub_preempt_count(PREEMPT_ACTIVE);
}
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ do_set_cpus_allowed(idle, cpumask_of(cpu));
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the cpu isn't yet set to this cpu so the
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT)
- task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
-#else
task_thread_info(idle)->preempt_count = 0;
-#endif
+
/*
* The idle tasks have their own, simple scheduling class:
*/
}
#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class && p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+ else {
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ }
+}
+
/*
* This is how migration works:
*
rq = task_rq_lock(p, &flags);
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
+
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
goto out;
}
- if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
- !cpumask_equal(&p->cpus_allowed, new_mask))) {
+ if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
ret = -EINVAL;
goto out;
}
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
- else {
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
- }
+ do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (need_migrate_task(p)) {
+ if (p->on_rq) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &flags);
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DYING:
+ sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
#ifdef CONFIG_SMP
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
#ifdef CONFIG_SCHED_DEBUG
static __read_mostly int sched_domain_debug_enabled;
break;
}
- if (!group->cpu_power) {
+ if (!group->sgp->power) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->cpu_power != SCHED_LOAD_SCALE) {
+ if (group->sgp->power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
- group->cpu_power);
+ group->sgp->power);
}
group = group->next;
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
- cpumask_var_t groupmask;
int level = 0;
if (!sched_domain_debug_enabled)
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
- if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
- printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
- return;
- }
-
for (;;) {
- if (sched_domain_debug_one(sd, cpu, level, groupmask))
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
break;
level++;
sd = sd->parent;
if (!sd)
break;
}
- free_cpumask_var(groupmask);
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
return 1;
}
-static void free_rootdomain(struct root_domain *rd)
+static void free_rootdomain(struct rcu_head *rcu)
{
- synchronize_sched();
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
cpupri_cleanup(&rd->cpupri);
-
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
raw_spin_unlock_irqrestore(&rq->lock, flags);
if (old_rd)
- free_rootdomain(old_rd);
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
static int init_rootdomain(struct root_domain *rd)
return rd;
}
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+ struct sched_group *tmp, *first;
+
+ if (!sg)
+ return;
+
+ first = sg;
+ do {
+ tmp = sg->next;
+
+ if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+ kfree(sg->sgp);
+
+ kfree(sg);
+ sg = tmp;
+ } while (sg != first);
+}
+
+static void free_sched_domain(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+ /*
+ * If its an overlapping domain it has private groups, iterate and
+ * nuke them all.
+ */
+ if (sd->flags & SD_OVERLAP) {
+ free_sched_groups(sd->groups, 1);
+ } else if (atomic_dec_and_test(&sd->groups->ref)) {
+ kfree(sd->groups->sgp);
+ kfree(sd->groups);
+ }
+ kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+ call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+ for (; sd; sd = sd->parent)
+ destroy_sched_domain(sd, cpu);
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
- for (tmp = sd; tmp; tmp = tmp->parent)
- tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
-
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
+ destroy_sched_domain(parent, cpu);
} else
tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
+ tmp = sd;
sd = sd->parent;
+ destroy_sched_domain(tmp, cpu);
if (sd)
sd->child = NULL;
}
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
+ tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp, cpu);
}
/* cpus with isolated domains */
__setup("isolcpus=", isolated_cpu_setup);
-/*
- * init_sched_build_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
- * init_sched_build_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
- */
-static void
-init_sched_build_groups(const struct cpumask *span,
- const struct cpumask *cpu_map,
- int (*group_fn)(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *tmpmask),
- struct cpumask *covered, struct cpumask *tmpmask)
-{
- struct sched_group *first = NULL, *last = NULL;
- int i;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct sched_group *sg;
- int group = group_fn(i, cpu_map, &sg, tmpmask);
- int j;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- cpumask_clear(sched_group_cpus(sg));
- sg->cpu_power = 0;
-
- for_each_cpu(j, span) {
- if (group_fn(j, cpu_map, NULL, tmpmask) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- }
- last->next = first;
-}
-
#define SD_NODES_PER_DOMAIN 16
#ifdef CONFIG_NUMA
*/
static int find_next_best_node(int node, nodemask_t *used_nodes)
{
- int i, n, val, min_val, best_node = 0;
+ int i, n, val, min_val, best_node = -1;
min_val = INT_MAX;
}
}
- node_set(best_node, *used_nodes);
+ if (best_node != -1)
+ node_set(best_node, *used_nodes);
return best_node;
}
for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
int next_node = find_next_best_node(node, &used_nodes);
-
+ if (next_node < 0)
+ break;
cpumask_or(span, span, cpumask_of_node(next_node));
}
}
+
+static const struct cpumask *cpu_node_mask(int cpu)
+{
+ lockdep_assert_held(&sched_domains_mutex);
+
+ sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+
+ return sched_domains_tmpmask;
+}
+
+static const struct cpumask *cpu_allnodes_mask(int cpu)
+{
+ return cpu_possible_mask;
+}
#endif /* CONFIG_NUMA */
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+static const struct cpumask *cpu_cpu_mask(int cpu)
+{
+ return cpumask_of_node(cpu_to_node(cpu));
+}
-/*
- * The cpus mask in sched_group and sched_domain hangs off the end.
- *
- * ( See the the comments in include/linux/sched.h:struct sched_group
- * and struct sched_domain. )
- */
-struct static_sched_group {
- struct sched_group sg;
- DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
-};
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-struct static_sched_domain {
- struct sched_domain sd;
- DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+struct sd_data {
+ struct sched_domain **__percpu sd;
+ struct sched_group **__percpu sg;
+ struct sched_group_power **__percpu sgp;
};
struct s_data {
-#ifdef CONFIG_NUMA
- int sd_allnodes;
- cpumask_var_t domainspan;
- cpumask_var_t covered;
- cpumask_var_t notcovered;
-#endif
- cpumask_var_t nodemask;
- cpumask_var_t this_sibling_map;
- cpumask_var_t this_core_map;
- cpumask_var_t this_book_map;
- cpumask_var_t send_covered;
- cpumask_var_t tmpmask;
- struct sched_group **sched_group_nodes;
+ struct sched_domain ** __percpu sd;
struct root_domain *rd;
};
enum s_alloc {
- sa_sched_groups = 0,
sa_rootdomain,
- sa_tmpmask,
- sa_send_covered,
- sa_this_book_map,
- sa_this_core_map,
- sa_this_sibling_map,
- sa_nodemask,
- sa_sched_group_nodes,
-#ifdef CONFIG_NUMA
- sa_notcovered,
- sa_covered,
- sa_domainspan,
-#endif
+ sa_sd,
+ sa_sd_storage,
sa_none,
};
-/*
- * SMT sched-domains:
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
+struct sched_domain_topology_level;
-static int
-cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *unused)
-{
- if (sg)
- *sg = &per_cpu(sched_groups, cpu).sg;
- return cpu;
-}
-#endif /* CONFIG_SCHED_SMT */
+typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-/*
- * multi-core sched-domains:
- */
-#ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
+#define SDTL_OVERLAP 0x01
+
+struct sched_domain_topology_level {
+ sched_domain_init_f init;
+ sched_domain_mask_f mask;
+ int flags;
+ struct sd_data data;
+};
static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
- int group;
-#ifdef CONFIG_SCHED_SMT
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_core, group).sg;
- return group;
-}
-#endif /* CONFIG_SCHED_MC */
+ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *child;
+ int i;
-/*
- * book sched-domains:
- */
-#ifdef CONFIG_SCHED_BOOK
-static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+ cpumask_clear(covered);
-static int
-cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group = cpu;
-#ifdef CONFIG_SCHED_MC
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_book, group).sg;
- return group;
-}
-#endif /* CONFIG_SCHED_BOOK */
+ for_each_cpu(i, span) {
+ struct cpumask *sg_span;
-static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
+ if (cpumask_test_cpu(i, covered))
+ continue;
-static int
-cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group;
-#ifdef CONFIG_SCHED_BOOK
- cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_MC)
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_phys, group).sg;
- return group;
-}
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(i));
-#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
-static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
-static struct sched_group ***sched_group_nodes_bycpu;
+ if (!sg)
+ goto fail;
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
+ sg_span = sched_group_cpus(sg);
-static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *nodemask)
-{
- int group;
+ child = *per_cpu_ptr(sdd->sd, i);
+ if (child->child) {
+ child = child->child;
+ cpumask_copy(sg_span, sched_domain_span(child));
+ } else
+ cpumask_set_cpu(i, sg_span);
- cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
- group = cpumask_first(nodemask);
+ cpumask_or(covered, covered, sg_span);
- if (sg)
- *sg = &per_cpu(sched_group_allnodes, group).sg;
- return group;
-}
+ sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ atomic_inc(&sg->sgp->ref);
-static void init_numa_sched_groups_power(struct sched_group *group_head)
-{
- struct sched_group *sg = group_head;
- int j;
+ if (cpumask_test_cpu(cpu, sg_span))
+ groups = sg;
- if (!sg)
- return;
- do {
- for_each_cpu(j, sched_group_cpus(sg)) {
- struct sched_domain *sd;
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = groups;
- sd = &per_cpu(phys_domains, j).sd;
- if (j != group_first_cpu(sd->groups)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
+ return 0;
- sg->cpu_power += sd->groups->cpu_power;
- }
- sg = sg->next;
- } while (sg != group_head);
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
}
-static int build_numa_sched_groups(struct s_data *d,
- const struct cpumask *cpu_map, int num)
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
- struct sched_domain *sd;
- struct sched_group *sg, *prev;
- int n, j;
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ struct sched_domain *child = sd->child;
- cpumask_clear(d->covered);
- cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
- if (cpumask_empty(d->nodemask)) {
- d->sched_group_nodes[num] = NULL;
- goto out;
+ if (child)
+ cpu = cpumask_first(sched_domain_span(child));
+
+ if (sg) {
+ *sg = *per_cpu_ptr(sdd->sg, cpu);
+ (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+ atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
}
- sched_domain_node_span(num, d->domainspan);
- cpumask_and(d->domainspan, d->domainspan, cpu_map);
+ return cpu;
+}
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING "Can not alloc domain group for node %d\n",
- num);
- return -ENOMEM;
- }
- d->sched_group_nodes[num] = sg;
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered;
+ int i;
- for_each_cpu(j, d->nodemask) {
- sd = &per_cpu(node_domains, j).sd;
- sd->groups = sg;
- }
+ get_group(cpu, sdd, &sd->groups);
+ atomic_inc(&sd->groups->ref);
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->nodemask);
- sg->next = sg;
- cpumask_or(d->covered, d->covered, d->nodemask);
+ if (cpu != cpumask_first(sched_domain_span(sd)))
+ return 0;
- prev = sg;
- for (j = 0; j < nr_node_ids; j++) {
- n = (num + j) % nr_node_ids;
- cpumask_complement(d->notcovered, d->covered);
- cpumask_and(d->tmpmask, d->notcovered, cpu_map);
- cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
- if (cpumask_empty(d->tmpmask))
- break;
- cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
- if (cpumask_empty(d->tmpmask))
- continue;
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- return -ENOMEM;
- }
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->tmpmask);
- sg->next = prev->next;
- cpumask_or(d->covered, d->covered, d->tmpmask);
- prev->next = sg;
- prev = sg;
- }
-out:
- return 0;
-}
-#endif /* CONFIG_NUMA */
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
-#ifdef CONFIG_NUMA
-/* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
- int cpu, i;
+ cpumask_clear(covered);
- for_each_cpu(cpu, cpu_map) {
- struct sched_group **sched_group_nodes
- = sched_group_nodes_bycpu[cpu];
+ for_each_cpu(i, span) {
+ struct sched_group *sg;
+ int group = get_group(i, sdd, &sg);
+ int j;
- if (!sched_group_nodes)
+ if (cpumask_test_cpu(i, covered))
continue;
- for (i = 0; i < nr_node_ids; i++) {
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ cpumask_clear(sched_group_cpus(sg));
+ sg->sgp->power = 0;
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask))
+ for_each_cpu(j, span) {
+ if (get_group(j, sdd, NULL) != group)
continue;
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
+ cpumask_set_cpu(j, covered);
+ cpumask_set_cpu(j, sched_group_cpus(sg));
}
- kfree(sched_group_nodes);
- sched_group_nodes_bycpu[cpu] = NULL;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
}
+ last->next = first;
+
+ return 0;
}
-#else /* !CONFIG_NUMA */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
-}
-#endif /* CONFIG_NUMA */
/*
* Initialize sched groups cpu_power.
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
- struct sched_domain *child;
- struct sched_group *group;
- long power;
- int weight;
-
- WARN_ON(!sd || !sd->groups);
-
- if (cpu != group_first_cpu(sd->groups))
- return;
+ struct sched_group *sg = sd->groups;
- sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+ WARN_ON(!sd || !sg);
- child = sd->child;
-
- sd->groups->cpu_power = 0;
+ do {
+ sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg = sg->next;
+ } while (sg != sd->groups);
- if (!child) {
- power = SCHED_LOAD_SCALE;
- weight = cpumask_weight(sched_domain_span(sd));
- /*
- * SMT siblings share the power of a single core.
- * Usually multiple threads get a better yield out of
- * that one core than a single thread would have,
- * reflect that in sd->smt_gain.
- */
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- power *= sd->smt_gain;
- power /= weight;
- power >>= SCHED_LOAD_SHIFT;
- }
- sd->groups->cpu_power += power;
+ if (cpu != group_first_cpu(sg))
return;
- }
- /*
- * Add cpu_power of each child group to this groups cpu_power.
- */
- group = child->groups;
- do {
- sd->groups->cpu_power += group->cpu_power;
- group = group->next;
- } while (group != child->groups);
+ update_group_power(sd, cpu);
}
/*
# define SD_INIT_NAME(sd, type) do { } while (0)
#endif
-#define SD_INIT(sd, type) sd_init_##type(sd)
-
-#define SD_INIT_FUNC(type) \
-static noinline void sd_init_##type(struct sched_domain *sd) \
-{ \
- memset(sd, 0, sizeof(*sd)); \
- *sd = SD_##type##_INIT; \
- sd->level = SD_LV_##type; \
- SD_INIT_NAME(sd, type); \
+#define SD_INIT_FUNC(type) \
+static noinline struct sched_domain * \
+sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
+{ \
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
+ *sd = SD_##type##_INIT; \
+ SD_INIT_NAME(sd, type); \
+ sd->private = &tl->data; \
+ return sd; \
}
SD_INIT_FUNC(CPU)
#endif
static int default_relax_domain_level = -1;
+int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
unsigned long val;
val = simple_strtoul(str, NULL, 0);
- if (val < SD_LV_MAX)
+ if (val < sched_domain_level_max)
default_relax_domain_level = val;
return 1;
}
}
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
{
switch (what) {
- case sa_sched_groups:
- free_sched_groups(cpu_map, d->tmpmask); /* fall through */
- d->sched_group_nodes = NULL;
case sa_rootdomain:
- free_rootdomain(d->rd); /* fall through */
- case sa_tmpmask:
- free_cpumask_var(d->tmpmask); /* fall through */
- case sa_send_covered:
- free_cpumask_var(d->send_covered); /* fall through */
- case sa_this_book_map:
- free_cpumask_var(d->this_book_map); /* fall through */
- case sa_this_core_map:
- free_cpumask_var(d->this_core_map); /* fall through */
- case sa_this_sibling_map:
- free_cpumask_var(d->this_sibling_map); /* fall through */
- case sa_nodemask:
- free_cpumask_var(d->nodemask); /* fall through */
- case sa_sched_group_nodes:
-#ifdef CONFIG_NUMA
- kfree(d->sched_group_nodes); /* fall through */
- case sa_notcovered:
- free_cpumask_var(d->notcovered); /* fall through */
- case sa_covered:
- free_cpumask_var(d->covered); /* fall through */
- case sa_domainspan:
- free_cpumask_var(d->domainspan); /* fall through */
-#endif
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd:
+ free_percpu(d->sd); /* fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map); /* fall through */
case sa_none:
break;
}
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map)
{
-#ifdef CONFIG_NUMA
- if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
- return sa_none;
- if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
- return sa_domainspan;
- if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
- return sa_covered;
- /* Allocate the per-node list of sched groups */
- d->sched_group_nodes = kcalloc(nr_node_ids,
- sizeof(struct sched_group *), GFP_KERNEL);
- if (!d->sched_group_nodes) {
- printk(KERN_WARNING "Can not alloc sched group node list\n");
- return sa_notcovered;
- }
- sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
-#endif
- if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
- return sa_sched_group_nodes;
- if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
- return sa_nodemask;
- if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
- return sa_this_sibling_map;
- if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
- return sa_this_core_map;
- if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
- return sa_this_book_map;
- if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
- return sa_send_covered;
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
d->rd = alloc_rootdomain();
- if (!d->rd) {
- printk(KERN_WARNING "Cannot alloc root domain\n");
- return sa_tmpmask;
- }
+ if (!d->rd)
+ return sa_sd;
return sa_rootdomain;
}
-static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
{
- struct sched_domain *sd = NULL;
-#ifdef CONFIG_NUMA
- struct sched_domain *parent;
-
- d->sd_allnodes = 0;
- if (cpumask_weight(cpu_map) >
- SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
- sd = &per_cpu(allnodes_domains, i).sd;
- SD_INIT(sd, ALLNODES);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), cpu_map);
- cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
- d->sd_allnodes = 1;
- }
- parent = sd;
-
- sd = &per_cpu(node_domains, i).sd;
- SD_INIT(sd, NODE);
- set_domain_attribute(sd, attr);
- sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
- sd->parent = parent;
- if (parent)
- parent->child = sd;
- cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
-#endif
- return sd;
-}
+ struct sd_data *sdd = sd->private;
-static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd;
- sd = &per_cpu(phys_domains, i).sd;
- SD_INIT(sd, CPU);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), d->nodemask);
- sd->parent = parent;
- if (parent)
- parent->child = sd;
- cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
- return sd;
-}
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
-static struct sched_domain *__build_book_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_BOOK
- sd = &per_cpu(book_domains, i).sd;
- SD_INIT(sd, BOOK);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
- return sd;
-}
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
-static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_MC
- sd = &per_cpu(core_domains, i).sd;
- SD_INIT(sd, MC);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
- return sd;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+ *per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
-static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
- SD_INIT(sd, SIBLING);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
- return sd;
+static const struct cpumask *cpu_smt_mask(int cpu)
+{
+ return topology_thread_cpumask(cpu);
}
+#endif
-static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
- const struct cpumask *cpu_map, int cpu)
-{
- switch (l) {
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- case SD_LV_SIBLING: /* set up CPU (sibling) groups */
- cpumask_and(d->this_sibling_map, cpu_map,
- topology_thread_cpumask(cpu));
- if (cpu == cpumask_first(d->this_sibling_map))
- init_sched_build_groups(d->this_sibling_map, cpu_map,
- &cpu_to_cpu_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
- case SD_LV_MC: /* set up multi-core groups */
- cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
- if (cpu == cpumask_first(d->this_core_map))
- init_sched_build_groups(d->this_core_map, cpu_map,
- &cpu_to_core_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_MC, cpu_coregroup_mask, },
#endif
#ifdef CONFIG_SCHED_BOOK
- case SD_LV_BOOK: /* set up book groups */
- cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
- if (cpu == cpumask_first(d->this_book_map))
- init_sched_build_groups(d->this_book_map, cpu_map,
- &cpu_to_book_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_BOOK, cpu_book_mask, },
#endif
- case SD_LV_CPU: /* set up physical groups */
- cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
- if (!cpumask_empty(d->nodemask))
- init_sched_build_groups(d->nodemask, cpu_map,
- &cpu_to_phys_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_CPU, cpu_cpu_mask, },
#ifdef CONFIG_NUMA
- case SD_LV_ALLNODES:
- init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
+ { sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
- default:
- break;
+ { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for (tl = sched_domain_topology; tl->init; tl++) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sg = alloc_percpu(struct sched_group *);
+ if (!sdd->sg)
+ return -ENOMEM;
+
+ sdd->sgp = alloc_percpu(struct sched_group_power *);
+ if (!sdd->sgp)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ struct sched_group_power *sgp;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgp = kzalloc_node(sizeof(struct sched_group_power),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgp)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sgp, j) = sgp;
+ }
}
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for (tl = sched_domain_topology; tl->init; tl++) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ kfree(*per_cpu_ptr(sdd->sgp, j));
+ }
+ free_percpu(sdd->sd);
+ free_percpu(sdd->sg);
+ free_percpu(sdd->sgp);
+ }
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ struct s_data *d, const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr, struct sched_domain *child,
+ int cpu)
+{
+ struct sched_domain *sd = tl->init(tl, cpu);
+ if (!sd)
+ return child;
+
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
+ }
+ sd->child = child;
+
+ return sd;
}
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-static int __build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
+static int build_sched_domains(const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr)
{
enum s_alloc alloc_state = sa_none;
- struct s_data d;
struct sched_domain *sd;
- int i;
-#ifdef CONFIG_NUMA
- d.sd_allnodes = 0;
-#endif
+ struct s_data d;
+ int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
- alloc_state = sa_sched_groups;
- /*
- * Set up domains for cpus specified by the cpu_map.
- */
+ /* Set up domains for cpus specified by the cpu_map. */
for_each_cpu(i, cpu_map) {
- cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
- cpu_map);
+ struct sched_domain_topology_level *tl;
+
+ sd = NULL;
+ for (tl = sched_domain_topology; tl->init; tl++) {
+ sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
- sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
- sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
- }
+ while (sd->child)
+ sd = sd->child;
- for_each_cpu(i, cpu_map) {
- build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
- build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
- build_sched_groups(&d, SD_LV_MC, cpu_map, i);
+ *per_cpu_ptr(d.sd, i) = sd;
}
- /* Set up physical groups */
- for (i = 0; i < nr_node_ids; i++)
- build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
-
-#ifdef CONFIG_NUMA
- /* Set up node groups */
- if (d.sd_allnodes)
- build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
-
- for (i = 0; i < nr_node_ids; i++)
- if (build_numa_sched_groups(&d, cpu_map, i))
- goto error;
-#endif
-
- /* Calculate CPU power for physical packages and nodes */
-#ifdef CONFIG_SCHED_SMT
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(cpu_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-#ifdef CONFIG_SCHED_MC
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(core_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-#ifdef CONFIG_SCHED_BOOK
+ /* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
- sd = &per_cpu(book_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(phys_domains, i).sd;
- init_sched_groups_power(i, sd);
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
+ }
}
-#ifdef CONFIG_NUMA
- for (i = 0; i < nr_node_ids; i++)
- init_numa_sched_groups_power(d.sched_group_nodes[i]);
-
- if (d.sd_allnodes) {
- struct sched_group *sg;
+ /* Calculate CPU power for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
- cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- d.tmpmask);
- init_numa_sched_groups_power(sg);
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_power(i, sd);
+ }
}
-#endif
/* Attach the domains */
+ rcu_read_lock();
for_each_cpu(i, cpu_map) {
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
-#elif defined(CONFIG_SCHED_MC)
- sd = &per_cpu(core_domains, i).sd;
-#elif defined(CONFIG_SCHED_BOOK)
- sd = &per_cpu(book_domains, i).sd;
-#else
- sd = &per_cpu(phys_domains, i).sd;
-#endif
+ sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
+ rcu_read_unlock();
- d.sched_group_nodes = NULL; /* don't free this we still need it */
- __free_domain_allocs(&d, sa_tmpmask, cpu_map);
- return 0;
-
+ ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
- return -ENOMEM;
-}
-
-static int build_sched_domains(const struct cpumask *cpu_map)
-{
- return __build_sched_domains(cpu_map, NULL);
+ return ret;
}
static cpumask_var_t *doms_cur; /* current sched domains */
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
*/
-static int arch_init_sched_domains(const struct cpumask *cpu_map)
+static int init_sched_domains(const struct cpumask *cpu_map)
{
int err;
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
dattr_cur = NULL;
- err = build_sched_domains(doms_cur[0]);
+ err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
return err;
}
-static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
- struct cpumask *tmpmask)
-{
- free_sched_groups(cpu_map, tmpmask);
-}
-
/*
* Detach sched domains from a group of cpus specified in cpu_map
* These cpus will now be attached to the NULL domain
*/
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
- /* Save because hotplug lock held. */
- static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
int i;
+ rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
- synchronize_sched();
- arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
+ rcu_read_unlock();
}
/* handle null as "default" */
goto match2;
}
/* no match - add a new doms_new */
- __build_sched_domains(doms_new[i],
- dattr_new ? dattr_new + i : NULL);
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
;
}
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void arch_reinit_sched_domains(void)
+static void reinit_sched_domains(void)
{
get_online_cpus();
else
sched_mc_power_savings = level;
- arch_reinit_sched_domains();
+ reinit_sched_domains();
return count;
}
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-#if defined(CONFIG_NUMA)
- sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
- GFP_KERNEL);
- BUG_ON(sched_group_nodes_bycpu == NULL);
-#endif
get_online_cpus();
mutex_lock(&sched_domains_mutex);
- arch_init_sched_domains(cpu_active_mask);
+ init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
&& addr < (unsigned long)__sched_text_end);
}
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT;
INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq->rq = rq;
- /* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
- cfs_rq->load_stamp = 1;
-#endif
-#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
}
static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
- plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
+ plist_head_init(&rt_rq->pushable_tasks);
#endif
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
rt_rq->rt_runtime = 0;
raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
- rt_rq->rt_nr_boosted = 0;
- rt_rq->rq = rq;
-#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
- tg->cfs_rq[cpu] = cfs_rq;
- init_cfs_rq(cfs_rq, rq);
+
cfs_rq->tg = tg;
+ cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+ /* allow initial update_cfs_load() to truncate */
+ cfs_rq->load_stamp = 1;
+#endif
+ tg->cfs_rq[cpu] = cfs_rq;
tg->se[cpu] = se;
+
/* se could be NULL for root_task_group */
if (!se)
return;
{
struct rq *rq = cpu_rq(cpu);
- tg->rt_rq[cpu] = rt_rq;
- init_rt_rq(rt_rq, rq);
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->rt_nr_boosted = 0;
+ rt_rq->rq = rq;
rt_rq->tg = tg;
- rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
+ tg->rt_rq[cpu] = rt_rq;
tg->rt_se[cpu] = rt_se;
+
if (!rt_se)
return;
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
- init_cfs_rq(&rq->cfs, rq);
+ init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = root_task_group_load;
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_LOAD_SCALE;
+ rq->cpu_power = SCHED_POWER_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
#endif
#ifdef CONFIG_RT_MUTEXES
- plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
+ plist_head_init(&init_task.pi_waiters);
#endif
/*
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
scheduler_running = 1;
}
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
return (nested == preempt_offset);
}
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+ __might_sleep_init_called = 1;
+ return 0;
+}
+early_initcall(__might_sleep_init);
+
void __might_sleep(const char *file, int line, int preempt_offset)
{
-#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ oops_in_progress)
+ return;
+ if (system_state != SYSTEM_RUNNING &&
+ (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
if (irqs_disabled())
print_irqtrace_events(current);
dump_stack();
-#endif
}
EXPORT_SYMBOL(__might_sleep);
#endif
if (!se)
goto err_free_rq;
+ init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
}
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
{
}
{
int i;
- destroy_rt_bandwidth(&tg->rt_bandwidth);
+ if (tg->rt_se)
+ destroy_rt_bandwidth(&tg->rt_bandwidth);
for_each_possible_cpu(i) {
if (tg->rt_rq)
{
struct rt_rq *rt_rq;
struct sched_rt_entity *rt_se;
- struct rq *rq;
int i;
tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
ktime_to_ns(def_rt_bandwidth.rt_period), 0);
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
rt_rq = kzalloc_node(sizeof(struct rt_rq),
GFP_KERNEL, cpu_to_node(i));
if (!rt_rq)
if (!rt_se)
goto err_free_rq;
+ init_rt_rq(rt_rq, cpu_rq(i));
+ rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
if (!tg->se[0])
return -EINVAL;
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
mutex_lock(&shares_mutex);
if (tg->shares == shares)
return 0;
}
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup)
-{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
- return 0;
-}
-
static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk,
- bool threadgroup)
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
sched_move_task(tsk);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- sched_move_task(c);
- }
- rcu_read_unlock();
- }
}
static void
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+ return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
}
static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
struct task_group *tg = cgroup_tg(cgrp);
- return (u64) tg->shares;
+ return (u64) scale_load_down(tg->shares);
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach = cpu_cgroup_can_attach,
- .attach = cpu_cgroup_attach,
+ .can_attach_task = cpu_cgroup_can_attach_task,
+ .attach_task = cpu_cgroup_attach_task,
.exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,