#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
-#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
u64 exec_clock;
u64 min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+#endif
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next, *last;
+ struct sched_entity *curr, *next, *last, *skip;
unsigned int nr_spread_over;
struct task_group *tg;
struct cgroup_subsys_state *css;
- if (p->flags & PF_EXITING)
- return &root_task_group;
-
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
tg = container_of(css, struct task_group, css);
#endif
/**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
* @cpu: the processor in question.
*
- * Returns true if the current cpu runqueue is locked.
* This interface allows printk to be called with the runqueue lock
* held and know whether or not it is OK to wake up the klogd.
*/
return rq->curr == p;
}
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
return task_current(rq, p);
+#endif
}
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->on_cpu = 1;
+#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
- return p->oncpu;
-#else
- return task_current(rq, p);
-#endif
-}
-
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
* SMP rebalancing from interrupt is the only thing that cares
* here.
*/
- next->oncpu = 1;
+ next->on_cpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
raw_spin_unlock_irq(&rq->lock);
{
#ifdef CONFIG_SMP
/*
- * After ->oncpu is cleared, the task can be moved to a different CPU.
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
*/
smp_wmb();
- prev->oncpu = 0;
+ prev->on_cpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
__release(rq2->lock);
}
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ BUG_ON(rq1 != rq2);
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ BUG_ON(rq1 != rq2);
+ raw_spin_unlock(&rq1->lock);
+ __release(rq2->lock);
+}
+
#endif
static void calc_load_account_idle(struct rq *this_rq);
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
- p->se.on_rq = 1;
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, flags);
- p->se.on_rq = 0;
}
/*
*/
if (hardirq_count())
__this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
irq_time_write_end();
sched_rt_avg_update(rq, irq_delta);
}
+static int irqtime_account_hi_update(void)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_hardirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_softirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#define sched_clock_irqtime (0)
+
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
rq->clock_task += delta;
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
- int oldprio, int running)
+ int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
- prev_class->switched_from(rq, p, running);
- p->sched_class->switched_to(rq, p, running);
- } else
- p->sched_class->prio_changed(rq, p, oldprio, running);
+ prev_class->switched_from(rq, p);
+ p->sched_class->switched_to(rq, p);
+ } else if (oldprio != p->prio)
+ p->sched_class->prio_changed(rq, p, oldprio);
}
static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+ if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static bool migrate_task(struct task_struct *p, struct rq *rq)
+static bool need_migrate_task(struct task_struct *p)
{
/*
* If the task is not on a runqueue (and not running), then
* the next wake-up will properly place the task.
*/
- return p->se.on_rq || task_running(rq, p);
+ bool running = p->on_rq || p->on_cpu;
+ smp_rmb(); /* finish_lock_switch() */
+ return running;
}
/*
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
* Cause a process which is running on another CPU to enter
* kernel-mode, without any delay. (to get signals handled.)
*
- * NOTE: this function doesnt have to take the runqueue lock,
+ * NOTE: this function doesn't have to take the runqueue lock,
* because all it wants to ensure is that the remote task enters
* the kernel. If the IPI races and the task has been migrated
* to another CPU then no harm is done and the purpose has been
EXPORT_SYMBOL_GPL(kick_process);
#endif /* CONFIG_SMP */
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p: the task to evaluate
- * @func: the function to be called
- * @info: the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
- void (*func) (void *info), void *info)
-{
- int cpu;
-
- preempt_disable();
- cpu = task_cpu(p);
- if (task_curr(p))
- smp_call_function_single(cpu, func, info, 1);
- preempt_enable();
-}
-
#ifdef CONFIG_SMP
/*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
}
/*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
}
#endif
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
- bool is_sync, bool is_migrate, bool is_local,
- unsigned long en_flags)
+static void
+ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags)
{
+#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SMP
+ int this_cpu = smp_processor_id();
+
+ if (cpu == this_cpu) {
+ schedstat_inc(rq, ttwu_local);
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
+ } else {
+ struct sched_domain *sd;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ for_each_domain(this_cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ }
+#endif /* CONFIG_SMP */
+
+ schedstat_inc(rq, ttwu_count);
schedstat_inc(p, se.statistics.nr_wakeups);
- if (is_sync)
+
+ if (wake_flags & WF_SYNC)
schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (is_migrate)
+
+ if (cpu != task_cpu(p))
schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- if (is_local)
- schedstat_inc(p, se.statistics.nr_wakeups_local);
- else
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
activate_task(rq, p, en_flags);
+ p->on_rq = 1;
+
+ /* if a worker is waking up, notify workqueue */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_waking_up(p, cpu_of(rq));
}
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
- int wake_flags, bool success)
+static void
+ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags)
{
- trace_sched_wakeup(p, success);
+ trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
rq->idle_stamp = 0;
}
#endif
- /* if a worker is waking up, notify workqueue */
- if ((p->flags & PF_WQ_WORKER) && success)
- wq_worker_waking_up(p, cpu_of(rq));
}
/**
this_cpu = get_cpu();
smp_wmb();
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ rq = __task_rq_lock(p);
if (!(p->state & state))
goto out;
- if (p->se.on_rq)
+ cpu = task_cpu(p);
+
+ if (p->on_rq)
goto out_running;
- cpu = task_cpu(p);
orig_cpu = cpu;
-
#ifdef CONFIG_SMP
if (unlikely(task_running(rq, p)))
goto out_activate;
- /*
- * In order to handle concurrent wakeups and release the rq->lock
- * we put the task in TASK_WAKING state.
- *
- * First fix up the nr_uninterruptible count:
- */
- if (task_contributes_to_load(p)) {
- if (likely(cpu_online(orig_cpu)))
- rq->nr_uninterruptible--;
- else
- this_rq()->nr_uninterruptible--;
- }
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
if (p->sched_class->task_waking) {
- p->sched_class->task_waking(rq, p);
+ p->sched_class->task_waking(p);
en_flags |= ENQUEUE_WAKING;
}
- cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
if (cpu != orig_cpu)
set_task_cpu(p, cpu);
__task_rq_unlock(rq);
WARN_ON(task_cpu(p) != cpu);
WARN_ON(p->state != TASK_WAKING);
-#ifdef CONFIG_SCHEDSTATS
- schedstat_inc(rq, ttwu_count);
- if (cpu == this_cpu)
- schedstat_inc(rq, ttwu_local);
- else {
- struct sched_domain *sd;
- for_each_domain(this_cpu, sd) {
- if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd, ttwu_wake_remote);
- break;
- }
- }
- }
-#endif /* CONFIG_SCHEDSTATS */
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
out_activate:
#endif /* CONFIG_SMP */
- ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
- cpu == this_cpu, en_flags);
- success = 1;
+ ttwu_activate(rq, p, en_flags);
out_running:
- ttwu_post_activation(p, rq, wake_flags, success);
+ ttwu_post_activation(p, rq, wake_flags);
+ ttwu_stat(rq, p, cpu, wake_flags);
+ success = 1;
out:
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
put_cpu();
return success;
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
*
- * Put @p on the run-queue if it's not already there. The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task. this_rq() stays locked over invocation.
+ * the current task.
*/
static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- bool success = false;
BUG_ON(rq != this_rq());
BUG_ON(p == current);
lockdep_assert_held(&rq->lock);
+ if (!raw_spin_trylock(&p->pi_lock)) {
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ }
+
if (!(p->state & TASK_NORMAL))
- return;
+ goto out;
- if (!p->se.on_rq) {
- if (likely(!task_running(rq, p))) {
- schedstat_inc(rq, ttwu_count);
- schedstat_inc(rq, ttwu_local);
- }
- ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
- success = true;
- }
- ttwu_post_activation(p, rq, 0, success);
+ if (!p->on_rq)
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+ ttwu_post_activation(p, rq, 0);
+ ttwu_stat(rq, p, smp_processor_id(), 0);
+out:
+ raw_spin_unlock(&p->pi_lock);
}
/**
*/
static void __sched_fork(struct task_struct *p)
{
+ p->on_rq = 0;
+
+ p->se.on_rq = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
+ p->se.vruntime = 0;
+ INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
INIT_LIST_HEAD(&p->rt.run_list);
- p->se.on_rq = 0;
- INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- p->oncpu = 0;
+#if defined(CONFIG_SMP)
+ p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT
/* Want to start with kernel preemption disabled. */
* We set TASK_WAKING so that select_task_rq() can drop rq->lock
* without people poking at ->cpus_allowed.
*/
- cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
+ cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
set_task_cpu(p, cpu);
p->state = TASK_RUNNING;
rq = task_rq_lock(p, &flags);
activate_task(rq, p, 0);
- trace_sched_wakeup_new(p, 1);
+ p->on_rq = 1;
+ trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+ sched_info_switch(prev, next);
+ perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
+ trace_sched_switch(prev, next);
}
/**
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_sched_switch(prev, next);
+
mm = next->mm;
oldmm = prev->active_mm;
/*
int dest_cpu;
rq = task_rq_lock(p, &flags);
- dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
* select_task_rq() can race against ->cpus_allowed
*/
if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
+ likely(cpu_active(dest_cpu)) && need_migrate_task(p)) {
struct migration_arg arg = { p, dest_cpu };
task_rq_unlock(rq, &flags);
}
}
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+ cputime64_t tmp = cputime_to_cputime64(cputime);
+
+ /* Add system time to process. */
+ p->stime = cputime_add(p->stime, cputime);
+ p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+ cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+ /* Account for system time used */
+ acct_update_integrals(p);
+}
+
/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
cputime_t cputime, cputime_t cputime_scaled)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t tmp;
+ cputime64_t *target_cputime64;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
return;
}
- /* Add system time to process. */
- p->stime = cputime_add(p->stime, cputime);
- p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
- cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ target_cputime64 = &cpustat->irq;
else if (in_serving_softirq())
- cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ target_cputime64 = &cpustat->softirq;
else
- cpustat->system = cputime64_add(cpustat->system, tmp);
-
- cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ target_cputime64 = &cpustat->system;
- /* Account for system time used */
- acct_update_integrals(p);
+ __account_system_time(p, cputime, cputime_scaled, target_cputime64);
}
/*
* Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
*/
void account_steal_time(cputime_t cputime)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+ if (irqtime_account_hi_update()) {
+ cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ } else if (irqtime_account_si_update()) {
+ cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ } else if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->softirq);
+ } else if (user_tick) {
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime_one_jiffy);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else {
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->system);
+ }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+ int i;
+ struct rq *rq = this_rq();
+
+ for (i = 0; i < ticks; i++)
+ irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq);
+ return;
+ }
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
*/
void account_idle_ticks(unsigned long ticks)
{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
account_idle_time(jiffies_to_cputime(ticks));
}
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->se.on_rq)
+ if (prev->on_rq)
update_rq_clock(rq);
prev->sched_class->put_prev_task(rq, prev);
}
rcu_note_context_switch(cpu);
prev = rq->curr;
- release_kernel_lock(prev);
-need_resched_nonpreemptible:
-
schedule_debug(prev);
if (sched_feat(HRTICK))
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
+ deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ prev->on_rq = 0;
+
/*
- * If a worker is going to sleep, notify and
- * ask workqueue whether it wants to wake up a
- * task to maintain concurrency. If so, wake
- * up the task.
+ * If a worker went to sleep, notify and ask workqueue
+ * whether it wants to wake up a task to maintain
+ * concurrency.
*/
if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup;
if (to_wakeup)
try_to_wake_up_local(to_wakeup);
}
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
+
+ /*
+ * If we are going to sleep and we have plugged IO
+ * queued, make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(prev)) {
+ raw_spin_unlock(&rq->lock);
+ blk_flush_plug(prev);
+ raw_spin_lock(&rq->lock);
+ }
}
switch_count = &prev->nvcsw;
}
rq->skip_clock_update = 0;
if (likely(prev != next)) {
- sched_info_switch(prev, next);
- perf_event_task_sched_out(prev, next);
-
rq->nr_switches++;
rq->curr = next;
++*switch_count;
post_schedule(rq);
- if (unlikely(reacquire_kernel_lock(prev)))
- goto need_resched_nonpreemptible;
-
preempt_enable_no_resched();
if (need_resched())
goto need_resched;
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
-{
- unsigned int cpu;
- struct rq *rq;
- if (!sched_feat(OWNER_SPIN))
- return 0;
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+ bool ret = false;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * Need to access the cpu field knowing that
- * DEBUG_PAGEALLOC could have unmapped it if
- * the mutex owner just released it and exited.
- */
- if (probe_kernel_address(&owner->cpu, cpu))
- return 0;
-#else
- cpu = owner->cpu;
-#endif
+ rcu_read_lock();
+ if (lock->owner != owner)
+ goto fail;
/*
- * Even if the access succeeded (likely case),
- * the cpu field may no longer be valid.
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * lock->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
*/
- if (cpu >= nr_cpumask_bits)
- return 0;
+ barrier();
- /*
- * We need to validate that we can do a
- * get_cpu() and that we have the percpu area.
- */
- if (!cpu_online(cpu))
- return 0;
+ ret = owner->on_cpu;
+fail:
+ rcu_read_unlock();
- rq = cpu_rq(cpu);
+ return ret;
+}
- for (;;) {
- /*
- * Owner changed, break to re-assess state.
- */
- if (lock->owner != owner) {
- /*
- * If the lock has switched to a different owner,
- * we likely have heavy contention. Return 0 to quit
- * optimistic spinning and not contend further:
- */
- if (lock->owner)
- return 0;
- break;
- }
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+ if (!sched_feat(OWNER_SPIN))
+ return 0;
- /*
- * Is that owner really running on that cpu?
- */
- if (task_thread_info(rq->curr) != owner || need_resched())
+ while (owner_running(lock, owner)) {
+ if (need_resched())
return 0;
arch_mutex_cpu_relax();
}
+ /*
+ * If the owner changed to another task there is likely
+ * heavy contention, stop spinning.
+ */
+ if (lock->owner)
+ return 0;
+
return 1;
}
#endif
{
__wake_up_common(q, mode, 1, 0, key);
}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
BUG_ON(prio < 0 || prio > MAX_PRIO);
+ lockdep_assert_held(&p->pi_lock);
+
rq = task_rq_lock(p, &flags);
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
+ check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, &flags);
}
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
if (on_rq)
dequeue_task(rq, p, 0);
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
- BUG_ON(p->se.on_rq);
-
p->policy = policy;
p->rt_priority = prio;
p->normal_prio = normal_prio(p);
rcu_read_lock();
pcred = __task_cred(p);
- match = (cred->euid == pcred->euid ||
- cred->euid == pcred->uid);
+ if (cred->user->user_ns == pcred->user->user_ns)
+ match = (cred->euid == pcred->euid ||
+ cred->euid == pcred->uid);
+ else
+ match = false;
rcu_read_unlock();
return match;
}
param->sched_priority > rlim_rtprio)
return -EPERM;
}
+
/*
- * Like positive nice levels, dont allow tasks to
- * move out of SCHED_IDLE either:
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
- return -EPERM;
+ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+ if (!can_nice(p, TASK_NICE(p)))
+ return -EPERM;
+ }
/* can't change other user's priorities */
if (!check_same_owner(p))
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
/*
- * To be able to change p->policy safely, the apropriate
+ * To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
rq = __task_rq_lock(p);
return -EINVAL;
}
+ /*
+ * If not changing anything there's no need to proceed further:
+ */
+ if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+ param->sched_priority == p->rt_priority))) {
+
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return 0;
+ }
+
#ifdef CONFIG_RT_GROUP_SCHED
if (user) {
/*
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
deactivate_task(rq, p, 0);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
activate_task(rq, p, 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
+ check_class_changed(rq, p, prev_class, oldprio);
__task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+ if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p);
{
struct task_struct *p;
unsigned long flags;
- struct rq *rq;
int retval;
get_online_cpus();
if (retval)
goto out_unlock;
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
- task_rq_unlock(rq, &flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
}
EXPORT_SYMBOL(yield);
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ bool yielded = 0;
+
+ local_irq_save(flags);
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ double_rq_lock(rq, p_rq);
+ while (task_rq(p) != p_rq) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ if (!curr->sched_class->yield_to_task)
+ goto out;
+
+ if (curr->sched_class != p->sched_class)
+ goto out;
+
+ if (task_running(p_rq, p) || p->state)
+ goto out;
+
+ yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ if (yielded) {
+ schedstat_inc(rq, yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity takes care of
+ * fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_task(p_rq->curr);
+ }
+
+out:
+ double_rq_unlock(rq, p_rq);
+ local_irq_restore(flags);
+
+ if (yielded)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
schedule();
current->in_iowait = 0;
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
ret = schedule_timeout(timeout);
current->in_iowait = 0;
do_each_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
- * console might take alot of time:
+ * console might take a lot of time:
*/
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
rcu_read_unlock();
rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- idle->oncpu = 1;
+#if defined(CONFIG_SMP)
+ idle->on_cpu = 1;
#endif
raw_spin_unlock_irqrestore(&rq->lock, flags);
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
- ftrace_graph_init_task(idle);
+ ftrace_graph_init_idle_task(idle, cpu);
}
/*
unsigned int dest_cpu;
int ret = 0;
- /*
- * Serialize against TASK_WAKING so that ttwu() and wunt() can
- * drop the rq->lock and still rely on ->cpus_allowed.
- */
-again:
- while (task_is_waking(p))
- cpu_relax();
- rq = task_rq_lock(p, &flags);
- if (task_is_waking(p)) {
- task_rq_unlock(rq, &flags);
- goto again;
- }
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ rq = __task_rq_lock(p);
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (migrate_task(p, rq)) {
+ if (need_migrate_task(p)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
}
out:
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return ret;
}
* If we're not on a rq, the next wake-up will ensure we're
* placed properly.
*/
- if (p->se.on_rq) {
+ if (p->on_rq) {
deactivate_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
activate_task(rq_dest, p, 0);
break;
#endif
}
+
+ update_max_interval();
+
return NOTIFY_OK;
}
INIT_LIST_HEAD(&cfs_rq->tasks);
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq->rq = rq;
+ /* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+ cfs_rq->load_stamp = 1;
+#endif
#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
}
{
int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
- return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+ return (nested == preempt_offset);
}
void __might_sleep(const char *file, int line, int preempt_offset)
#ifdef CONFIG_MAGIC_SYSRQ
static void normalize_task(struct rq *rq, struct task_struct *p)
{
+ const struct sched_class *prev_class = p->sched_class;
+ int old_prio = p->prio;
int on_rq;
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
activate_task(rq, p, 0);
resched_task(rq->curr);
}
+
+ check_class_changed(rq, p, prev_class, old_prio);
}
void normalize_rt_tasks(void)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se;
- struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
tg->shares = NICE_0_LOAD;
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
rq = task_rq_lock(tsk, &flags);
running = task_current(rq, tsk);
- on_rq = tsk->se.on_rq;
+ on_rq = tsk->on_rq;
if (on_rq)
dequeue_task(rq, tsk, 0);
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se), 0);
+ update_cfs_shares(group_cfs_rq(se));
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
}
static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.