/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>

#include <asm/tlb.h>
#include <asm/unistd.h>

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is the default implementation.
 * Architectures and sub-architectures can override this.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	return (unsigned long long)jiffies * (1000000000 / HZ);
}
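
/*
 * Illustration (editor's note, not from the original source): with this
 * jiffies-based fallback the clock only advances once per tick, so its
 * resolution is 1,000,000 ns (1 ms) at HZ=1000 and 4 ms at HZ=250.
 * Architectures with a fine-grained cycle counter override this.
 */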

/*
 * The CPU frequency is/was unstable - start fresh by setting prev_clock_raw:
 */
void sched_clock_unstable_event(void)
{
}

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
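
/*
 * Example (editor's note, illustrative): with the usual MAX_RT_PRIO == 100
 * and MAX_PRIO == 140, NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120 and
 * NICE_TO_PRIO(19) == 139, i.e. the 40 nice levels map onto the static
 * priority range [100..139] directly above the RT priorities.
 */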

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
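
/*
 * Illustration (editor's note): at HZ=1000 one jiffy is 1,000,000 ns, so
 * NS_TO_JIFFIES(5000000) == 5 and JIFFIES_TO_NS(100) == 100,000,000 (100 ms).
 */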

#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 * Timeslices get refilled after they expire.
 */
#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE		(100 * HZ / 1000)
#define ON_RUNQUEUE_WEIGHT	 30
#define CHILD_PENALTY		 95
#define PARENT_PENALTY		100
#define EXIT_WEIGHT		  3
#define PRIO_BONUS_RATIO	 25
#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define INTERACTIVE_DELTA	  2
#define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))

/*
 * If a task is 'interactive' then we reinsert it in the active
 * array after it has expired its current timeslice. (it will not
 * continue to run immediately, it will still roundrobin with
 * other interactive tasks.)
 *
 * This part scales the interactivity limit depending on niceness.
 *
 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
 * Here are a few examples of different nice levels:
 *
 *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
 *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
 *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
 *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
 *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
 *
 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
 *  priority range a task can explore, a value of '1' means the
 *  task is rated interactive.)
 *
 * I.e. nice +19 tasks can never get 'interactive' enough to be
 * reinserted into the active array. And only heavily CPU-hog nice -20
 * tasks will be expired. Default nice 0 tasks are somewhere between,
 * it takes some effort for them to get interactive, but it's not
 * too hard.
 */

#define CURRENT_BONUS(p) \
	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
		MAX_SLEEP_AVG)

#define GRANULARITY	(10 * HZ / 1000 ? : 1)

#ifdef CONFIG_SMP
#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
			num_online_cpus())
#else
#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
#endif
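
/*
 * Worked example (editor's note, illustrative): with HZ=1000, GRANULARITY
 * is 10 jiffies. A maximally interactive task (CURRENT_BONUS == MAX_BONUS
 * == 10) on a 2-CPU box gets TIMESLICE_GRANULARITY == 10 * (1 << 0) * 2
 * == 20 jiffies, while a task with bonus 8 gets 10 * (1 << 1) * 2 == 40,
 * i.e. more interactive tasks are round-robined at a finer granularity.
 */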

#define SCALE(v1,v1_max,v2_max) \
	(v1) * (v2_max) / (v1_max)

#define DELTA(p) \
	(SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
		INTERACTIVE_DELTA)

#define TASK_INTERACTIVE(p) \
	((p)->prio <= (p)->static_prio - DELTA(p))

#define INTERACTIVE_SLEEP(p) \
	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))

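/*
 * Worked example (editor's note, illustrative): MAX_BONUS is
 * 40 * 25 / 100 == 10, so for a nice 0 task DELTA(p) ==
 * SCALE(20, 40, 10) - 20 * 10 / 40 + INTERACTIVE_DELTA == 5 - 5 + 2 == 2.
 * Such a task is rated interactive once its dynamic priority bonus
 * reaches 2, which matches the TASK_INTERACTIVE(0) row in the table above.
 */
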
#define TASK_PREEMPTS_CURR(p, rq) \
	((p)->prio < (rq)->curr->prio)

#define SCALE_PRIO(x, prio) \
	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
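
/*
 * Worked example (editor's note, illustrative), assuming HZ=1000 and the
 * usual MAX_PRIO of 140:
 *   nice -20 (static_prio 100): SCALE_PRIO(400, 100) == 400 * 40 / 20 == 800 ms
 *   nice   0 (static_prio 120): SCALE_PRIO(100, 120) == 100 * 20 / 20 == 100 ms
 *   nice  19 (static_prio 139): SCALE_PRIO(100, 139) == 100 *  1 / 20 ==   5 ms
 * which matches the 5ms/100ms/800ms range quoted in the comment above.
 */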
Peter Williams2dd73a42006-06-27 02:54:34 -0700194
Eric Dumazet5517d862007-05-08 00:32:57 -0700195#ifdef CONFIG_SMP
196/*
197 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
198 * Since cpu_power is a 'constant', we can use a reciprocal divide.
199 */
200static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
201{
202 return reciprocal_divide(load, sg->reciprocal_cpu_power);
203}
204
205/*
206 * Each time a sched group cpu_power is changed,
207 * we must compute its reciprocal value
208 */
209static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
210{
211 sg->__cpu_power += val;
212 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
213}
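
/*
 * Editor's note (illustrative): reciprocal_value() precomputes an inverse
 * such that reciprocal_divide(load, inv) is roughly load * inv >> 32, so
 * the frequent load / cpu_power divisions on the balancing path become a
 * multiply and a shift instead of a hardware divide.
 */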
#endif

/*
 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 * to time slice values: [800ms ... 100ms ... 5ms]
 *
 * The higher a thread's priority, the bigger timeslices
 * it gets during one round of execution. But even the lowest
 * priority thread gets MIN_TIMESLICE worth of execution time.
 */

static inline unsigned int task_timeslice(struct task_struct *p)
{
	return static_prio_timeslice(p->static_prio);
}

static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct load_stat {
	struct load_weight load;
	u64 load_update_start, load_update_last;
	unsigned long delta_fair, delta_exec, delta_stat;
};

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	s64 fair_clock;
	u64 exec_clock;
	s64 wait_runtime;
	u64 sleeper_bonus;
	unsigned long wait_runtime_overruns, wait_runtime_underruns;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;
	struct rb_node *rb_load_balance_curr;
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr;
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	int rt_load_balance_idx;
	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
};

/*
 * The prio-array type of the old scheduler:
 */
struct prio_array {
	unsigned int nr_active;
	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_PRIO];
};

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	spinlock_t lock;	/* runqueue lock */

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	unsigned long raw_weighted_load;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
	unsigned char in_nohz_recently;
#endif
	struct load_stat ls;	/* capture load from *all* tasks on this cpu */
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
#ifdef CONFIG_FAIR_GROUP_SCHED
	struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
#endif
	struct rt_rq rt;

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	unsigned long expired_timestamp;
	unsigned long long most_recent_timestamp;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	struct prio_array *active, *expired, arrays[2];
	int best_expired_prio;

	u64 clock, prev_clock_raw;
	s64 clock_max_delta;

	unsigned int clock_warps, clock_overflows;
	unsigned int clock_unstable_events;

	struct sched_class *load_balance_class;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct sched_domain *sd;

	/* For active balancing */
	int active_balance;
	int push_cpu;
	int cpu;		/* cpu of this runqueue */

	struct task_struct *migration_thread;
	struct list_head migration_queue;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;

	/* sys_sched_yield() stats */
	unsigned long yld_exp_empty;
	unsigned long yld_act_empty;
	unsigned long yld_both_empty;
	unsigned long yld_cnt;

	/* schedule() stats */
	unsigned long sched_switch;
	unsigned long sched_cnt;
	unsigned long sched_goidle;

	/* try_to_wake_up() stats */
	unsigned long ttwu_cnt;
	unsigned long ttwu_local;
#endif
	struct lock_class_key rq_lock_key;
};

static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);

static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
	rq->curr->sched_class->check_preempt_curr(rq, p);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

/*
 * Per-runqueue clock, as finegrained as the platform can give us:
 */
static unsigned long long __rq_clock(struct rq *rq)
{
	u64 prev_raw = rq->prev_clock_raw;
	u64 now = sched_clock();
	s64 delta = now - prev_raw;
	u64 clock = rq->clock;

	/*
	 * Protect against sched_clock() occasionally going backwards:
	 */
	if (unlikely(delta < 0)) {
		clock++;
		rq->clock_warps++;
	} else {
		/*
		 * Catch too large forward jumps too:
		 */
		if (unlikely(delta > 2*TICK_NSEC)) {
			clock++;
			rq->clock_overflows++;
		} else {
			if (unlikely(delta > rq->clock_max_delta))
				rq->clock_max_delta = delta;
			clock += delta;
		}
	}

	rq->prev_clock_raw = now;
	rq->clock = clock;

	return clock;
}
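
/*
 * Editor's note (illustrative): at HZ=1000 TICK_NSEC is roughly 1,000,000,
 * so a raw sched_clock() jump of more than ~2 ms is treated as an overflow
 * and only advances rq->clock by 1 ns, while a backwards step is likewise
 * clamped to +1 ns and counted in clock_warps. rq->clock therefore stays
 * monotonic even when the underlying clock misbehaves.
 */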

static inline unsigned long long rq_clock(struct rq *rq)
{
	int this_cpu = smp_processor_id();

	if (this_cpu == cpu_of(rq))
		return __rq_clock(rq);

	return rq->clock;
}

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
{
	p->se.cfs_rq = &task_rq(p)->cfs;
}
#else
static inline void set_task_cfs_rq(struct task_struct *p)
{
}
#endif

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->oncpu;
#else
	return rq->curr == p;
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	spin_unlock_irq(&rq->lock);
#else
	spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called with interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

repeat_lock_task:
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock(&rq->lock);
		goto repeat_lock_task;
	}
	return rq;
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct rq *rq;

repeat_lock_task:
	local_irq_save(*flags);
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock_irqrestore(&rq->lock, *flags);
		goto repeat_lock_task;
	}
	return rq;
}

static inline void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	spin_unlock_irqrestore(&rq->lock, *flags);
}
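
/*
 * Typical usage (editor's sketch, illustrative only):
 *
 *	struct rq *rq;
 *	unsigned long flags;
 *
 *	rq = task_rq_lock(p, &flags);
 *	... inspect or modify p's scheduling state under rq->lock ...
 *	task_rq_unlock(rq, &flags);
 *
 * The retry loop above is what makes this safe against p migrating to
 * another CPU between the task_rq() lookup and taking the lock.
 */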

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static inline struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	spin_lock(&rq->lock);

	return rq;
}

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP

#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif

static void resched_task(struct task_struct *p)
{
	int cpu;

	assert_spin_locked(&task_rq(p)->lock);

	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
		return;

	set_tsk_thread_flag(p, TIF_NEED_RESCHED);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

static void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	spin_unlock_irqrestore(&rq->lock, flags);
}
#else
static inline void resched_task(struct task_struct *p)
{
	assert_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}
#endif

static u64 div64_likely32(u64 divident, unsigned long divisor)
{
#if BITS_PER_LONG == 32
	if (likely(divident <= 0xffffffffULL))
		return (u32)divident / divisor;
	do_div(divident, divisor);

	return divident;
#else
	return divident / divisor;
#endif
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

static inline unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (unlikely(!lw->inv_weight))
		lw->inv_weight = WMULT_CONST / lw->weight;

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST)) {
		tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
			>> (WMULT_SHIFT/2);
	} else {
		tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
	}

	return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
}
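
/*
 * Worked example (editor's note, illustrative): calc_delta_mine() computes
 * roughly delta_exec * weight / lw->weight using the cached fixed-point
 * inverse (2^32 / lw->weight). E.g. with delta_exec = 1,000,000 ns,
 * weight = NICE_0_LOAD (1024) and lw->weight = 2048, the result is about
 * 500,000 ns, i.e. the delta is scaled down in proportion to the load.
 */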

static inline unsigned long
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
{
	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}

static void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static void __update_curr_load(struct rq *rq, struct load_stat *ls)
{
	if (rq->curr != rq->idle && ls->load.weight) {
		ls->delta_exec += ls->delta_stat;
		ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
		ls->delta_stat = 0;
	}
}

/*
 * Update delta_exec, delta_fair fields for rq.
 *
 * delta_fair clock advances at a rate inversely proportional to
 * total load (rq->ls.load.weight) on the runqueue, while
 * delta_exec advances at the same rate as wall-clock (provided
 * cpu is not idle).
 *
 * delta_exec / delta_fair is a measure of the (smoothened) load on this
 * runqueue over any given interval. This (smoothened) load is used
 * during load balance.
 *
 * This function is called /before/ updating rq->ls.load
 * and when switching tasks.
 */
static void update_curr_load(struct rq *rq, u64 now)
{
	struct load_stat *ls = &rq->ls;
	u64 start;

	start = ls->load_update_start;
	ls->load_update_start = now;
	ls->delta_stat += now - start;
	/*
	 * Stagger updates to ls->delta_fair. Very frequent updates
	 * can be expensive.
	 */
	if (ls->delta_stat >= sysctl_sched_stat_granularity)
		__update_curr_load(rq, ls);
}

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

/*
 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 * If static_prio_timeslice() is ever changed to break this assumption then
 * this code will need modification
 */
#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
#define load_weight(lp) \
	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(prio) \
	load_weight(static_prio_timeslice(prio))
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
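
/*
 * Worked example (editor's note, illustrative): a nice 0 task has
 * static_prio_timeslice() == DEF_TIMESLICE, so
 * PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0)) == SCHED_LOAD_SCALE. A nice -20
 * task, with an 8x larger timeslice, contributes 8 * SCHED_LOAD_SCALE
 * to the weighted runqueue load.
 */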

#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1 << 31)

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage.
 */
static const int prio_to_weight[40] = {
/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
/* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
/*   0 */ NICE_0_LOAD /* 1024 */,
/*   1 */   819,   655,   524,   419,   336,   268,   215,   172,   137,
/*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
};

static const u32 prio_to_wmult[40] = {
	48356,     60446,     75558,     94446,    118058,    147573,
	184467,    230589,    288233,    360285,    450347,
	562979,    703746,    879575,    1099582,   1374389,
	1717986,   2147483,   2684354,   3355443,   4194304,
	5244160,   6557201,   8196502,   10250518,  12782640,
	16025997,  19976592,  24970740,  31350126,  39045157,
	49367440,  61356675,  76695844,  95443717,  119304647,
	148102320, 186737708, 238609294, 286331153,
};
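
/*
 * Editor's note (illustrative): each prio_to_wmult[] entry is roughly
 * 2^32 / prio_to_weight[] (e.g. 4194304 == 2^32 / 1024), i.e. the
 * precomputed inverse used by calc_delta_mine(). As a CPU-share example,
 * two CPU-bound tasks at nice 0 (weight 1024) and nice 5 (weight 336)
 * split the CPU roughly 1024/(1024+336) ~= 75% vs 25%.
 */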

static inline void
inc_load(struct rq *rq, const struct task_struct *p, u64 now)
{
	update_curr_load(rq, now);
	update_load_add(&rq->ls.load, p->se.load.weight);
}

static inline void
dec_load(struct rq *rq, const struct task_struct *p, u64 now)
{
	update_curr_load(rq, now);
	update_load_sub(&rq->ls.load, p->se.load.weight);
}

static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
	rq->nr_running++;
	inc_load(rq, p, now);
}

static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
	rq->nr_running--;
	dec_load(rq, p, now);
}

static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);

/*
 * runqueue iterator, to support SMP load-balancing between different
 * scheduling classes, without having to expose their internal data
 * structures to the load-balancing proper:
 */
struct rq_iterator {
	void *arg;
	struct task_struct *(*start)(void *);
	struct task_struct *(*next)(void *);
};

static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_nr_move, unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned, unsigned long *load_moved,
		      int this_best_prio, int best_prio, int best_prio_seen,
		      struct rq_iterator *iterator);

#include "sched_stats.h"
#include "sched_rt.c"
#include "sched_fair.c"
#include "sched_idletask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif

#define sched_class_highest (&rt_sched_class)

static void set_load_weight(struct task_struct *p)
{
	task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
	p->se.wait_runtime = 0;

	if (task_has_rt_policy(p)) {
		p->se.load.weight = prio_to_weight[0] * 2;
		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
		return;
	}

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		p->se.load.weight = WEIGHT_IDLEPRIO;
		p->se.load.inv_weight = WMULT_IDLEPRIO;
		return;
	}

	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
}
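
/*
 * Worked example (editor's note, illustrative): with the usual MAX_RT_PRIO
 * of 100, a nice 0 SCHED_NORMAL task (static_prio 120) ends up with
 * load.weight == prio_to_weight[20] == NICE_0_LOAD, while an RT task gets
 * twice the nice -20 weight (2 * 88818), so it dominates any fair-class
 * load it is compared against.
 */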

static void
enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
{
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, wakeup, now);
	p->se.on_rq = 1;
}

static void
dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
{
	p->sched_class->dequeue_task(rq, p, sleep, now);
	p->se.on_rq = 0;
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/*
 * activate_task - move a task to the runqueue.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, wakeup, now);
	inc_nr_running(p, rq, now);
}

/*
 * activate_idle_task - move idle task to the _front_ of runqueue.
 */
static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, 0, now);
	inc_nr_running(p, rq, now);
}

/*
 * deactivate_task - remove a task from the runqueue.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, sleep, now);
	dec_nr_running(p, rq, now);
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->ls.load.weight;
}

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_SMP
	task_thread_info(p)->cpu = cpu;
	set_task_cfs_rq(p);
#endif
}

#ifdef CONFIG_SMP

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
	int old_cpu = task_cpu(p);
	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
	u64 clock_offset, fair_clock_offset;

	clock_offset = old_rq->clock - new_rq->clock;
	fair_clock_offset = old_rq->cfs.fair_clock -
			 new_rq->cfs.fair_clock;
	if (p->se.wait_start)
		p->se.wait_start -= clock_offset;
	if (p->se.wait_start_fair)
		p->se.wait_start_fair -= fair_clock_offset;
	if (p->se.sleep_start)
		p->se.sleep_start -= clock_offset;
	if (p->se.block_start)
		p->se.block_start -= clock_offset;
	if (p->se.sleep_start_fair)
		p->se.sleep_start_fair -= fair_clock_offset;

	__set_task_cpu(p, new_cpu);
}

struct migration_req {
	struct list_head list;

	struct task_struct *task;
	int dest_cpu;

	struct completion done;
};

/*
 * The task's runqueue lock must be held.
 * Returns true if you have to wait for migration thread.
 */
static int
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
	struct rq *rq = task_rq(p);

	/*
	 * If the task is not on a runqueue (and not running), then
	 * it is sufficient to simply update the task's cpu field.
	 */
	if (!p->se.on_rq && !task_running(rq, p)) {
		set_task_cpu(p, dest_cpu);
		return 0;
	}

	init_completion(&req->done);
	req->task = p;
	req->dest_cpu = dest_cpu;
	list_add(&req->list, &rq->migration_queue);

	return 1;
}

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
void wait_task_inactive(struct task_struct *p)
{
	unsigned long flags;
	int running, on_rq;
	struct rq *rq;

repeat:
	/*
	 * We do the initial early heuristics without holding
	 * any task-queue locks at all. We'll only try to get
	 * the runqueue lock when things look like they will
	 * work out!
	 */
	rq = task_rq(p);

	/*
	 * If the task is actively running on another CPU
	 * still, just relax and busy-wait without holding
	 * any locks.
	 *
	 * NOTE! Since we don't hold any locks, it's not
	 * even sure that "rq" stays as the right runqueue!
	 * But we don't care, since "task_running()" will
	 * return false if the runqueue has changed and p
	 * is actually now running somewhere else!
	 */
	while (task_running(rq, p))
		cpu_relax();

	/*
	 * Ok, time to look more closely! We need the rq
	 * lock now, to be *sure*. If we're wrong, we'll
	 * just go back and repeat.
	 */
	rq = task_rq_lock(p, &flags);
	running = task_running(rq, p);
	on_rq = p->se.on_rq;
	task_rq_unlock(rq, &flags);

	/*
	 * Was it really running after all now that we
	 * checked with the proper locks actually held?
	 *
	 * Oops. Go back and try again..
	 */
	if (unlikely(running)) {
		cpu_relax();
		goto repeat;
	}

	/*
	 * It's not enough that it's not actively running,
	 * it must be off the runqueue _entirely_, and not
	 * preempted!
	 *
	 * So if it was still runnable (but just not actively
	 * running right now), it's preempted, and we should
	 * yield - it could be a while.
	 */
	if (unlikely(on_rq)) {
		yield();
		goto repeat;
	}

	/*
	 * Ahh, all good. It wasn't running, and it wasn't
	 * runnable, which means that it will never become
	 * running in the future either. We're all done!
	 */
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}

/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static inline unsigned long source_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0)
		return total;

	return min(rq->cpu_load[type-1], total);
}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */
static inline unsigned long target_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0)
		return total;

	return max(rq->cpu_load[type-1], total);
}

/*
 * Return the average load per task on the cpu's run queue
 */
static inline unsigned long cpu_avg_load_per_task(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);
	unsigned long n = rq->nr_running;

	return n ? total / n : SCHED_LOAD_SCALE;
}

/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
	unsigned long min_load = ULONG_MAX, this_load = 0;
	int load_idx = sd->forkexec_idx;
	int imbalance = 100 + (sd->imbalance_pct-100)/2;

	do {
		unsigned long load, avg_load;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
			goto nextgroup;

		local_group = cpu_isset(this_cpu, group->cpumask);

		/* Tally up the load of all CPUs in the group */
		avg_load = 0;

		for_each_cpu_mask(i, group->cpumask) {
			/* Bias balancing toward cpus of our domain */
			if (local_group)
				load = source_load(i, load_idx);
			else
				load = target_load(i, load_idx);

			avg_load += load;
		}

		/* Adjust by relative CPU power of the group */
		avg_load = sg_div_cpu_power(group,
				avg_load * SCHED_LOAD_SCALE);

		if (local_group) {
			this_load = avg_load;
			this = group;
		} else if (avg_load < min_load) {
			min_load = avg_load;
			idlest = group;
		}
nextgroup:
		group = group->next;
	} while (group != sd->groups);

	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;
	return idlest;
}
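
/*
 * Worked example (editor's note, illustrative): with imbalance_pct == 125
 * the local "imbalance" threshold is 100 + 25/2 == 112, so a remote group
 * is only returned as idlest if 100 * this_load >= 112 * min_load, i.e.
 * if it is at least ~11% less loaded than the local group.
 */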

/*
 * find_idlest_cpu - find the idlest cpu among the cpus in group.
 */
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
	cpumask_t tmp;
	unsigned long load, min_load = ULONG_MAX;
	int idlest = -1;
	int i;

	/* Traverse only the allowed CPUs */
	cpus_and(tmp, group->cpumask, p->cpus_allowed);

	for_each_cpu_mask(i, tmp) {
		load = weighted_cpuload(i);

		if (load < min_load || (load == min_load && i == this_cpu)) {
			min_load = load;
			idlest = i;
		}
	}

	return idlest;
}

/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, i.e. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
static int sched_balance_self(int cpu, int flag)
{
	struct task_struct *t = current;
	struct sched_domain *tmp, *sd = NULL;

	for_each_domain(cpu, tmp) {
		/*
		 * If power savings logic is enabled for a domain, stop there.
		 */
		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
			break;
		if (tmp->flags & flag)
			sd = tmp;
	}

	while (sd) {
		cpumask_t span;
		struct sched_group *group;
		int new_cpu, weight;

		if (!(sd->flags & flag)) {
			sd = sd->child;
			continue;
		}

		span = sd->span;
		group = find_idlest_group(sd, t, cpu);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, t, cpu);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		sd = NULL;
		weight = cpus_weight(span);
		for_each_domain(cpu, tmp) {
			if (weight <= cpus_weight(tmp->span))
				break;
			if (tmp->flags & flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}

	return cpu;
}

#endif /* CONFIG_SMP */

/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available. The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	cpumask_t tmp;
	struct sched_domain *sd;
	int i;

	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must also be busy (in most cases) as they didn't already
	 * pick up the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This will avoid the checks and cache miss
	 * penalties associated with that.
	 */
	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
		return cpu;

	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_IDLE) {
			cpus_and(tmp, sd->span, p->cpus_allowed);
			for_each_cpu_mask(i, tmp) {
				if (idle_cpu(i))
					return i;
			}
		}
		else
			break;
	}
	return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif

/***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * returns failure only if the task is already active.
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
	int cpu, this_cpu, success = 0;
	unsigned long flags;
	long old_state;
	struct rq *rq;
#ifdef CONFIG_SMP
	struct sched_domain *sd, *this_sd = NULL;
	unsigned long load, this_load;
	int new_cpu;
#endif

	rq = task_rq_lock(p, &flags);
	old_state = p->state;
	if (!(old_state & state))
		goto out;

	if (p->se.on_rq)
		goto out_running;

	cpu = task_cpu(p);
	this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
	if (unlikely(task_running(rq, p)))
		goto out_activate;

	new_cpu = cpu;

	schedstat_inc(rq, ttwu_cnt);
	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		goto out_set_cpu;
	}

	for_each_domain(this_cpu, sd) {
		if (cpu_isset(cpu, sd->span)) {
			schedstat_inc(sd, ttwu_wake_remote);
			this_sd = sd;
			break;
		}
	}

	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		goto out_set_cpu;

	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
	if (this_sd) {
		int idx = this_sd->wake_idx;
		unsigned int imbalance;

		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;

		load = source_load(cpu, idx);
		this_load = target_load(this_cpu, idx);

		new_cpu = this_cpu; /* Wake to this CPU if we can */

		if (this_sd->flags & SD_WAKE_AFFINE) {
			unsigned long tl = this_load;
			unsigned long tl_per_task;

			tl_per_task = cpu_avg_load_per_task(this_cpu);

			/*
			 * If sync wakeup then subtract the (maximum possible)
			 * effect of the currently running task from the load
			 * of the current CPU:
			 */
			if (sync)
				tl -= current->se.load.weight;

			if ((tl <= load &&
				tl + target_load(cpu, idx) <= tl_per_task) ||
			       100*(tl + p->se.load.weight) <= imbalance*load) {
				/*
				 * This domain has SD_WAKE_AFFINE and
				 * p is cache cold in this domain, and
				 * there is no bad imbalance.
				 */
				schedstat_inc(this_sd, ttwu_move_affine);
				goto out_set_cpu;
			}
		}

		/*
		 * Start passive balancing when half the imbalance_pct
		 * limit is reached.
		 */
		if (this_sd->flags & SD_WAKE_BALANCE) {
			if (imbalance*this_load <= 100*load) {
				schedstat_inc(this_sd, ttwu_move_balance);
				goto out_set_cpu;
			}
		}
	}

	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
1567 new_cpu = wake_idle(new_cpu, p);
1568 if (new_cpu != cpu) {
1569 set_task_cpu(p, new_cpu);
1570 task_rq_unlock(rq, &flags);
1571 /* might preempt at this point */
1572 rq = task_rq_lock(p, &flags);
1573 old_state = p->state;
1574 if (!(old_state & state))
1575 goto out;
Ingo Molnardd41f592007-07-09 18:51:59 +02001576 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001577 goto out_running;
1578
1579 this_cpu = smp_processor_id();
1580 cpu = task_cpu(p);
1581 }
1582
1583out_activate:
1584#endif /* CONFIG_SMP */
Ingo Molnardd41f592007-07-09 18:51:59 +02001585 activate_task(rq, p, 1);
Ingo Molnard79fc0f2005-09-10 00:26:12 -07001586 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587 * Sync wakeups (i.e. those types of wakeups where the waker
1588 * has indicated that it will leave the CPU in short order)
1589 * don't trigger a preemption if the woken-up task will run on
1590 * this cpu. (In this case the 'I will reschedule' promise of
1591 * the waker guarantees that the freshly woken-up task is going
1592 * to be considered on this CPU.)
1593 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001594 if (!sync || cpu != this_cpu)
1595 check_preempt_curr(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001596 success = 1;
1597
1598out_running:
1599 p->state = TASK_RUNNING;
1600out:
1601 task_rq_unlock(rq, &flags);
1602
1603 return success;
1604}
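/*
 * Editorial worked example (not in the original source), assuming an
 * imbalance_pct of 125 for this_sd: imbalance = 100 + (125 - 100) / 2 = 112.
 * The second clause of the affine-wakeup test above then reads
 * 100 * (tl + p->se.load.weight) <= 112 * load, i.e. the task is woken here
 * as long as this CPU's load, including the task, stays within 112% of the
 * previous CPU's load. Passive balancing starts once
 * 112 * this_load <= 100 * load, i.e. when this CPU carries less than
 * roughly 89% of the remote load.
 */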
1605
Ingo Molnar36c8b582006-07-03 00:25:41 -07001606int fastcall wake_up_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001607{
1608 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1609 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1610}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611EXPORT_SYMBOL(wake_up_process);
1612
Ingo Molnar36c8b582006-07-03 00:25:41 -07001613int fastcall wake_up_state(struct task_struct *p, unsigned int state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001614{
1615 return try_to_wake_up(p, state, 0);
1616}
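/*
 * Editorial usage sketch (not part of the original file; the identifiers
 * below are hypothetical): the canonical sleep/wake pattern that pairs with
 * wake_up_process(), which funnels into try_to_wake_up() above.
 */
#if 0	/* illustrative only */
static int my_event;				/* hypothetical condition */
static struct task_struct *my_waiter;		/* hypothetical sleeper */

static void my_wait_for_event(void)
{
	my_waiter = current;
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (my_event)
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

static void my_post_event(void)
{
	my_event = 1;
	wake_up_process(my_waiter);	/* marks the sleeper runnable */
}
#endif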
1617
Linus Torvalds1da177e2005-04-16 15:20:36 -07001618/*
1619 * Perform scheduler related setup for a newly forked process p.
1620 * p is forked by current.
Ingo Molnardd41f592007-07-09 18:51:59 +02001621 *
1622 * __sched_fork() is basic setup used by init_idle() too:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001623 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001624static void __sched_fork(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001625{
Ingo Molnardd41f592007-07-09 18:51:59 +02001626 p->se.wait_start_fair = 0;
1627 p->se.wait_start = 0;
1628 p->se.exec_start = 0;
1629 p->se.sum_exec_runtime = 0;
1630 p->se.delta_exec = 0;
1631 p->se.delta_fair_run = 0;
1632 p->se.delta_fair_sleep = 0;
1633 p->se.wait_runtime = 0;
1634 p->se.sum_wait_runtime = 0;
1635 p->se.sum_sleep_runtime = 0;
1636 p->se.sleep_start = 0;
1637 p->se.sleep_start_fair = 0;
1638 p->se.block_start = 0;
1639 p->se.sleep_max = 0;
1640 p->se.block_max = 0;
1641 p->se.exec_max = 0;
1642 p->se.wait_max = 0;
1643 p->se.wait_runtime_overruns = 0;
1644 p->se.wait_runtime_underruns = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001645
Ingo Molnardd41f592007-07-09 18:51:59 +02001646 INIT_LIST_HEAD(&p->run_list);
1647 p->se.on_rq = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001648
Linus Torvalds1da177e2005-04-16 15:20:36 -07001649 /*
1650 * We mark the process as running here, but have not actually
1651 * inserted it onto the runqueue yet. This guarantees that
1652 * nobody will actually run it, and a signal or other external
1653 * event cannot wake it up and insert it on the runqueue either.
1654 */
1655 p->state = TASK_RUNNING;
Ingo Molnardd41f592007-07-09 18:51:59 +02001656}
1657
1658/*
1659 * fork()/clone()-time setup:
1660 */
1661void sched_fork(struct task_struct *p, int clone_flags)
1662{
1663 int cpu = get_cpu();
1664
1665 __sched_fork(p);
1666
1667#ifdef CONFIG_SMP
1668 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1669#endif
1670 __set_task_cpu(p, cpu);
Ingo Molnarb29739f2006-06-27 02:54:51 -07001671
1672 /*
1673 * Make sure we do not leak PI boosting priority to the child:
1674 */
1675 p->prio = current->normal_prio;
1676
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001677#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
Ingo Molnardd41f592007-07-09 18:51:59 +02001678 if (likely(sched_info_on()))
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001679 memset(&p->sched_info, 0, sizeof(p->sched_info));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001680#endif
Chen, Kenneth Wd6077cb2006-02-14 13:53:10 -08001681#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
Nick Piggin4866cde2005-06-25 14:57:23 -07001682 p->oncpu = 0;
1683#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001684#ifdef CONFIG_PREEMPT
Nick Piggin4866cde2005-06-25 14:57:23 -07001685 /* Want to start with kernel preemption disabled. */
Al Viroa1261f52005-11-13 16:06:55 -08001686 task_thread_info(p)->preempt_count = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001687#endif
Nick Piggin476d1392005-06-25 14:57:29 -07001688 put_cpu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001689}
1690
1691/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001692 * After fork, the child runs first by default. If this is set to 0,
1693 * the parent will (try to) run first.
1694 */
1695unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1696
1697/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698 * wake_up_new_task - wake up a newly created task for the first time.
1699 *
1700 * This function does some initial scheduler statistics housekeeping
1701 * that must be done for every newly created context, then puts the task
1702 * on the runqueue and wakes it.
1703 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001704void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001705{
1706 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02001707 struct rq *rq;
1708 int this_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001709
1710 rq = task_rq_lock(p, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001711 BUG_ON(p->state != TASK_RUNNING);
Ingo Molnardd41f592007-07-09 18:51:59 +02001712 this_cpu = smp_processor_id(); /* parent's CPU */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001713
1714 p->prio = effective_prio(p);
1715
Ingo Molnardd41f592007-07-09 18:51:59 +02001716 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1717 task_cpu(p) != this_cpu || !current->se.on_rq) {
1718 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02001721 * Let the scheduling class do new task startup
1722 * management (if any):
Linus Torvalds1da177e2005-04-16 15:20:36 -07001723 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001724 p->sched_class->task_new(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001725 }
Ingo Molnardd41f592007-07-09 18:51:59 +02001726 check_preempt_curr(rq, p);
1727 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001728}
1729
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730/**
Nick Piggin4866cde2005-06-25 14:57:23 -07001731 * prepare_task_switch - prepare to switch tasks
1732 * @rq: the runqueue preparing to switch
1733 * @next: the task we are going to switch to.
1734 *
1735 * This is called with the rq lock held and interrupts off. It must
1736 * be paired with a subsequent finish_task_switch after the context
1737 * switch.
1738 *
1739 * prepare_task_switch sets up locking and calls architecture specific
1740 * hooks.
1741 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001742static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07001743{
1744 prepare_lock_switch(rq, next);
1745 prepare_arch_switch(next);
1746}
1747
1748/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001749 * finish_task_switch - clean up after a task-switch
Jeff Garzik344baba2005-09-07 01:15:17 -04001750 * @rq: runqueue associated with task-switch
Linus Torvalds1da177e2005-04-16 15:20:36 -07001751 * @prev: the thread we just switched away from.
1752 *
Nick Piggin4866cde2005-06-25 14:57:23 -07001753 * finish_task_switch must be called after the context switch, paired
1754 * with a prepare_task_switch call before the context switch.
1755 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1756 * and do any other architecture-specific cleanup actions.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001757 *
1758 * Note that we may have delayed dropping an mm in context_switch(). If
1759 * so, we finish that here outside of the runqueue lock. (Doing it
1760 * with the lock held can cause deadlocks; see schedule() for
1761 * details.)
1762 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001763static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001764 __releases(rq->lock)
1765{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001766 struct mm_struct *mm = rq->prev_mm;
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001767 long prev_state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001768
1769 rq->prev_mm = NULL;
1770
1771 /*
1772 * A task struct has one reference for the use as "current".
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001773 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001774 * schedule one last time. The schedule call will never return, and
1775 * the scheduled task must drop that reference.
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001776 * The test for TASK_DEAD must occur while the runqueue locks are
Linus Torvalds1da177e2005-04-16 15:20:36 -07001777 * still held, otherwise prev could be scheduled on another cpu, die
1778 * there before we look at prev->state, and then the reference would
1779 * be dropped twice.
1780 * Manfred Spraul <manfred@colorfullife.com>
1781 */
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001782 prev_state = prev->state;
Nick Piggin4866cde2005-06-25 14:57:23 -07001783 finish_arch_switch(prev);
1784 finish_lock_switch(rq, prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001785 if (mm)
1786 mmdrop(mm);
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001787 if (unlikely(prev_state == TASK_DEAD)) {
bibo maoc6fd91f2006-03-26 01:38:20 -08001788 /*
1789 * Remove function-return probe instances associated with this
1790 * task and put them back on the free list.
1791 */
1792 kprobe_flush_task(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001793 put_task_struct(prev);
bibo maoc6fd91f2006-03-26 01:38:20 -08001794 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001795}
1796
1797/**
1798 * schedule_tail - first thing a freshly forked thread must call.
1799 * @prev: the thread we just switched away from.
1800 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001801asmlinkage void schedule_tail(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001802 __releases(rq->lock)
1803{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001804 struct rq *rq = this_rq();
1805
Nick Piggin4866cde2005-06-25 14:57:23 -07001806 finish_task_switch(rq, prev);
1807#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1808 /* In this case, finish_task_switch does not reenable preemption */
1809 preempt_enable();
1810#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001811 if (current->set_child_tid)
1812 put_user(current->pid, current->set_child_tid);
1813}
1814
1815/*
1816 * context_switch - switch to the new MM and the new
1817 * thread's register state.
1818 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001819static inline void
Ingo Molnar70b97a72006-07-03 00:25:42 -07001820context_switch(struct rq *rq, struct task_struct *prev,
Ingo Molnar36c8b582006-07-03 00:25:41 -07001821 struct task_struct *next)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822{
Ingo Molnardd41f592007-07-09 18:51:59 +02001823 struct mm_struct *mm, *oldmm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824
Ingo Molnardd41f592007-07-09 18:51:59 +02001825 prepare_task_switch(rq, next);
1826 mm = next->mm;
1827 oldmm = prev->active_mm;
Zachary Amsden9226d122007-02-13 13:26:21 +01001828 /*
1829 * For paravirt, this is coupled with an exit in switch_to to
1830 * combine the page table reload and the switch backend into
1831 * one hypercall.
1832 */
1833 arch_enter_lazy_cpu_mode();
1834
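	/*
	 * Descriptive note (editorial): kernel threads have no mm of their
	 * own (next->mm == NULL), so they borrow the previous task's
	 * active_mm; the mm_count reference taken below is dropped again in
	 * finish_task_switch(), via rq->prev_mm and mmdrop(), once the
	 * borrower itself switches away.
	 */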
Ingo Molnardd41f592007-07-09 18:51:59 +02001835 if (unlikely(!mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836 next->active_mm = oldmm;
1837 atomic_inc(&oldmm->mm_count);
1838 enter_lazy_tlb(oldmm, next);
1839 } else
1840 switch_mm(oldmm, mm, next);
1841
Ingo Molnardd41f592007-07-09 18:51:59 +02001842 if (unlikely(!prev->mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843 prev->active_mm = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001844 rq->prev_mm = oldmm;
1845 }
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001846 /*
1847 * The runqueue lock will be released by the next
1848 * task (which is an invalid locking op, but in the case
1849 * of the scheduler it's an obvious special case), so we
1850 * do an early lockdep release here:
1851 */
1852#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07001853 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001854#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855
1856 /* Here we just switch the register state and the stack. */
1857 switch_to(prev, next, prev);
1858
Ingo Molnardd41f592007-07-09 18:51:59 +02001859 barrier();
1860 /*
1861 * this_rq must be evaluated again because prev may have moved
1862 * CPUs since it called schedule(), thus the 'rq' on its stack
1863 * frame will be invalid.
1864 */
1865 finish_task_switch(this_rq(), prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001866}
1867
1868/*
1869 * nr_running, nr_uninterruptible and nr_context_switches:
1870 *
1871 * externally visible scheduler statistics: current number of runnable
1872 * threads, current number of uninterruptible-sleeping threads, total
1873 * number of context switches performed since bootup.
1874 */
1875unsigned long nr_running(void)
1876{
1877 unsigned long i, sum = 0;
1878
1879 for_each_online_cpu(i)
1880 sum += cpu_rq(i)->nr_running;
1881
1882 return sum;
1883}
1884
1885unsigned long nr_uninterruptible(void)
1886{
1887 unsigned long i, sum = 0;
1888
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001889 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001890 sum += cpu_rq(i)->nr_uninterruptible;
1891
1892 /*
1893 * Since we read the counters locklessly, the sum might be slightly
1894 * inaccurate. Do not allow it to go below zero though:
1895 */
1896 if (unlikely((long)sum < 0))
1897 sum = 0;
1898
1899 return sum;
1900}
1901
1902unsigned long long nr_context_switches(void)
1903{
Steven Rostedtcc94abf2006-06-27 02:54:31 -07001904 int i;
1905 unsigned long long sum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001906
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001907 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001908 sum += cpu_rq(i)->nr_switches;
1909
1910 return sum;
1911}
1912
1913unsigned long nr_iowait(void)
1914{
1915 unsigned long i, sum = 0;
1916
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001917 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1919
1920 return sum;
1921}
1922
Jack Steinerdb1b1fe2006-03-31 02:31:21 -08001923unsigned long nr_active(void)
1924{
1925 unsigned long i, running = 0, uninterruptible = 0;
1926
1927 for_each_online_cpu(i) {
1928 running += cpu_rq(i)->nr_running;
1929 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1930 }
1931
1932 if (unlikely((long)uninterruptible < 0))
1933 uninterruptible = 0;
1934
1935 return running + uninterruptible;
1936}
1937
Linus Torvalds1da177e2005-04-16 15:20:36 -07001938/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001939 * Update rq->cpu_load[] statistics. This function is usually called every
1940 * scheduler tick (TICK_NSEC).
Ingo Molnar48f24c42006-07-03 00:25:40 -07001941 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001942static void update_cpu_load(struct rq *this_rq)
Ingo Molnar48f24c42006-07-03 00:25:40 -07001943{
Ingo Molnardd41f592007-07-09 18:51:59 +02001944 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1945 unsigned long total_load = this_rq->ls.load.weight;
1946 unsigned long this_load = total_load;
1947 struct load_stat *ls = &this_rq->ls;
1948 u64 now = __rq_clock(this_rq);
1949 int i, scale;
1950
1951 this_rq->nr_load_updates++;
1952 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1953 goto do_avg;
1954
1955 /* Update delta_fair/delta_exec fields first */
1956 update_curr_load(this_rq, now);
1957
1958 fair_delta64 = ls->delta_fair + 1;
1959 ls->delta_fair = 0;
1960
1961 exec_delta64 = ls->delta_exec + 1;
1962 ls->delta_exec = 0;
1963
1964 sample_interval64 = now - ls->load_update_last;
1965 ls->load_update_last = now;
1966
1967 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1968 sample_interval64 = TICK_NSEC;
1969
1970 if (exec_delta64 > sample_interval64)
1971 exec_delta64 = sample_interval64;
1972
1973 idle_delta64 = sample_interval64 - exec_delta64;
1974
1975 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1976 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1977
1978 this_load = (unsigned long)tmp64;
1979
1980do_avg:
1981
1982 /* Update our load: */
1983 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1984 unsigned long old_load, new_load;
1985
1986 /* scale is effectively 1 << i now, and >> i divides by scale */
1987
1988 old_load = this_rq->cpu_load[i];
1989 new_load = this_load;
1990
1991 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1992 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07001993}
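/*
 * Editorial sketch (not part of the original file): the loop above is an
 * exponential moving average with a per-index decay factor of
 * (2^i - 1) / 2^i. A standalone version of the update for one index:
 */
#if 0	/* illustrative only */
static unsigned long cpu_load_ema(unsigned long old_load,
				  unsigned long new_load, int i)
{
	unsigned long scale = 1UL << i;

	/* old sample weighted by (scale - 1), new sample weighted by 1 */
	return (old_load * (scale - 1) + new_load) >> i;
}
#endif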
1994
Ingo Molnardd41f592007-07-09 18:51:59 +02001995#ifdef CONFIG_SMP
1996
Ingo Molnar48f24c42006-07-03 00:25:40 -07001997/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998 * double_rq_lock - safely lock two runqueues
1999 *
2000 * Note this does not disable interrupts like task_rq_lock,
2001 * you need to do so manually before calling.
2002 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002003static void double_rq_lock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002004 __acquires(rq1->lock)
2005 __acquires(rq2->lock)
2006{
Kirill Korotaev054b9102006-12-10 02:20:11 -08002007 BUG_ON(!irqs_disabled());
Linus Torvalds1da177e2005-04-16 15:20:36 -07002008 if (rq1 == rq2) {
2009 spin_lock(&rq1->lock);
2010 __acquire(rq2->lock); /* Fake it out ;) */
2011 } else {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002012 if (rq1 < rq2) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002013 spin_lock(&rq1->lock);
2014 spin_lock(&rq2->lock);
2015 } else {
2016 spin_lock(&rq2->lock);
2017 spin_lock(&rq1->lock);
2018 }
2019 }
2020}
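/*
 * Editorial note: taking the two locks in a fixed global order (lower
 * runqueue address first) is what makes this deadlock-free. Without the
 * ordering rule, CPU A doing lock(rq1); lock(rq2) and CPU B doing
 * lock(rq2); lock(rq1) could each hold one lock and spin forever on the
 * other; ordering by address means both sides always try the lower
 * address first.
 */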
2021
2022/*
2023 * double_rq_unlock - safely unlock two runqueues
2024 *
2025 * Note this does not restore interrupts like task_rq_unlock,
2026 * you need to do so manually after calling.
2027 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002028static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002029 __releases(rq1->lock)
2030 __releases(rq2->lock)
2031{
2032 spin_unlock(&rq1->lock);
2033 if (rq1 != rq2)
2034 spin_unlock(&rq2->lock);
2035 else
2036 __release(rq2->lock);
2037}
2038
2039/*
2040 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2041 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002042static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002043 __releases(this_rq->lock)
2044 __acquires(busiest->lock)
2045 __acquires(this_rq->lock)
2046{
Kirill Korotaev054b9102006-12-10 02:20:11 -08002047 if (unlikely(!irqs_disabled())) {
2048 /* printk() doesn't work well under rq->lock */
2049 spin_unlock(&this_rq->lock);
2050 BUG_ON(1);
2051 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052 if (unlikely(!spin_trylock(&busiest->lock))) {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002053 if (busiest < this_rq) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002054 spin_unlock(&this_rq->lock);
2055 spin_lock(&busiest->lock);
2056 spin_lock(&this_rq->lock);
2057 } else
2058 spin_lock(&busiest->lock);
2059 }
2060}
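/*
 * Editorial note: the trylock-then-retry above preserves the same address
 * ordering as double_rq_lock(). If busiest has the lower address we must
 * not spin on it while holding this_rq->lock (that would invert the
 * order), so this_rq->lock is dropped and both locks are taken again in
 * address order.
 */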
2061
2062/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002063 * If dest_cpu is allowed for this process, migrate the task to it.
2064 * This is accomplished by placing a migration request on the task's
2065 * runqueue and waking its migration thread, then waiting for the
2066 * migration to complete.
2067 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07002068static void sched_migrate_task(struct task_struct *p, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002069{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002070 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002071 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002072 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002073
2074 rq = task_rq_lock(p, &flags);
2075 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2076 || unlikely(cpu_is_offline(dest_cpu)))
2077 goto out;
2078
2079 /* force the process onto the specified CPU */
2080 if (migrate_task(p, dest_cpu, &req)) {
2081 /* Need to wait for migration thread (might exit: take ref). */
2082 struct task_struct *mt = rq->migration_thread;
Ingo Molnar36c8b582006-07-03 00:25:41 -07002083
Linus Torvalds1da177e2005-04-16 15:20:36 -07002084 get_task_struct(mt);
2085 task_rq_unlock(rq, &flags);
2086 wake_up_process(mt);
2087 put_task_struct(mt);
2088 wait_for_completion(&req.done);
Ingo Molnar36c8b582006-07-03 00:25:41 -07002089
Linus Torvalds1da177e2005-04-16 15:20:36 -07002090 return;
2091 }
2092out:
2093 task_rq_unlock(rq, &flags);
2094}
2095
2096/*
Nick Piggin476d1392005-06-25 14:57:29 -07002097 * sched_exec - execve() is a valuable balancing opportunity, because at
2098 * this point the task has the smallest effective memory and cache footprint.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002099 */
2100void sched_exec(void)
2101{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002102 int new_cpu, this_cpu = get_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002103 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002104 put_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002105 if (new_cpu != this_cpu)
2106 sched_migrate_task(current, new_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002107}
2108
2109/*
2110 * pull_task - move a task from a remote runqueue to the local runqueue.
2111 * Both runqueues must be locked.
2112 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002113static void pull_task(struct rq *src_rq, struct task_struct *p,
2114 struct rq *this_rq, int this_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002115{
Ingo Molnardd41f592007-07-09 18:51:59 +02002116 deactivate_task(src_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002117 set_task_cpu(p, this_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02002118 activate_task(this_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002119 /*
2120 * Note that idle threads have a prio of MAX_PRIO so that this
2121 * test is always true for them.
2122 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002123 check_preempt_curr(this_rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002124}
2125
2126/*
2127 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2128 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08002129static
Ingo Molnar70b97a72006-07-03 00:25:42 -07002130int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002131 struct sched_domain *sd, enum cpu_idle_type idle,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07002132 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133{
2134 /*
2135 * We do not migrate tasks that are:
2136 * 1) running (obviously), or
2137 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2138 * 3) are cache-hot on their current CPU.
2139 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002140 if (!cpu_isset(this_cpu, p->cpus_allowed))
2141 return 0;
Nick Piggin81026792005-06-25 14:57:07 -07002142 *all_pinned = 0;
2143
2144 if (task_running(rq, p))
2145 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146
2147 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02002148 * Aggressive migration if too many balance attempts have failed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002149 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002150 if (sd->nr_balance_failed > sd->cache_nice_tries)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002151 return 1;
2152
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153 return 1;
2154}
2155
Ingo Molnardd41f592007-07-09 18:51:59 +02002156static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2157 unsigned long max_nr_move, unsigned long max_load_move,
2158 struct sched_domain *sd, enum cpu_idle_type idle,
2159 int *all_pinned, unsigned long *load_moved,
2160 int this_best_prio, int best_prio, int best_prio_seen,
2161 struct rq_iterator *iterator)
2162{
2163 int pulled = 0, pinned = 0, skip_for_load;
2164 struct task_struct *p;
2165 long rem_load_move = max_load_move;
2166
2167 if (max_nr_move == 0 || max_load_move == 0)
2168 goto out;
2169
2170 pinned = 1;
2171
2172 /*
2173 * Start the load-balancing iterator:
2174 */
2175 p = iterator->start(iterator->arg);
2176next:
2177 if (!p)
2178 goto out;
2179 /*
2180 * To help distribute high priority tasks across CPUs, we don't
2181 * skip a task if it will be the highest priority task (i.e. smallest
2182 * prio value) on its new queue, regardless of its load weight.
2183 */
2184 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2185 SCHED_LOAD_SCALE_FUZZ;
2186 if (skip_for_load && p->prio < this_best_prio)
2187 skip_for_load = !best_prio_seen && p->prio == best_prio;
2188 if (skip_for_load ||
2189 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2190
2191 best_prio_seen |= p->prio == best_prio;
2192 p = iterator->next(iterator->arg);
2193 goto next;
2194 }
2195
2196 pull_task(busiest, p, this_rq, this_cpu);
2197 pulled++;
2198 rem_load_move -= p->se.load.weight;
2199
2200 /*
2201 * We only want to steal up to the prescribed number of tasks
2202 * and the prescribed amount of weighted load.
2203 */
2204 if (pulled < max_nr_move && rem_load_move > 0) {
2205 if (p->prio < this_best_prio)
2206 this_best_prio = p->prio;
2207 p = iterator->next(iterator->arg);
2208 goto next;
2209 }
2210out:
2211 /*
2212 * Right now, this is the only place pull_task() is called,
2213 * so we can safely collect pull_task() stats here rather than
2214 * inside pull_task().
2215 */
2216 schedstat_add(sd, lb_gained[idle], pulled);
2217
2218 if (all_pinned)
2219 *all_pinned = pinned;
2220 *load_moved = max_load_move - rem_load_move;
2221 return pulled;
2222}
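/*
 * Editorial worked example (not in the original source), assuming the
 * usual nice-0 weight of SCHED_LOAD_SCALE (1024) and max_nr_move >= 2:
 * with max_load_move == 2048 and a busiest queue full of nice-0 tasks,
 * the loop above pulls two tasks; after the second pull rem_load_move
 * reaches 0 and the "pulled < max_nr_move && rem_load_move > 0" test
 * stops the iteration, so *load_moved is reported as 2048.
 */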
Ingo Molnar48f24c42006-07-03 00:25:40 -07002223
Linus Torvalds1da177e2005-04-16 15:20:36 -07002224/*
Peter Williams2dd73a42006-06-27 02:54:34 -07002225 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2226 * load from busiest to this_rq, as part of a balancing operation within
2227 * "domain". Returns the number of tasks moved.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228 *
2229 * Called with both runqueues locked.
2230 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002231static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002232 unsigned long max_nr_move, unsigned long max_load_move,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002233 struct sched_domain *sd, enum cpu_idle_type idle,
Peter Williams2dd73a42006-06-27 02:54:34 -07002234 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235{
Ingo Molnardd41f592007-07-09 18:51:59 +02002236 struct sched_class *class = sched_class_highest;
2237 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2238 long rem_load_move = max_load_move;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002239
Ingo Molnardd41f592007-07-09 18:51:59 +02002240 do {
2241 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2242 max_nr_move, (unsigned long)rem_load_move,
2243 sd, idle, all_pinned, &load_moved);
2244 total_nr_moved += nr_moved;
2245 max_nr_move -= nr_moved;
2246 rem_load_move -= load_moved;
2247 class = class->next;
2248 } while (class && max_nr_move && rem_load_move > 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002249
Ingo Molnardd41f592007-07-09 18:51:59 +02002250 return total_nr_moved;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002251}
2252
2253/*
2254 * find_busiest_group finds and returns the busiest CPU group within the
Ingo Molnar48f24c42006-07-03 00:25:40 -07002255 * domain. It calculates and returns the amount of weighted load which
2256 * should be moved to restore balance via the imbalance parameter.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002257 */
2258static struct sched_group *
2259find_busiest_group(struct sched_domain *sd, int this_cpu,
Ingo Molnardd41f592007-07-09 18:51:59 +02002260 unsigned long *imbalance, enum cpu_idle_type idle,
2261 int *sd_idle, cpumask_t *cpus, int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002262{
2263 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2264 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002265 unsigned long max_pull;
Peter Williams2dd73a42006-06-27 02:54:34 -07002266 unsigned long busiest_load_per_task, busiest_nr_running;
2267 unsigned long this_load_per_task, this_nr_running;
Nick Piggin78979862005-06-25 14:57:13 -07002268 int load_idx;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002269#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2270 int power_savings_balance = 1;
2271 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2272 unsigned long min_nr_running = ULONG_MAX;
2273 struct sched_group *group_min = NULL, *group_leader = NULL;
2274#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002275
2276 max_load = this_load = total_load = total_pwr = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002277 busiest_load_per_task = busiest_nr_running = 0;
2278 this_load_per_task = this_nr_running = 0;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002279 if (idle == CPU_NOT_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002280 load_idx = sd->busy_idx;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002281 else if (idle == CPU_NEWLY_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002282 load_idx = sd->newidle_idx;
2283 else
2284 load_idx = sd->idle_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002285
2286 do {
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002287 unsigned long load, group_capacity;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002288 int local_group;
2289 int i;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002290 unsigned int balance_cpu = -1, first_idle_cpu = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002291 unsigned long sum_nr_running, sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002292
2293 local_group = cpu_isset(this_cpu, group->cpumask);
2294
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002295 if (local_group)
2296 balance_cpu = first_cpu(group->cpumask);
2297
Linus Torvalds1da177e2005-04-16 15:20:36 -07002298 /* Tally up the load of all CPUs in the group */
Peter Williams2dd73a42006-06-27 02:54:34 -07002299 sum_weighted_load = sum_nr_running = avg_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002300
2301 for_each_cpu_mask(i, group->cpumask) {
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002302 struct rq *rq;
2303
2304 if (!cpu_isset(i, *cpus))
2305 continue;
2306
2307 rq = cpu_rq(i);
Peter Williams2dd73a42006-06-27 02:54:34 -07002308
Nick Piggin5969fe02005-09-10 00:26:19 -07002309 if (*sd_idle && !idle_cpu(i))
2310 *sd_idle = 0;
2311
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312 /* Bias balancing toward cpus of our domain */
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002313 if (local_group) {
2314 if (idle_cpu(i) && !first_idle_cpu) {
2315 first_idle_cpu = 1;
2316 balance_cpu = i;
2317 }
2318
Nick Piggina2000572006-02-10 01:51:02 -08002319 load = target_load(i, load_idx);
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002320 } else
Nick Piggina2000572006-02-10 01:51:02 -08002321 load = source_load(i, load_idx);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002322
2323 avg_load += load;
Peter Williams2dd73a42006-06-27 02:54:34 -07002324 sum_nr_running += rq->nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002325 sum_weighted_load += weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002326 }
2327
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002328 /*
2329 * The first idle cpu, or the first cpu (busiest) in this sched group,
2330 * is eligible for doing load balancing at this and higher
2331 * domains.
2332 */
2333 if (local_group && balance_cpu != this_cpu && balance) {
2334 *balance = 0;
2335 goto ret;
2336 }
2337
Linus Torvalds1da177e2005-04-16 15:20:36 -07002338 total_load += avg_load;
Eric Dumazet5517d862007-05-08 00:32:57 -07002339 total_pwr += group->__cpu_power;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340
2341 /* Adjust by relative CPU power of the group */
Eric Dumazet5517d862007-05-08 00:32:57 -07002342 avg_load = sg_div_cpu_power(group,
2343 avg_load * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002344
Eric Dumazet5517d862007-05-08 00:32:57 -07002345 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002346
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347 if (local_group) {
2348 this_load = avg_load;
2349 this = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002350 this_nr_running = sum_nr_running;
2351 this_load_per_task = sum_weighted_load;
2352 } else if (avg_load > max_load &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002353 sum_nr_running > group_capacity) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002354 max_load = avg_load;
2355 busiest = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002356 busiest_nr_running = sum_nr_running;
2357 busiest_load_per_task = sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002358 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002359
2360#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2361 /*
2362 * Busy processors will not participate in power savings
2363 * balance.
2364 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002365 if (idle == CPU_NOT_IDLE ||
2366 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2367 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002368
2369 /*
2370 * If the local group is idle or completely loaded,
2371 * there is no need to do power savings balance at this domain.
2372 */
2373 if (local_group && (this_nr_running >= group_capacity ||
2374 !this_nr_running))
2375 power_savings_balance = 0;
2376
Ingo Molnardd41f592007-07-09 18:51:59 +02002377 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002378 * If a group is already running at full capacity or idle,
2379 * don't include that group in power savings calculations
Ingo Molnardd41f592007-07-09 18:51:59 +02002380 */
2381 if (!power_savings_balance || sum_nr_running >= group_capacity
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002382 || !sum_nr_running)
Ingo Molnardd41f592007-07-09 18:51:59 +02002383 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002384
Ingo Molnardd41f592007-07-09 18:51:59 +02002385 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002386 * Calculate the group which has the least non-idle load.
Ingo Molnardd41f592007-07-09 18:51:59 +02002387 * This is the group from which we need to pick up the load
2388 * in order to save power.
2389 */
2390 if ((sum_nr_running < min_nr_running) ||
2391 (sum_nr_running == min_nr_running &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002392 first_cpu(group->cpumask) <
2393 first_cpu(group_min->cpumask))) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002394 group_min = group;
2395 min_nr_running = sum_nr_running;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002396 min_load_per_task = sum_weighted_load /
2397 sum_nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002398 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002399
Ingo Molnardd41f592007-07-09 18:51:59 +02002400 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002401 * Calculate the group which is nearly at its
Ingo Molnardd41f592007-07-09 18:51:59 +02002402 * capacity but still has some space to pick up load
2403 * from other groups and save more power.
2404 */
2405 if (sum_nr_running <= group_capacity - 1) {
2406 if (sum_nr_running > leader_nr_running ||
2407 (sum_nr_running == leader_nr_running &&
2408 first_cpu(group->cpumask) >
2409 first_cpu(group_leader->cpumask))) {
2410 group_leader = group;
2411 leader_nr_running = sum_nr_running;
2412 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07002413 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002414group_next:
2415#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002416 group = group->next;
2417 } while (group != sd->groups);
2418
Peter Williams2dd73a42006-06-27 02:54:34 -07002419 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002420 goto out_balanced;
2421
2422 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2423
2424 if (this_load >= avg_load ||
2425 100*max_load <= sd->imbalance_pct*this_load)
2426 goto out_balanced;
2427
Peter Williams2dd73a42006-06-27 02:54:34 -07002428 busiest_load_per_task /= busiest_nr_running;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002429 /*
2430 * We're trying to get all the cpus to the average_load, so we don't
2431 * want to push ourselves above the average load, nor do we wish to
2432 * reduce the max loaded cpu below the average load, as either of these
2433 * actions would just result in more rebalancing later, and ping-pong
2434 * tasks around. Thus we look for the minimum possible imbalance.
2435 * Negative imbalances (*we* are more loaded than anyone else) will
2436 * be counted as no imbalance for these purposes -- we can't fix that
2437 * by pulling tasks to us. Be careful of negative numbers as they'll
2438 * appear as very large values with unsigned longs.
2439 */
Peter Williams2dd73a42006-06-27 02:54:34 -07002440 if (max_load <= busiest_load_per_task)
2441 goto out_balanced;
2442
2443 /*
2444 * In the presence of smp nice balancing, certain scenarios can have
2445 * max load less than avg load (as we skip the groups at or below
2446 * their cpu_power while calculating max_load).
2447 */
2448 if (max_load < avg_load) {
2449 *imbalance = 0;
2450 goto small_imbalance;
2451 }
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002452
2453 /* Don't want to pull so many tasks that a group would go idle */
Peter Williams2dd73a42006-06-27 02:54:34 -07002454 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002455
Linus Torvalds1da177e2005-04-16 15:20:36 -07002456 /* How much load to actually move to equalise the imbalance */
Eric Dumazet5517d862007-05-08 00:32:57 -07002457 *imbalance = min(max_pull * busiest->__cpu_power,
2458 (avg_load - this_load) * this->__cpu_power)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002459 / SCHED_LOAD_SCALE;
2460
Peter Williams2dd73a42006-06-27 02:54:34 -07002461 /*
2462 * If *imbalance is less than the average load per runnable task,
2463 * there is no guarantee that any tasks will be moved, so we consider
2464 * bumping its value to force at least one task to be
2465 * moved.
2466 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002467 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07002468 unsigned long tmp, pwr_now, pwr_move;
Peter Williams2dd73a42006-06-27 02:54:34 -07002469 unsigned int imbn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002470
Peter Williams2dd73a42006-06-27 02:54:34 -07002471small_imbalance:
2472 pwr_move = pwr_now = 0;
2473 imbn = 2;
2474 if (this_nr_running) {
2475 this_load_per_task /= this_nr_running;
2476 if (busiest_load_per_task > this_load_per_task)
2477 imbn = 1;
2478 } else
2479 this_load_per_task = SCHED_LOAD_SCALE;
2480
Ingo Molnardd41f592007-07-09 18:51:59 +02002481 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2482 busiest_load_per_task * imbn) {
Peter Williams2dd73a42006-06-27 02:54:34 -07002483 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002484 return busiest;
2485 }
2486
2487 /*
2488 * OK, we don't have enough imbalance to justify moving tasks,
2489 * however we may be able to increase total CPU power used by
2490 * moving them.
2491 */
2492
Eric Dumazet5517d862007-05-08 00:32:57 -07002493 pwr_now += busiest->__cpu_power *
2494 min(busiest_load_per_task, max_load);
2495 pwr_now += this->__cpu_power *
2496 min(this_load_per_task, this_load);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002497 pwr_now /= SCHED_LOAD_SCALE;
2498
2499 /* Amount of load we'd subtract */
Eric Dumazet5517d862007-05-08 00:32:57 -07002500 tmp = sg_div_cpu_power(busiest,
2501 busiest_load_per_task * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002502 if (max_load > tmp)
Eric Dumazet5517d862007-05-08 00:32:57 -07002503 pwr_move += busiest->__cpu_power *
Peter Williams2dd73a42006-06-27 02:54:34 -07002504 min(busiest_load_per_task, max_load - tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002505
2506 /* Amount of load we'd add */
Eric Dumazet5517d862007-05-08 00:32:57 -07002507 if (max_load * busiest->__cpu_power <
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08002508 busiest_load_per_task * SCHED_LOAD_SCALE)
Eric Dumazet5517d862007-05-08 00:32:57 -07002509 tmp = sg_div_cpu_power(this,
2510 max_load * busiest->__cpu_power);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002511 else
Eric Dumazet5517d862007-05-08 00:32:57 -07002512 tmp = sg_div_cpu_power(this,
2513 busiest_load_per_task * SCHED_LOAD_SCALE);
2514 pwr_move += this->__cpu_power *
2515 min(this_load_per_task, this_load + tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516 pwr_move /= SCHED_LOAD_SCALE;
2517
2518 /* Move if we gain throughput */
2519 if (pwr_move <= pwr_now)
2520 goto out_balanced;
2521
Peter Williams2dd73a42006-06-27 02:54:34 -07002522 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002523 }
2524
Linus Torvalds1da177e2005-04-16 15:20:36 -07002525 return busiest;
2526
2527out_balanced:
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002528#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002529 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002530 goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002531
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002532 if (this == group_leader && group_leader != group_min) {
2533 *imbalance = min_load_per_task;
2534 return group_min;
2535 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002536#endif
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002537ret:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002538 *imbalance = 0;
2539 return NULL;
2540}
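/*
 * Editorial worked example (not in the original source), for two groups
 * with __cpu_power == SCHED_LOAD_SCALE (1024): if the busiest group's
 * normalized load is max_load == 2048, the local group's is
 * this_load == 1024, and busiest_load_per_task == 1024, then
 * avg_load == 1536, max_pull == min(2048 - 1536, 2048 - 1024) == 512 and
 * *imbalance == min(512 * 1024, (1536 - 1024) * 1024) / 1024 == 512,
 * i.e. we move half the gap so neither side overshoots the average.
 */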
2541
2542/*
2543 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2544 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002545static struct rq *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002546find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002547 unsigned long imbalance, cpumask_t *cpus)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002548{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002549 struct rq *busiest = NULL, *rq;
Peter Williams2dd73a42006-06-27 02:54:34 -07002550 unsigned long max_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002551 int i;
2552
2553 for_each_cpu_mask(i, group->cpumask) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002554 unsigned long wl;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002555
2556 if (!cpu_isset(i, *cpus))
2557 continue;
2558
Ingo Molnar48f24c42006-07-03 00:25:40 -07002559 rq = cpu_rq(i);
Ingo Molnardd41f592007-07-09 18:51:59 +02002560 wl = weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002561
Ingo Molnardd41f592007-07-09 18:51:59 +02002562 if (rq->nr_running == 1 && wl > imbalance)
Peter Williams2dd73a42006-06-27 02:54:34 -07002563 continue;
2564
Ingo Molnardd41f592007-07-09 18:51:59 +02002565 if (wl > max_load) {
2566 max_load = wl;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002567 busiest = rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002568 }
2569 }
2570
2571 return busiest;
2572}
2573
2574/*
Nick Piggin77391d72005-06-25 14:57:30 -07002575 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2576 * any value works so long as it is large enough.
2577 */
2578#define MAX_PINNED_INTERVAL 512
2579
Ingo Molnar48f24c42006-07-03 00:25:40 -07002580static inline unsigned long minus_1_or_zero(unsigned long n)
2581{
2582 return n > 0 ? n - 1 : 0;
2583}
2584
Nick Piggin77391d72005-06-25 14:57:30 -07002585/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002586 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2587 * tasks if there is an imbalance.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002588 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002589static int load_balance(int this_cpu, struct rq *this_rq,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002590 struct sched_domain *sd, enum cpu_idle_type idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002591 int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002592{
Ingo Molnar48f24c42006-07-03 00:25:40 -07002593 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002594 struct sched_group *group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002595 unsigned long imbalance;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002596 struct rq *busiest;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002597 cpumask_t cpus = CPU_MASK_ALL;
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002598 unsigned long flags;
Nick Piggin5969fe02005-09-10 00:26:19 -07002599
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002600 /*
2601 * When the power savings policy is enabled for the parent domain, an
2602 * idle sibling can pick up load irrespective of busy siblings. In this
Ingo Molnardd41f592007-07-09 18:51:59 +02002603 * case, let the state of the idle sibling percolate up as CPU_IDLE,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002604 * instead of portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002605 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002606 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002607 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002608 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002609
Linus Torvalds1da177e2005-04-16 15:20:36 -07002610 schedstat_inc(sd, lb_cnt[idle]);
2611
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002612redo:
2613 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002614 &cpus, balance);
2615
Chen, Kenneth W06066712006-12-10 02:20:35 -08002616 if (*balance == 0)
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002617 goto out_balanced;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002618
Linus Torvalds1da177e2005-04-16 15:20:36 -07002619 if (!group) {
2620 schedstat_inc(sd, lb_nobusyg[idle]);
2621 goto out_balanced;
2622 }
2623
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002624 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002625 if (!busiest) {
2626 schedstat_inc(sd, lb_nobusyq[idle]);
2627 goto out_balanced;
2628 }
2629
Nick Piggindb935db2005-06-25 14:57:11 -07002630 BUG_ON(busiest == this_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002631
2632 schedstat_add(sd, lb_imbalance[idle], imbalance);
2633
2634 nr_moved = 0;
2635 if (busiest->nr_running > 1) {
2636 /*
2637 * Attempt to move tasks. If find_busiest_group has found
2638 * an imbalance but busiest->nr_running <= 1, the group is
2639 * still unbalanced. nr_moved simply stays zero, so it is
2640 * correctly treated as an imbalance.
2641 */
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002642 local_irq_save(flags);
Nick Piggine17224b2005-09-10 00:26:18 -07002643 double_rq_lock(this_rq, busiest);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002644 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002645 minus_1_or_zero(busiest->nr_running),
2646 imbalance, sd, idle, &all_pinned);
Nick Piggine17224b2005-09-10 00:26:18 -07002647 double_rq_unlock(this_rq, busiest);
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002648 local_irq_restore(flags);
Nick Piggin81026792005-06-25 14:57:07 -07002649
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002650 /*
2651 * some other cpu did the load balance for us.
2652 */
2653 if (nr_moved && this_cpu != smp_processor_id())
2654 resched_cpu(this_cpu);
2655
Nick Piggin81026792005-06-25 14:57:07 -07002656 /* All tasks on this runqueue were pinned by CPU affinity */
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002657 if (unlikely(all_pinned)) {
2658 cpu_clear(cpu_of(busiest), cpus);
2659 if (!cpus_empty(cpus))
2660 goto redo;
Nick Piggin81026792005-06-25 14:57:07 -07002661 goto out_balanced;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002662 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002663 }
Nick Piggin81026792005-06-25 14:57:07 -07002664
Linus Torvalds1da177e2005-04-16 15:20:36 -07002665 if (!nr_moved) {
2666 schedstat_inc(sd, lb_failed[idle]);
2667 sd->nr_balance_failed++;
2668
2669 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002670
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002671 spin_lock_irqsave(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002672
2673 /* don't kick the migration_thread if the curr
2674 * task on the busiest cpu can't be moved to this_cpu
2675 */
2676 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002677 spin_unlock_irqrestore(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002678 all_pinned = 1;
2679 goto out_one_pinned;
2680 }
2681
Linus Torvalds1da177e2005-04-16 15:20:36 -07002682 if (!busiest->active_balance) {
2683 busiest->active_balance = 1;
2684 busiest->push_cpu = this_cpu;
Nick Piggin81026792005-06-25 14:57:07 -07002685 active_balance = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002686 }
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002687 spin_unlock_irqrestore(&busiest->lock, flags);
Nick Piggin81026792005-06-25 14:57:07 -07002688 if (active_balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002689 wake_up_process(busiest->migration_thread);
2690
2691 /*
2692 * We've kicked active balancing, reset the failure
2693 * counter.
2694 */
Nick Piggin39507452005-06-25 14:57:09 -07002695 sd->nr_balance_failed = sd->cache_nice_tries+1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002696 }
Nick Piggin81026792005-06-25 14:57:07 -07002697 } else
Linus Torvalds1da177e2005-04-16 15:20:36 -07002698 sd->nr_balance_failed = 0;
2699
Nick Piggin81026792005-06-25 14:57:07 -07002700 if (likely(!active_balance)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002701 /* We were unbalanced, so reset the balancing interval */
2702 sd->balance_interval = sd->min_interval;
Nick Piggin81026792005-06-25 14:57:07 -07002703 } else {
2704 /*
2705 * If we've begun active balancing, start to back off. This
2706 * case may not be covered by the all_pinned logic if there
2707 * is only 1 task on the busy runqueue (because we don't call
2708 * move_tasks).
2709 */
2710 if (sd->balance_interval < sd->max_interval)
2711 sd->balance_interval *= 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002712 }
2713
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002714 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002715 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002716 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002717 return nr_moved;
2718
2719out_balanced:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002720 schedstat_inc(sd, lb_balanced[idle]);
2721
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002722 sd->nr_balance_failed = 0;
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002723
2724out_one_pinned:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002725 /* tune up the balancing interval */
Nick Piggin77391d72005-06-25 14:57:30 -07002726 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2727 (sd->balance_interval < sd->max_interval))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002728 sd->balance_interval *= 2;
2729
Ingo Molnar48f24c42006-07-03 00:25:40 -07002730 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002731 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002732 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002733 return 0;
2734}
2735
2736/*
2737 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2738 * tasks if there is an imbalance.
2739 *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002740 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
Linus Torvalds1da177e2005-04-16 15:20:36 -07002741 * this_rq is locked.
2742 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07002743static int
Ingo Molnar70b97a72006-07-03 00:25:42 -07002744load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002745{
2746 struct sched_group *group;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002747 struct rq *busiest = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002748 unsigned long imbalance;
2749 int nr_moved = 0;
Nick Piggin5969fe02005-09-10 00:26:19 -07002750 int sd_idle = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002751 cpumask_t cpus = CPU_MASK_ALL;
Nick Piggin5969fe02005-09-10 00:26:19 -07002752
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002753 /*
2754 * When power savings policy is enabled for the parent domain, idle
2755 * sibling can pick up load irrespective of busy siblings. In this case,
2756 * let the state of idle sibling percolate up as IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002757 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002758 */
2759 if (sd->flags & SD_SHARE_CPUPOWER &&
2760 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002761 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002763 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002764redo:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002765 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002766 &sd_idle, &cpus, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002767 if (!group) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002768 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002769 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002770 }
2771
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002772 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002773 &cpus);
Nick Piggindb935db2005-06-25 14:57:11 -07002774 if (!busiest) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002775 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002776 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777 }
2778
Nick Piggindb935db2005-06-25 14:57:11 -07002779 BUG_ON(busiest == this_rq);
2780
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002781 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002782
2783 nr_moved = 0;
2784 if (busiest->nr_running > 1) {
2785 /* Attempt to move tasks */
2786 double_lock_balance(this_rq, busiest);
2787 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002788 minus_1_or_zero(busiest->nr_running),
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002789 imbalance, sd, CPU_NEWLY_IDLE, NULL);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002790 spin_unlock(&busiest->lock);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002791
2792 if (!nr_moved) {
2793 cpu_clear(cpu_of(busiest), cpus);
2794 if (!cpus_empty(cpus))
2795 goto redo;
2796 }
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002797 }
2798
Nick Piggin5969fe02005-09-10 00:26:19 -07002799 if (!nr_moved) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002800 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002801 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2802 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002803 return -1;
2804 } else
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002805 sd->nr_balance_failed = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806
Linus Torvalds1da177e2005-04-16 15:20:36 -07002807 return nr_moved;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002808
2809out_balanced:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002810 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
Ingo Molnar48f24c42006-07-03 00:25:40 -07002811 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002812 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002813 return -1;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002814 sd->nr_balance_failed = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002815
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002816 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817}
2818
2819/*
2820 * idle_balance is called by schedule() if this_cpu is about to become
2821 * idle. Attempts to pull tasks from other CPUs.
2822 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002823static void idle_balance(int this_cpu, struct rq *this_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002824{
2825 struct sched_domain *sd;
Ingo Molnardd41f592007-07-09 18:51:59 +02002826 int pulled_task = -1;
2827 unsigned long next_balance = jiffies + HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002828
2829 for_each_domain(this_cpu, sd) {
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002830 unsigned long interval;
2831
2832 if (!(sd->flags & SD_LOAD_BALANCE))
2833 continue;
2834
2835 if (sd->flags & SD_BALANCE_NEWIDLE)
Ingo Molnar48f24c42006-07-03 00:25:40 -07002836 /* If we've pulled tasks over stop searching: */
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002837 pulled_task = load_balance_newidle(this_cpu,
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002838 this_rq, sd);
2839
2840 interval = msecs_to_jiffies(sd->balance_interval);
2841 if (time_after(next_balance, sd->last_balance + interval))
2842 next_balance = sd->last_balance + interval;
2843 if (pulled_task)
2844 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002845 }
Ingo Molnardd41f592007-07-09 18:51:59 +02002846 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002847 /*
2848 * We are going idle. next_balance may be set based on
2849 * a busy processor. So reset next_balance.
2850 */
2851 this_rq->next_balance = next_balance;
Ingo Molnardd41f592007-07-09 18:51:59 +02002852 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853}
2854
2855/*
2856 * active_load_balance is run by migration threads. It pushes running tasks
2857 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2858 * running on each physical CPU where possible, and avoids physical /
2859 * logical imbalances.
2860 *
2861 * Called with busiest_rq locked.
2862 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002863static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002864{
Nick Piggin39507452005-06-25 14:57:09 -07002865 int target_cpu = busiest_rq->push_cpu;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002866 struct sched_domain *sd;
2867 struct rq *target_rq;
Nick Piggin39507452005-06-25 14:57:09 -07002868
Ingo Molnar48f24c42006-07-03 00:25:40 -07002869 /* Is there any task to move? */
Nick Piggin39507452005-06-25 14:57:09 -07002870 if (busiest_rq->nr_running <= 1)
Nick Piggin39507452005-06-25 14:57:09 -07002871 return;
2872
2873 target_rq = cpu_rq(target_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002874
2875 /*
Nick Piggin39507452005-06-25 14:57:09 -07002876 * This condition is "impossible", if it occurs
2877 * we need to fix it. Originally reported by
2878 * Bjorn Helgaas on a 128-cpu setup.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002879 */
Nick Piggin39507452005-06-25 14:57:09 -07002880 BUG_ON(busiest_rq == target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002881
Nick Piggin39507452005-06-25 14:57:09 -07002882 /* move a task from busiest_rq to target_rq */
2883 double_lock_balance(busiest_rq, target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002884
Nick Piggin39507452005-06-25 14:57:09 -07002885 /* Search for an sd spanning us and the target CPU. */
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002886 for_each_domain(target_cpu, sd) {
Nick Piggin39507452005-06-25 14:57:09 -07002887 if ((sd->flags & SD_LOAD_BALANCE) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07002888 cpu_isset(busiest_cpu, sd->span))
Nick Piggin39507452005-06-25 14:57:09 -07002889 break;
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002890 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002891
Ingo Molnar48f24c42006-07-03 00:25:40 -07002892 if (likely(sd)) {
2893 schedstat_inc(sd, alb_cnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002894
Ingo Molnar48f24c42006-07-03 00:25:40 -07002895 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002896 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002897 NULL))
2898 schedstat_inc(sd, alb_pushed);
2899 else
2900 schedstat_inc(sd, alb_failed);
2901 }
Nick Piggin39507452005-06-25 14:57:09 -07002902 spin_unlock(&target_rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002903}
2904
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002905#ifdef CONFIG_NO_HZ
2906static struct {
2907 atomic_t load_balancer;
2908 cpumask_t cpu_mask;
2909} nohz ____cacheline_aligned = {
2910 .load_balancer = ATOMIC_INIT(-1),
2911 .cpu_mask = CPU_MASK_NONE,
2912};
2913
Christoph Lameter7835b982006-12-10 02:20:22 -08002914/*
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002915 * This routine will try to nominate the ilb (idle load balancing)
2916 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2917 * load balancing on behalf of all those cpus. If all the cpus in the system
2918 * go into this tickless mode, then there will be no ilb owner (as there is
2919 * no need for one) and all the cpus will sleep till the next wakeup event
2920 * arrives...
Christoph Lameter7835b982006-12-10 02:20:22 -08002921 *
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002922 * For the ilb owner, the tick is not stopped, and this tick will be used
 2923 * for idle load balancing. The ilb owner will still be part of
 2924 * nohz.cpu_mask.
2925 *
2926 * While stopping the tick, this cpu will become the ilb owner if there
 2927 * is no other owner. It remains the owner until it becomes busy again,
 2928 * or until all cpus in the system stop their ticks, at which point
 2929 * there is no need for an ilb owner.
2930 *
2931 * When the ilb owner becomes busy, it nominates another owner, during the
2932 * next busy scheduler_tick()
2933 */
2934int select_nohz_load_balancer(int stop_tick)
2935{
2936 int cpu = smp_processor_id();
2937
2938 if (stop_tick) {
2939 cpu_set(cpu, nohz.cpu_mask);
2940 cpu_rq(cpu)->in_nohz_recently = 1;
2941
2942 /*
2943 * If we are going offline and still the leader, give up!
2944 */
2945 if (cpu_is_offline(cpu) &&
2946 atomic_read(&nohz.load_balancer) == cpu) {
2947 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2948 BUG();
2949 return 0;
2950 }
2951
2952 /* time for ilb owner also to sleep */
2953 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2954 if (atomic_read(&nohz.load_balancer) == cpu)
2955 atomic_set(&nohz.load_balancer, -1);
2956 return 0;
2957 }
2958
2959 if (atomic_read(&nohz.load_balancer) == -1) {
2960 /* make me the ilb owner */
2961 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2962 return 1;
2963 } else if (atomic_read(&nohz.load_balancer) == cpu)
2964 return 1;
2965 } else {
2966 if (!cpu_isset(cpu, nohz.cpu_mask))
2967 return 0;
2968
2969 cpu_clear(cpu, nohz.cpu_mask);
2970
2971 if (atomic_read(&nohz.load_balancer) == cpu)
2972 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2973 BUG();
2974 }
2975 return 0;
2976}
2977#endif
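/*
 * Illustrative sketch (not part of the original file): the ilb-owner
 * nomination above is a single-owner election built on compare-and-swap.
 * The same pattern in isolation, written with C11 atomics and a
 * hypothetical try_claim_ilb() helper:
 *
 *	#include <stdatomic.h>
 *
 *	static atomic_int ilb_owner = ATOMIC_VAR_INIT(-1);	// -1: no owner
 *
 *	static int try_claim_ilb(int cpu)
 *	{
 *		int expected = -1;
 *
 *		// succeeds for exactly one caller while ilb_owner == -1
 *		return atomic_compare_exchange_strong(&ilb_owner,
 *						      &expected, cpu);
 *	}
 *
 * This mirrors "atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1" in
 * select_nohz_load_balancer() above; giving up ownership is the reverse
 * cmpxchg back to -1.
 */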
2978
2979static DEFINE_SPINLOCK(balancing);
2980
2981/*
Christoph Lameter7835b982006-12-10 02:20:22 -08002982 * It checks each scheduling domain to see if it is due to be balanced,
2983 * and initiates a balancing operation if so.
2984 *
2985 * Balancing parameters are set up in arch_init_sched_domains.
2986 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002987static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
Christoph Lameter7835b982006-12-10 02:20:22 -08002988{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002989 int balance = 1;
2990 struct rq *rq = cpu_rq(cpu);
Christoph Lameter7835b982006-12-10 02:20:22 -08002991 unsigned long interval;
2992 struct sched_domain *sd;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002993 /* Earliest time when we have to do rebalance again */
Christoph Lameterc9819f42006-12-10 02:20:25 -08002994 unsigned long next_balance = jiffies + 60*HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002995
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002996 for_each_domain(cpu, sd) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002997 if (!(sd->flags & SD_LOAD_BALANCE))
2998 continue;
2999
3000 interval = sd->balance_interval;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02003001 if (idle != CPU_IDLE)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003002 interval *= sd->busy_factor;
3003
3004 /* scale ms to jiffies */
3005 interval = msecs_to_jiffies(interval);
3006 if (unlikely(!interval))
3007 interval = 1;
Ingo Molnardd41f592007-07-09 18:51:59 +02003008 if (interval > HZ*NR_CPUS/10)
3009 interval = HZ*NR_CPUS/10;
3010
Linus Torvalds1da177e2005-04-16 15:20:36 -07003011
Christoph Lameter08c183f2006-12-10 02:20:29 -08003012 if (sd->flags & SD_SERIALIZE) {
3013 if (!spin_trylock(&balancing))
3014 goto out;
3015 }
3016
Christoph Lameterc9819f42006-12-10 02:20:25 -08003017 if (time_after_eq(jiffies, sd->last_balance + interval)) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003018 if (load_balance(cpu, rq, sd, idle, &balance)) {
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07003019 /*
3020 * We've pulled tasks over so either we're no
Nick Piggin5969fe02005-09-10 00:26:19 -07003021 * longer idle, or one of our SMT siblings is
3022 * not idle.
3023 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02003024 idle = CPU_NOT_IDLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003025 }
Christoph Lameter1bd77f22006-12-10 02:20:27 -08003026 sd->last_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003027 }
Christoph Lameter08c183f2006-12-10 02:20:29 -08003028 if (sd->flags & SD_SERIALIZE)
3029 spin_unlock(&balancing);
3030out:
Christoph Lameterc9819f42006-12-10 02:20:25 -08003031 if (time_after(next_balance, sd->last_balance + interval))
3032 next_balance = sd->last_balance + interval;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08003033
3034 /*
3035 * Stop the load balance at this level. There is another
3036 * CPU in our sched group which is doing load balancing more
3037 * actively.
3038 */
3039 if (!balance)
3040 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003041 }
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003042 rq->next_balance = next_balance;
3043}
3044
3045/*
3046 * run_rebalance_domains is triggered when needed from the scheduler tick.
3047 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3048 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3049 */
3050static void run_rebalance_domains(struct softirq_action *h)
3051{
Ingo Molnardd41f592007-07-09 18:51:59 +02003052 int this_cpu = smp_processor_id();
3053 struct rq *this_rq = cpu_rq(this_cpu);
3054 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3055 CPU_IDLE : CPU_NOT_IDLE;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003056
Ingo Molnardd41f592007-07-09 18:51:59 +02003057 rebalance_domains(this_cpu, idle);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003058
3059#ifdef CONFIG_NO_HZ
3060 /*
3061 * If this cpu is the owner for idle load balancing, then do the
3062 * balancing on behalf of the other idle cpus whose ticks are
3063 * stopped.
3064 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003065 if (this_rq->idle_at_tick &&
3066 atomic_read(&nohz.load_balancer) == this_cpu) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003067 cpumask_t cpus = nohz.cpu_mask;
3068 struct rq *rq;
3069 int balance_cpu;
3070
Ingo Molnardd41f592007-07-09 18:51:59 +02003071 cpu_clear(this_cpu, cpus);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003072 for_each_cpu_mask(balance_cpu, cpus) {
3073 /*
3074 * If this cpu gets work to do, stop the load balancing
3075 * work being done for other cpus. Next load
3076 * balancing owner will pick it up.
3077 */
3078 if (need_resched())
3079 break;
3080
Ingo Molnardd41f592007-07-09 18:51:59 +02003081			rebalance_domains(balance_cpu, CPU_IDLE);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003082
3083 rq = cpu_rq(balance_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003084 if (time_after(this_rq->next_balance, rq->next_balance))
3085 this_rq->next_balance = rq->next_balance;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003086 }
3087 }
3088#endif
3089}
3090
3091/*
3092 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3093 *
3094 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3095 * idle load balancing owner or decide to stop the periodic load balancing,
3096 * if the whole system is idle.
3097 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003098static inline void trigger_load_balance(struct rq *rq, int cpu)
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003099{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003100#ifdef CONFIG_NO_HZ
3101 /*
3102 * If we were in the nohz mode recently and busy at the current
3103 * scheduler tick, then check if we need to nominate new idle
3104 * load balancer.
3105 */
3106 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3107 rq->in_nohz_recently = 0;
3108
3109 if (atomic_read(&nohz.load_balancer) == cpu) {
3110 cpu_clear(cpu, nohz.cpu_mask);
3111 atomic_set(&nohz.load_balancer, -1);
3112 }
3113
3114 if (atomic_read(&nohz.load_balancer) == -1) {
3115 /*
3116 * simple selection for now: Nominate the
3117 * first cpu in the nohz list to be the next
3118 * ilb owner.
3119 *
3120 * TBD: Traverse the sched domains and nominate
3121 * the nearest cpu in the nohz.cpu_mask.
3122 */
3123 int ilb = first_cpu(nohz.cpu_mask);
3124
3125 if (ilb != NR_CPUS)
3126 resched_cpu(ilb);
3127 }
3128 }
3129
3130 /*
3131 * If this cpu is idle and doing idle load balancing for all the
3132 * cpus with ticks stopped, is it time for that to stop?
3133 */
3134 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3135 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3136 resched_cpu(cpu);
3137 return;
3138 }
3139
3140 /*
3141 * If this cpu is idle and the idle load balancing is done by
 3142 * someone else, then there is no need to raise SCHED_SOFTIRQ
3143 */
3144 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3145 cpu_isset(cpu, nohz.cpu_mask))
3146 return;
3147#endif
3148 if (time_after_eq(jiffies, rq->next_balance))
3149 raise_softirq(SCHED_SOFTIRQ);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003150}
Ingo Molnardd41f592007-07-09 18:51:59 +02003151
3152#else /* CONFIG_SMP */
3153
Linus Torvalds1da177e2005-04-16 15:20:36 -07003154/*
3155 * on UP we do not need to balance between CPUs:
3156 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07003157static inline void idle_balance(int cpu, struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003158{
3159}
Ingo Molnardd41f592007-07-09 18:51:59 +02003160
3161/* Avoid "used but not defined" warning on UP */
3162static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3163 unsigned long max_nr_move, unsigned long max_load_move,
3164 struct sched_domain *sd, enum cpu_idle_type idle,
3165 int *all_pinned, unsigned long *load_moved,
3166 int this_best_prio, int best_prio, int best_prio_seen,
3167 struct rq_iterator *iterator)
3168{
3169 *load_moved = 0;
3170
3171 return 0;
3172}
3173
Linus Torvalds1da177e2005-04-16 15:20:36 -07003174#endif
3175
Linus Torvalds1da177e2005-04-16 15:20:36 -07003176DEFINE_PER_CPU(struct kernel_stat, kstat);
3177
3178EXPORT_PER_CPU_SYMBOL(kstat);
3179
3180/*
Ingo Molnar41b86e92007-07-09 18:51:58 +02003181 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3182 * that have not yet been banked in case the task is currently running.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003183 */
Ingo Molnar41b86e92007-07-09 18:51:58 +02003184unsigned long long task_sched_runtime(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003185{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003186 unsigned long flags;
Ingo Molnar41b86e92007-07-09 18:51:58 +02003187 u64 ns, delta_exec;
3188 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003189
Ingo Molnar41b86e92007-07-09 18:51:58 +02003190 rq = task_rq_lock(p, &flags);
3191 ns = p->se.sum_exec_runtime;
3192 if (rq->curr == p) {
3193 delta_exec = rq_clock(rq) - p->se.exec_start;
3194 if ((s64)delta_exec > 0)
3195 ns += delta_exec;
3196 }
3197 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07003198
Linus Torvalds1da177e2005-04-16 15:20:36 -07003199 return ns;
3200}
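/*
 * Illustrative sketch (not part of the original file): from userspace,
 * per-thread CPU time of this kind is visible through the POSIX CPU
 * clocks. Whether CLOCK_THREAD_CPUTIME_ID is served exactly by
 * task_sched_runtime() is an assumption about this kernel vintage, but
 * the reported value has the same meaning (nanoseconds actually run):
 *
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct timespec ts;
 *
 *		if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
 *			printf("CPU time used: %ld.%09ld s\n",
 *			       (long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}
 *
 * (Older glibc needs -lrt for clock_gettime().)
 */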
3201
3202/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07003203 * Account user cpu time to a process.
3204 * @p: the process that the cpu time gets accounted to
3206 * @cputime: the cpu time spent in user space since the last update
3207 */
3208void account_user_time(struct task_struct *p, cputime_t cputime)
3209{
3210 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3211 cputime64_t tmp;
3212
3213 p->utime = cputime_add(p->utime, cputime);
3214
3215 /* Add user time to cpustat. */
3216 tmp = cputime_to_cputime64(cputime);
3217 if (TASK_NICE(p) > 0)
3218 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3219 else
3220 cpustat->user = cputime64_add(cpustat->user, tmp);
3221}
3222
3223/*
3224 * Account system cpu time to a process.
3225 * @p: the process that the cpu time gets accounted to
3226 * @hardirq_offset: the offset to subtract from hardirq_count()
3227 * @cputime: the cpu time spent in kernel space since the last update
3228 */
3229void account_system_time(struct task_struct *p, int hardirq_offset,
3230 cputime_t cputime)
3231{
3232 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003233 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003234 cputime64_t tmp;
3235
3236 p->stime = cputime_add(p->stime, cputime);
3237
3238 /* Add system time to cpustat. */
3239 tmp = cputime_to_cputime64(cputime);
3240 if (hardirq_count() - hardirq_offset)
3241 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3242 else if (softirq_count())
3243 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3244 else if (p != rq->idle)
3245 cpustat->system = cputime64_add(cpustat->system, tmp);
3246 else if (atomic_read(&rq->nr_iowait) > 0)
3247 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3248 else
3249 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3250 /* Account for system time used */
3251 acct_update_integrals(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003252}
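/*
 * Illustrative sketch (not part of the original file): the two accounting
 * helpers above are driven once per tick from the timer code. Roughly what
 * update_process_times() does in kernels of this vintage (treat the exact
 * call site as an approximation):
 *
 *	void tick_accounting_sketch(struct task_struct *p, int user_tick)
 *	{
 *		cputime_t one_tick = jiffies_to_cputime(1);
 *
 *		if (user_tick)
 *			account_user_time(p, one_tick);
 *		else
 *			account_system_time(p, HARDIRQ_OFFSET, one_tick);
 *	}
 *
 * account_system_time() then splits the charge further into irq, softirq,
 * system, iowait or idle, as shown above.
 */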
3253
3254/*
3255 * Account for involuntary wait time.
3256 * @p: the process from which the cpu time has been stolen
3257 * @steal: the cpu time spent in involuntary wait
3258 */
3259void account_steal_time(struct task_struct *p, cputime_t steal)
3260{
3261 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3262 cputime64_t tmp = cputime_to_cputime64(steal);
Ingo Molnar70b97a72006-07-03 00:25:42 -07003263 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003264
3265 if (p == rq->idle) {
3266 p->stime = cputime_add(p->stime, steal);
3267 if (atomic_read(&rq->nr_iowait) > 0)
3268 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3269 else
3270 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3271 } else
3272 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3273}
3274
Christoph Lameter7835b982006-12-10 02:20:22 -08003275/*
3276 * This function gets called by the timer code, with HZ frequency.
3277 * We call it with interrupts disabled.
3278 *
3279 * It also gets called by the fork code, when changing the parent's
3280 * timeslices.
3281 */
3282void scheduler_tick(void)
3283{
Christoph Lameter7835b982006-12-10 02:20:22 -08003284 int cpu = smp_processor_id();
3285 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003286 struct task_struct *curr = rq->curr;
Christoph Lameter7835b982006-12-10 02:20:22 -08003287
Ingo Molnardd41f592007-07-09 18:51:59 +02003288 spin_lock(&rq->lock);
3289 if (curr != rq->idle) /* FIXME: needed? */
3290 curr->sched_class->task_tick(rq, curr);
3291 update_cpu_load(rq);
3292 spin_unlock(&rq->lock);
3293
Christoph Lametere418e1c2006-12-10 02:20:23 -08003294#ifdef CONFIG_SMP
Ingo Molnardd41f592007-07-09 18:51:59 +02003295 rq->idle_at_tick = idle_cpu(cpu);
3296 trigger_load_balance(rq, cpu);
Christoph Lametere418e1c2006-12-10 02:20:23 -08003297#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003298}
3299
Linus Torvalds1da177e2005-04-16 15:20:36 -07003300#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3301
3302void fastcall add_preempt_count(int val)
3303{
3304 /*
3305 * Underflow?
3306 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003307 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3308 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003309 preempt_count() += val;
3310 /*
3311 * Spinlock count overflowing soon?
3312 */
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08003313 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3314 PREEMPT_MASK - 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003315}
3316EXPORT_SYMBOL(add_preempt_count);
3317
3318void fastcall sub_preempt_count(int val)
3319{
3320 /*
3321 * Underflow?
3322 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003323 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3324 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003325 /*
3326 * Is the spinlock portion underflowing?
3327 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003328 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3329 !(preempt_count() & PREEMPT_MASK)))
3330 return;
3331
Linus Torvalds1da177e2005-04-16 15:20:36 -07003332 preempt_count() -= val;
3333}
3334EXPORT_SYMBOL(sub_preempt_count);
3335
3336#endif
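/*
 * Illustrative sketch (not part of the original file): add_preempt_count()
 * and sub_preempt_count() sit underneath preempt_disable()/preempt_enable().
 * A typical per-CPU critical section looks like this (my_counter is a
 * hypothetical per-CPU variable):
 *
 *	static DEFINE_PER_CPU(int, my_counter);
 *
 *	static void bump_my_counter(void)
 *	{
 *		preempt_disable();		// add_preempt_count(1)
 *		__get_cpu_var(my_counter)++;	// safe: we cannot migrate here
 *		preempt_enable();		// sub_preempt_count(1) + resched check
 *	}
 */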
3337
3338/*
Ingo Molnardd41f592007-07-09 18:51:59 +02003339 * Print scheduling while atomic bug:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003340 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003341static noinline void __schedule_bug(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003342{
Ingo Molnardd41f592007-07-09 18:51:59 +02003343 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3344 prev->comm, preempt_count(), prev->pid);
3345 debug_show_held_locks(prev);
3346 if (irqs_disabled())
3347 print_irqtrace_events(prev);
3348 dump_stack();
3349}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003350
Ingo Molnardd41f592007-07-09 18:51:59 +02003351/*
3352 * Various schedule()-time debugging checks and statistics:
3353 */
3354static inline void schedule_debug(struct task_struct *prev)
3355{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003356 /*
3357 * Test if we are atomic. Since do_exit() needs to call into
3358 * schedule() atomically, we ignore that path for now.
3359 * Otherwise, whine if we are scheduling when we should not be.
3360 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003361 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3362 __schedule_bug(prev);
3363
Linus Torvalds1da177e2005-04-16 15:20:36 -07003364 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3365
Ingo Molnardd41f592007-07-09 18:51:59 +02003366 schedstat_inc(this_rq(), sched_cnt);
3367}
3368
3369/*
3370 * Pick up the highest-prio task:
3371 */
3372static inline struct task_struct *
3373pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3374{
3375 struct sched_class *class;
3376 struct task_struct *p;
3377
3378 /*
3379 * Optimization: we know that if all tasks are in
3380 * the fair class we can call that function directly:
3381 */
3382 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3383 p = fair_sched_class.pick_next_task(rq, now);
3384 if (likely(p))
3385 return p;
3386 }
3387
3388 class = sched_class_highest;
3389 for ( ; ; ) {
3390 p = class->pick_next_task(rq, now);
3391 if (p)
3392 return p;
3393 /*
3394 * Will never be NULL as the idle class always
3395 * returns a non-NULL p:
3396 */
3397 class = class->next;
3398 }
3399}
3400
3401/*
3402 * schedule() is the main scheduler function.
3403 */
3404asmlinkage void __sched schedule(void)
3405{
3406 struct task_struct *prev, *next;
3407 long *switch_count;
3408 struct rq *rq;
3409 u64 now;
3410 int cpu;
3411
Linus Torvalds1da177e2005-04-16 15:20:36 -07003412need_resched:
3413 preempt_disable();
Ingo Molnardd41f592007-07-09 18:51:59 +02003414 cpu = smp_processor_id();
3415 rq = cpu_rq(cpu);
3416 rcu_qsctr_inc(cpu);
3417 prev = rq->curr;
3418 switch_count = &prev->nivcsw;
3419
Linus Torvalds1da177e2005-04-16 15:20:36 -07003420 release_kernel_lock(prev);
3421need_resched_nonpreemptible:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003422
Ingo Molnardd41f592007-07-09 18:51:59 +02003423 schedule_debug(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003424
3425 spin_lock_irq(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003426 clear_tsk_need_resched(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003427
Ingo Molnardd41f592007-07-09 18:51:59 +02003428 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3429 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3430 unlikely(signal_pending(prev)))) {
3431 prev->state = TASK_RUNNING;
3432 } else {
3433 deactivate_task(rq, prev, 1);
3434 }
3435 switch_count = &prev->nvcsw;
3436 }
3437
3438 if (unlikely(!rq->nr_running))
3439 idle_balance(cpu, rq);
3440
3441 now = __rq_clock(rq);
3442 prev->sched_class->put_prev_task(rq, prev, now);
3443 next = pick_next_task(rq, prev, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003444
3445 sched_info_switch(prev, next);
Ingo Molnardd41f592007-07-09 18:51:59 +02003446
Linus Torvalds1da177e2005-04-16 15:20:36 -07003447 if (likely(prev != next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003448 rq->nr_switches++;
3449 rq->curr = next;
3450 ++*switch_count;
3451
Ingo Molnardd41f592007-07-09 18:51:59 +02003452 context_switch(rq, prev, next); /* unlocks the rq */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003453 } else
3454 spin_unlock_irq(&rq->lock);
3455
Ingo Molnardd41f592007-07-09 18:51:59 +02003456 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3457 cpu = smp_processor_id();
3458 rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003459 goto need_resched_nonpreemptible;
Ingo Molnardd41f592007-07-09 18:51:59 +02003460 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003461 preempt_enable_no_resched();
3462 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3463 goto need_resched;
3464}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003465EXPORT_SYMBOL(schedule);
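/*
 * Illustrative sketch (not part of the original file): the canonical way to
 * sleep via schedule() is to publish a task state first and re-check the
 * wakeup condition in a loop (my_waitqueue/my_condition are hypothetical):
 *
 *	DEFINE_WAIT(wait);
 *
 *	for (;;) {
 *		prepare_to_wait(&my_waitqueue, &wait, TASK_INTERRUPTIBLE);
 *		if (my_condition)
 *			break;
 *		if (signal_pending(current))
 *			break;			// interruptible sleep
 *		schedule();			// give up the CPU until woken
 *	}
 *	finish_wait(&my_waitqueue, &wait);
 *
 * The waker sets my_condition and calls wake_up(&my_waitqueue), after which
 * the schedule() above returns with the task back in TASK_RUNNING.
 */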
3466
3467#ifdef CONFIG_PREEMPT
3468/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003469 * this is the entry point to schedule() from in-kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003470 * off of preempt_enable. Kernel preemptions off of return-from-interrupt
 3471 * go through preempt_schedule_irq() below and call schedule() directly.
3472 */
3473asmlinkage void __sched preempt_schedule(void)
3474{
3475 struct thread_info *ti = current_thread_info();
3476#ifdef CONFIG_PREEMPT_BKL
3477 struct task_struct *task = current;
3478 int saved_lock_depth;
3479#endif
3480 /*
3481 * If there is a non-zero preempt_count or interrupts are disabled,
3482 * we do not want to preempt the current task. Just return..
3483 */
Nick Pigginbeed33a2006-10-11 01:21:52 -07003484 if (likely(ti->preempt_count || irqs_disabled()))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003485 return;
3486
3487need_resched:
3488 add_preempt_count(PREEMPT_ACTIVE);
3489 /*
3490 * We keep the big kernel semaphore locked, but we
 3491	 * clear ->lock_depth so that schedule() doesn't
3492 * auto-release the semaphore:
3493 */
3494#ifdef CONFIG_PREEMPT_BKL
3495 saved_lock_depth = task->lock_depth;
3496 task->lock_depth = -1;
3497#endif
3498 schedule();
3499#ifdef CONFIG_PREEMPT_BKL
3500 task->lock_depth = saved_lock_depth;
3501#endif
3502 sub_preempt_count(PREEMPT_ACTIVE);
3503
3504 /* we could miss a preemption opportunity between schedule and now */
3505 barrier();
3506 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3507 goto need_resched;
3508}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003509EXPORT_SYMBOL(preempt_schedule);
3510
3511/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003512 * this is the entry point to schedule() from kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003513 * off of irq context.
 3514 * Note that this is called and returns with irqs disabled. This will
3515 * protect us against recursive calling from irq.
3516 */
3517asmlinkage void __sched preempt_schedule_irq(void)
3518{
3519 struct thread_info *ti = current_thread_info();
3520#ifdef CONFIG_PREEMPT_BKL
3521 struct task_struct *task = current;
3522 int saved_lock_depth;
3523#endif
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003524 /* Catch callers which need to be fixed */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003525 BUG_ON(ti->preempt_count || !irqs_disabled());
3526
3527need_resched:
3528 add_preempt_count(PREEMPT_ACTIVE);
3529 /*
3530 * We keep the big kernel semaphore locked, but we
 3531	 * clear ->lock_depth so that schedule() doesn't
3532 * auto-release the semaphore:
3533 */
3534#ifdef CONFIG_PREEMPT_BKL
3535 saved_lock_depth = task->lock_depth;
3536 task->lock_depth = -1;
3537#endif
3538 local_irq_enable();
3539 schedule();
3540 local_irq_disable();
3541#ifdef CONFIG_PREEMPT_BKL
3542 task->lock_depth = saved_lock_depth;
3543#endif
3544 sub_preempt_count(PREEMPT_ACTIVE);
3545
3546 /* we could miss a preemption opportunity between schedule and now */
3547 barrier();
3548 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3549 goto need_resched;
3550}
3551
3552#endif /* CONFIG_PREEMPT */
3553
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003554int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3555 void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003556{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003557 return try_to_wake_up(curr->private, mode, sync);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003558}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003559EXPORT_SYMBOL(default_wake_function);
3560
3561/*
3562 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3563 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3564 * number) then we wake all the non-exclusive tasks and one exclusive task.
3565 *
3566 * There are circumstances in which we can try to wake a task which has already
3567 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3568 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3569 */
3570static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3571 int nr_exclusive, int sync, void *key)
3572{
3573 struct list_head *tmp, *next;
3574
3575 list_for_each_safe(tmp, next, &q->task_list) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07003576 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3577 unsigned flags = curr->flags;
3578
Linus Torvalds1da177e2005-04-16 15:20:36 -07003579 if (curr->func(curr, mode, sync, key) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07003580 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003581 break;
3582 }
3583}
3584
3585/**
3586 * __wake_up - wake up threads blocked on a waitqueue.
3587 * @q: the waitqueue
3588 * @mode: which threads
3589 * @nr_exclusive: how many wake-one or wake-many threads to wake up
Martin Waitz67be2dd2005-05-01 08:59:26 -07003590 * @key: is directly passed to the wakeup function
Linus Torvalds1da177e2005-04-16 15:20:36 -07003591 */
3592void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003593 int nr_exclusive, void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003594{
3595 unsigned long flags;
3596
3597 spin_lock_irqsave(&q->lock, flags);
3598 __wake_up_common(q, mode, nr_exclusive, 0, key);
3599 spin_unlock_irqrestore(&q->lock, flags);
3600}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003601EXPORT_SYMBOL(__wake_up);
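/*
 * Illustrative sketch (not part of the original file): __wake_up() is almost
 * always reached through the wake_up*() macros, paired with wait_event*()
 * on the sleeping side (my_wq/my_flag are hypothetical):
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *	static int my_flag;
 *
 *	// sleeper
 *	if (wait_event_interruptible(my_wq, my_flag != 0))
 *		return -ERESTARTSYS;	// woken by a signal instead
 *
 *	// waker
 *	my_flag = 1;
 *	wake_up(&my_wq);		// ends up in __wake_up() above
 */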
3602
3603/*
3604 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3605 */
3606void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3607{
3608 __wake_up_common(q, mode, 1, 0, NULL);
3609}
3610
3611/**
Martin Waitz67be2dd2005-05-01 08:59:26 -07003612 * __wake_up_sync - wake up threads blocked on a waitqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003613 * @q: the waitqueue
3614 * @mode: which threads
3615 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3616 *
 3617 * The sync wakeup differs in that the waker knows that it will schedule
3618 * away soon, so while the target thread will be woken up, it will not
3619 * be migrated to another CPU - ie. the two threads are 'synchronized'
3620 * with each other. This can prevent needless bouncing between CPUs.
3621 *
3622 * On UP it can prevent extra preemption.
3623 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003624void fastcall
3625__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003626{
3627 unsigned long flags;
3628 int sync = 1;
3629
3630 if (unlikely(!q))
3631 return;
3632
3633 if (unlikely(!nr_exclusive))
3634 sync = 0;
3635
3636 spin_lock_irqsave(&q->lock, flags);
3637 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3638 spin_unlock_irqrestore(&q->lock, flags);
3639}
3640EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3641
3642void fastcall complete(struct completion *x)
3643{
3644 unsigned long flags;
3645
3646 spin_lock_irqsave(&x->wait.lock, flags);
3647 x->done++;
3648 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3649 1, 0, NULL);
3650 spin_unlock_irqrestore(&x->wait.lock, flags);
3651}
3652EXPORT_SYMBOL(complete);
3653
3654void fastcall complete_all(struct completion *x)
3655{
3656 unsigned long flags;
3657
3658 spin_lock_irqsave(&x->wait.lock, flags);
3659 x->done += UINT_MAX/2;
3660 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3661 0, 0, NULL);
3662 spin_unlock_irqrestore(&x->wait.lock, flags);
3663}
3664EXPORT_SYMBOL(complete_all);
3665
3666void fastcall __sched wait_for_completion(struct completion *x)
3667{
3668 might_sleep();
Ingo Molnar48f24c42006-07-03 00:25:40 -07003669
Linus Torvalds1da177e2005-04-16 15:20:36 -07003670 spin_lock_irq(&x->wait.lock);
3671 if (!x->done) {
3672 DECLARE_WAITQUEUE(wait, current);
3673
3674 wait.flags |= WQ_FLAG_EXCLUSIVE;
3675 __add_wait_queue_tail(&x->wait, &wait);
3676 do {
3677 __set_current_state(TASK_UNINTERRUPTIBLE);
3678 spin_unlock_irq(&x->wait.lock);
3679 schedule();
3680 spin_lock_irq(&x->wait.lock);
3681 } while (!x->done);
3682 __remove_wait_queue(&x->wait, &wait);
3683 }
3684 x->done--;
3685 spin_unlock_irq(&x->wait.lock);
3686}
3687EXPORT_SYMBOL(wait_for_completion);
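/*
 * Illustrative sketch (not part of the original file): typical use of the
 * completion API exported above; start_my_operation() is a hypothetical
 * helper whose interrupt handler (or other context) calls complete():
 *
 *	struct completion done;
 *
 *	init_completion(&done);
 *	start_my_operation(&done);	// will call complete(&done) when finished
 *	wait_for_completion(&done);	// sleeps uninterruptibly until then
 *
 * wait_for_completion_timeout()/..._interruptible() below are the variants
 * to use when the wait must be bounded or signal-aware.
 */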
3688
3689unsigned long fastcall __sched
3690wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3691{
3692 might_sleep();
3693
3694 spin_lock_irq(&x->wait.lock);
3695 if (!x->done) {
3696 DECLARE_WAITQUEUE(wait, current);
3697
3698 wait.flags |= WQ_FLAG_EXCLUSIVE;
3699 __add_wait_queue_tail(&x->wait, &wait);
3700 do {
3701 __set_current_state(TASK_UNINTERRUPTIBLE);
3702 spin_unlock_irq(&x->wait.lock);
3703 timeout = schedule_timeout(timeout);
3704 spin_lock_irq(&x->wait.lock);
3705 if (!timeout) {
3706 __remove_wait_queue(&x->wait, &wait);
3707 goto out;
3708 }
3709 } while (!x->done);
3710 __remove_wait_queue(&x->wait, &wait);
3711 }
3712 x->done--;
3713out:
3714 spin_unlock_irq(&x->wait.lock);
3715 return timeout;
3716}
3717EXPORT_SYMBOL(wait_for_completion_timeout);
3718
3719int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3720{
3721 int ret = 0;
3722
3723 might_sleep();
3724
3725 spin_lock_irq(&x->wait.lock);
3726 if (!x->done) {
3727 DECLARE_WAITQUEUE(wait, current);
3728
3729 wait.flags |= WQ_FLAG_EXCLUSIVE;
3730 __add_wait_queue_tail(&x->wait, &wait);
3731 do {
3732 if (signal_pending(current)) {
3733 ret = -ERESTARTSYS;
3734 __remove_wait_queue(&x->wait, &wait);
3735 goto out;
3736 }
3737 __set_current_state(TASK_INTERRUPTIBLE);
3738 spin_unlock_irq(&x->wait.lock);
3739 schedule();
3740 spin_lock_irq(&x->wait.lock);
3741 } while (!x->done);
3742 __remove_wait_queue(&x->wait, &wait);
3743 }
3744 x->done--;
3745out:
3746 spin_unlock_irq(&x->wait.lock);
3747
3748 return ret;
3749}
3750EXPORT_SYMBOL(wait_for_completion_interruptible);
3751
3752unsigned long fastcall __sched
3753wait_for_completion_interruptible_timeout(struct completion *x,
3754 unsigned long timeout)
3755{
3756 might_sleep();
3757
3758 spin_lock_irq(&x->wait.lock);
3759 if (!x->done) {
3760 DECLARE_WAITQUEUE(wait, current);
3761
3762 wait.flags |= WQ_FLAG_EXCLUSIVE;
3763 __add_wait_queue_tail(&x->wait, &wait);
3764 do {
3765 if (signal_pending(current)) {
3766 timeout = -ERESTARTSYS;
3767 __remove_wait_queue(&x->wait, &wait);
3768 goto out;
3769 }
3770 __set_current_state(TASK_INTERRUPTIBLE);
3771 spin_unlock_irq(&x->wait.lock);
3772 timeout = schedule_timeout(timeout);
3773 spin_lock_irq(&x->wait.lock);
3774 if (!timeout) {
3775 __remove_wait_queue(&x->wait, &wait);
3776 goto out;
3777 }
3778 } while (!x->done);
3779 __remove_wait_queue(&x->wait, &wait);
3780 }
3781 x->done--;
3782out:
3783 spin_unlock_irq(&x->wait.lock);
3784 return timeout;
3785}
3786EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3787
3788
3789#define SLEEP_ON_VAR \
3790 unsigned long flags; \
3791 wait_queue_t wait; \
3792 init_waitqueue_entry(&wait, current);
3793
3794#define SLEEP_ON_HEAD \
3795 spin_lock_irqsave(&q->lock,flags); \
3796 __add_wait_queue(q, &wait); \
3797 spin_unlock(&q->lock);
3798
3799#define SLEEP_ON_TAIL \
3800 spin_lock_irq(&q->lock); \
3801 __remove_wait_queue(q, &wait); \
3802 spin_unlock_irqrestore(&q->lock, flags);
3803
3804void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3805{
3806 SLEEP_ON_VAR
3807
3808 current->state = TASK_INTERRUPTIBLE;
3809
3810 SLEEP_ON_HEAD
3811 schedule();
3812 SLEEP_ON_TAIL
3813}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003814EXPORT_SYMBOL(interruptible_sleep_on);
3815
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003816long fastcall __sched
3817interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003818{
3819 SLEEP_ON_VAR
3820
3821 current->state = TASK_INTERRUPTIBLE;
3822
3823 SLEEP_ON_HEAD
3824 timeout = schedule_timeout(timeout);
3825 SLEEP_ON_TAIL
3826
3827 return timeout;
3828}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003829EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3830
3831void fastcall __sched sleep_on(wait_queue_head_t *q)
3832{
3833 SLEEP_ON_VAR
3834
3835 current->state = TASK_UNINTERRUPTIBLE;
3836
3837 SLEEP_ON_HEAD
3838 schedule();
3839 SLEEP_ON_TAIL
3840}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003841EXPORT_SYMBOL(sleep_on);
3842
3843long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3844{
3845 SLEEP_ON_VAR
3846
3847 current->state = TASK_UNINTERRUPTIBLE;
3848
3849 SLEEP_ON_HEAD
3850 timeout = schedule_timeout(timeout);
3851 SLEEP_ON_TAIL
3852
3853 return timeout;
3854}
3855
3856EXPORT_SYMBOL(sleep_on_timeout);
3857
Ingo Molnarb29739f2006-06-27 02:54:51 -07003858#ifdef CONFIG_RT_MUTEXES
3859
3860/*
3861 * rt_mutex_setprio - set the current priority of a task
3862 * @p: task
3863 * @prio: prio value (kernel-internal form)
3864 *
3865 * This function changes the 'effective' priority of a task. It does
3866 * not touch ->normal_prio like __setscheduler().
3867 *
3868 * Used by the rt_mutex code to implement priority inheritance logic.
3869 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003870void rt_mutex_setprio(struct task_struct *p, int prio)
Ingo Molnarb29739f2006-06-27 02:54:51 -07003871{
3872 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02003873 int oldprio, on_rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003874 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003875 u64 now;
Ingo Molnarb29739f2006-06-27 02:54:51 -07003876
3877 BUG_ON(prio < 0 || prio > MAX_PRIO);
3878
3879 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003880 now = rq_clock(rq);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003881
Andrew Mortond5f9f942007-05-08 20:27:06 -07003882 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02003883 on_rq = p->se.on_rq;
3884 if (on_rq)
3885 dequeue_task(rq, p, 0, now);
3886
3887 if (rt_prio(prio))
3888 p->sched_class = &rt_sched_class;
3889 else
3890 p->sched_class = &fair_sched_class;
3891
Ingo Molnarb29739f2006-06-27 02:54:51 -07003892 p->prio = prio;
3893
Ingo Molnardd41f592007-07-09 18:51:59 +02003894 if (on_rq) {
3895 enqueue_task(rq, p, 0, now);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003896 /*
3897 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07003898 * our priority decreased, or if we are not currently running on
3899 * this runqueue and our priority is higher than the current's
Ingo Molnarb29739f2006-06-27 02:54:51 -07003900 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003901 if (task_running(rq, p)) {
3902 if (p->prio > oldprio)
3903 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02003904 } else {
3905 check_preempt_curr(rq, p);
3906 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07003907 }
3908 task_rq_unlock(rq, &flags);
3909}
3910
3911#endif
3912
Ingo Molnar36c8b582006-07-03 00:25:41 -07003913void set_user_nice(struct task_struct *p, long nice)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003914{
Ingo Molnardd41f592007-07-09 18:51:59 +02003915 int old_prio, delta, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003916 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003917 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003918 u64 now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003919
3920 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3921 return;
3922 /*
3923 * We have to be careful, if called from sys_setpriority(),
3924 * the task might be in the middle of scheduling on another CPU.
3925 */
3926 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003927 now = rq_clock(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003928 /*
3929 * The RT priorities are set via sched_setscheduler(), but we still
3930 * allow the 'normal' nice value to be set - but as expected
3931 * it wont have any effect on scheduling until the task is
Ingo Molnardd41f592007-07-09 18:51:59 +02003932 * SCHED_FIFO/SCHED_RR:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003933 */
Ingo Molnare05606d2007-07-09 18:51:59 +02003934 if (task_has_rt_policy(p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003935 p->static_prio = NICE_TO_PRIO(nice);
3936 goto out_unlock;
3937 }
Ingo Molnardd41f592007-07-09 18:51:59 +02003938 on_rq = p->se.on_rq;
3939 if (on_rq) {
3940 dequeue_task(rq, p, 0, now);
3941 dec_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -07003942 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003943
Linus Torvalds1da177e2005-04-16 15:20:36 -07003944 p->static_prio = NICE_TO_PRIO(nice);
Peter Williams2dd73a42006-06-27 02:54:34 -07003945 set_load_weight(p);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003946 old_prio = p->prio;
3947 p->prio = effective_prio(p);
3948 delta = p->prio - old_prio;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003949
Ingo Molnardd41f592007-07-09 18:51:59 +02003950 if (on_rq) {
3951 enqueue_task(rq, p, 0, now);
3952 inc_load(rq, p, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003953 /*
Andrew Mortond5f9f942007-05-08 20:27:06 -07003954 * If the task increased its priority or is running and
3955 * lowered its priority, then reschedule its CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003956 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003957 if (delta < 0 || (delta > 0 && task_running(rq, p)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003958 resched_task(rq->curr);
3959 }
3960out_unlock:
3961 task_rq_unlock(rq, &flags);
3962}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003963EXPORT_SYMBOL(set_user_nice);
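/*
 * Illustrative sketch (not part of the original file): a common in-kernel
 * use of set_user_nice() is a kernel thread demoting itself so it only runs
 * when nothing else wants the CPU (do_background_work() is hypothetical):
 *
 *	static int my_background_thread(void *unused)
 *	{
 *		set_user_nice(current, 19);
 *		while (!kthread_should_stop())
 *			do_background_work();
 *		return 0;
 *	}
 */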
3964
Matt Mackalle43379f2005-05-01 08:59:00 -07003965/*
3966 * can_nice - check if a task can reduce its nice value
3967 * @p: task
3968 * @nice: nice value
3969 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003970int can_nice(const struct task_struct *p, const int nice)
Matt Mackalle43379f2005-05-01 08:59:00 -07003971{
Matt Mackall024f4742005-08-18 11:24:19 -07003972 /* convert nice value [19,-20] to rlimit style value [1,40] */
3973 int nice_rlim = 20 - nice;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003974
Matt Mackalle43379f2005-05-01 08:59:00 -07003975 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3976 capable(CAP_SYS_NICE));
3977}
3978
Linus Torvalds1da177e2005-04-16 15:20:36 -07003979#ifdef __ARCH_WANT_SYS_NICE
3980
3981/*
3982 * sys_nice - change the priority of the current process.
3983 * @increment: priority increment
3984 *
3985 * sys_setpriority is a more generic, but much slower function that
3986 * does similar things.
3987 */
3988asmlinkage long sys_nice(int increment)
3989{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003990 long nice, retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003991
3992 /*
3993 * Setpriority might change our priority at the same moment.
3994 * We don't have to worry. Conceptually one call occurs first
3995 * and we have a single winner.
3996 */
Matt Mackalle43379f2005-05-01 08:59:00 -07003997 if (increment < -40)
3998 increment = -40;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003999 if (increment > 40)
4000 increment = 40;
4001
4002 nice = PRIO_TO_NICE(current->static_prio) + increment;
4003 if (nice < -20)
4004 nice = -20;
4005 if (nice > 19)
4006 nice = 19;
4007
Matt Mackalle43379f2005-05-01 08:59:00 -07004008 if (increment < 0 && !can_nice(current, nice))
4009 return -EPERM;
4010
Linus Torvalds1da177e2005-04-16 15:20:36 -07004011 retval = security_task_setnice(current, nice);
4012 if (retval)
4013 return retval;
4014
4015 set_user_nice(current, nice);
4016 return 0;
4017}
4018
4019#endif
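/*
 * Illustrative sketch (not part of the original file): the userspace view of
 * sys_nice() and of the RLIMIT_NICE check in can_nice() above (assumes a
 * glibc that exposes RLIMIT_NICE):
 *
 *	#include <stdio.h>
 *	#include <errno.h>
 *	#include <unistd.h>
 *	#include <sys/resource.h>
 *
 *	int main(void)
 *	{
 *		struct rlimit rl;
 *
 *		getrlimit(RLIMIT_NICE, &rl);
 *		// rlimit value [1,40] maps to nice [19,-20]: nice = 20 - rlim_cur
 *		printf("lowest nice reachable without CAP_SYS_NICE: %ld\n",
 *		       20 - (long)rl.rlim_cur);
 *
 *		errno = 0;
 *		if (nice(5) == -1 && errno)	// raising nice is always allowed
 *			perror("nice");
 *		return 0;
 *	}
 */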
4020
4021/**
4022 * task_prio - return the priority value of a given task.
4023 * @p: the task in question.
4024 *
4025 * This is the priority value as seen by users in /proc.
4026 * RT tasks are offset by -200. Normal tasks are centered
4027 * around 0, value goes from -16 to +15.
4028 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004029int task_prio(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004030{
4031 return p->prio - MAX_RT_PRIO;
4032}
4033
4034/**
4035 * task_nice - return the nice value of a given task.
4036 * @p: the task in question.
4037 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004038int task_nice(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004039{
4040 return TASK_NICE(p);
4041}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004042EXPORT_SYMBOL_GPL(task_nice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004043
4044/**
4045 * idle_cpu - is a given cpu idle currently?
4046 * @cpu: the processor in question.
4047 */
4048int idle_cpu(int cpu)
4049{
4050 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4051}
4052
Linus Torvalds1da177e2005-04-16 15:20:36 -07004053/**
4054 * idle_task - return the idle task for a given cpu.
4055 * @cpu: the processor in question.
4056 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004057struct task_struct *idle_task(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004058{
4059 return cpu_rq(cpu)->idle;
4060}
4061
4062/**
4063 * find_process_by_pid - find a process with a matching PID value.
4064 * @pid: the pid in question.
4065 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004066static inline struct task_struct *find_process_by_pid(pid_t pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004067{
4068 return pid ? find_task_by_pid(pid) : current;
4069}
4070
4071/* Actually do priority change: must hold rq lock. */
Ingo Molnardd41f592007-07-09 18:51:59 +02004072static void
4073__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004074{
Ingo Molnardd41f592007-07-09 18:51:59 +02004075 BUG_ON(p->se.on_rq);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004076
Linus Torvalds1da177e2005-04-16 15:20:36 -07004077 p->policy = policy;
Ingo Molnardd41f592007-07-09 18:51:59 +02004078 switch (p->policy) {
4079 case SCHED_NORMAL:
4080 case SCHED_BATCH:
4081 case SCHED_IDLE:
4082 p->sched_class = &fair_sched_class;
4083 break;
4084 case SCHED_FIFO:
4085 case SCHED_RR:
4086 p->sched_class = &rt_sched_class;
4087 break;
4088 }
4089
Linus Torvalds1da177e2005-04-16 15:20:36 -07004090 p->rt_priority = prio;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004091 p->normal_prio = normal_prio(p);
4092 /* we are holding p->pi_lock already */
4093 p->prio = rt_mutex_getprio(p);
Peter Williams2dd73a42006-06-27 02:54:34 -07004094 set_load_weight(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004095}
4096
4097/**
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004098 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004099 * @p: the task in question.
4100 * @policy: new policy.
4101 * @param: structure containing the new RT priority.
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004102 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004103 * NOTE that the task may already be dead.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004104 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004105int sched_setscheduler(struct task_struct *p, int policy,
4106 struct sched_param *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004107{
Ingo Molnardd41f592007-07-09 18:51:59 +02004108 int retval, oldprio, oldpolicy = -1, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004109 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004110 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004111
Steven Rostedt66e53932006-06-27 02:54:44 -07004112 /* may grab non-irq protected spin_locks */
4113 BUG_ON(in_interrupt());
Linus Torvalds1da177e2005-04-16 15:20:36 -07004114recheck:
4115 /* double check policy once rq lock held */
4116 if (policy < 0)
4117 policy = oldpolicy = p->policy;
4118 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
Ingo Molnardd41f592007-07-09 18:51:59 +02004119 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4120 policy != SCHED_IDLE)
Ingo Molnarb0a94992006-01-14 13:20:41 -08004121 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004122 /*
4123 * Valid priorities for SCHED_FIFO and SCHED_RR are
Ingo Molnardd41f592007-07-09 18:51:59 +02004124 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4125 * SCHED_BATCH and SCHED_IDLE is 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004126 */
4127 if (param->sched_priority < 0 ||
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004128 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
Steven Rostedtd46523e2005-07-25 16:28:39 -04004129 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004130 return -EINVAL;
Ingo Molnare05606d2007-07-09 18:51:59 +02004131 if (rt_policy(policy) != (param->sched_priority != 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004132 return -EINVAL;
4133
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004134 /*
4135 * Allow unprivileged RT tasks to decrease priority:
4136 */
4137 if (!capable(CAP_SYS_NICE)) {
Ingo Molnare05606d2007-07-09 18:51:59 +02004138 if (rt_policy(policy)) {
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004139 unsigned long rlim_rtprio;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004140
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004141 if (!lock_task_sighand(p, &flags))
4142 return -ESRCH;
4143 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4144 unlock_task_sighand(p, &flags);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004145
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004146 /* can't set/change the rt policy */
4147 if (policy != p->policy && !rlim_rtprio)
4148 return -EPERM;
4149
4150 /* can't increase priority */
4151 if (param->sched_priority > p->rt_priority &&
4152 param->sched_priority > rlim_rtprio)
4153 return -EPERM;
4154 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004155 /*
 4156		 * Like positive nice levels, don't allow tasks to
4157 * move out of SCHED_IDLE either:
4158 */
4159 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4160 return -EPERM;
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004161
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004162 /* can't change other user's priorities */
4163 if ((current->euid != p->euid) &&
4164 (current->euid != p->uid))
4165 return -EPERM;
4166 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004167
4168 retval = security_task_setscheduler(p, policy, param);
4169 if (retval)
4170 return retval;
4171 /*
Ingo Molnarb29739f2006-06-27 02:54:51 -07004172 * make sure no PI-waiters arrive (or leave) while we are
4173 * changing the priority of the task:
4174 */
4175 spin_lock_irqsave(&p->pi_lock, flags);
4176 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07004177	 * To be able to change p->policy safely, the appropriate
4178 * runqueue lock must be held.
4179 */
Ingo Molnarb29739f2006-06-27 02:54:51 -07004180 rq = __task_rq_lock(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004181 /* recheck policy now with rq lock held */
4182 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4183 policy = oldpolicy = -1;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004184 __task_rq_unlock(rq);
4185 spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004186 goto recheck;
4187 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004188 on_rq = p->se.on_rq;
4189 if (on_rq)
4190 deactivate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004191 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02004192 __setscheduler(rq, p, policy, param->sched_priority);
4193 if (on_rq) {
4194 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004195 /*
4196 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07004197 * our priority decreased, or if we are not currently running on
4198 * this runqueue and our priority is higher than the current's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004199 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07004200 if (task_running(rq, p)) {
4201 if (p->prio > oldprio)
4202 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02004203 } else {
4204 check_preempt_curr(rq, p);
4205 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004206 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07004207 __task_rq_unlock(rq);
4208 spin_unlock_irqrestore(&p->pi_lock, flags);
4209
Thomas Gleixner95e02ca2006-06-27 02:55:02 -07004210 rt_mutex_adjust_pi(p);
4211
Linus Torvalds1da177e2005-04-16 15:20:36 -07004212 return 0;
4213}
4214EXPORT_SYMBOL_GPL(sched_setscheduler);
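
/*
 * Illustrative sketch (not part of this file's build): a privileged
 * in-kernel caller that already holds a valid reference to a task 'p'
 * could switch it to SCHED_FIFO roughly like this -- the priority value
 * chosen here is an assumption, not a recommendation:
 *
 *	struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *	if (sched_setscheduler(p, SCHED_FIFO, &sp))
 *		printk(KERN_WARNING "could not set SCHED_FIFO\n");
 */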
4215
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004216static int
4217do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004218{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004219 struct sched_param lparam;
4220 struct task_struct *p;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004221 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004222
4223 if (!param || pid < 0)
4224 return -EINVAL;
4225 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4226 return -EFAULT;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004227
4228 rcu_read_lock();
4229 retval = -ESRCH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004230 p = find_process_by_pid(pid);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004231 if (p != NULL)
4232 retval = sched_setscheduler(p, policy, &lparam);
4233 rcu_read_unlock();
Ingo Molnar36c8b582006-07-03 00:25:41 -07004234
Linus Torvalds1da177e2005-04-16 15:20:36 -07004235 return retval;
4236}
4237
4238/**
4239 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4240 * @pid: the pid in question.
4241 * @policy: new policy.
4242 * @param: structure containing the new RT priority.
4243 */
4244asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4245 struct sched_param __user *param)
4246{
Jason Baronc21761f2006-01-18 17:43:03 -08004247 /* negative values for policy are not valid */
4248 if (policy < 0)
4249 return -EINVAL;
4250
Linus Torvalds1da177e2005-04-16 15:20:36 -07004251 return do_sched_setscheduler(pid, policy, param);
4252}
4253
4254/**
4255 * sys_sched_setparam - set/change the RT priority of a thread
4256 * @pid: the pid in question.
4257 * @param: structure containing the new RT priority.
4258 */
4259asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4260{
4261 return do_sched_setscheduler(pid, -1, param);
4262}
4263
4264/**
4265 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4266 * @pid: the pid in question.
4267 */
4268asmlinkage long sys_sched_getscheduler(pid_t pid)
4269{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004270 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004271 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004272
4273 if (pid < 0)
4274 goto out_nounlock;
4275
4276 retval = -ESRCH;
4277 read_lock(&tasklist_lock);
4278 p = find_process_by_pid(pid);
4279 if (p) {
4280 retval = security_task_getscheduler(p);
4281 if (!retval)
4282 retval = p->policy;
4283 }
4284 read_unlock(&tasklist_lock);
4285
4286out_nounlock:
4287 return retval;
4288}
4289
4290/**
4291 * sys_sched_getparam - get the RT priority of a thread
4292 * @pid: the pid in question.
4293 * @param: structure containing the RT priority.
4294 */
4295asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4296{
4297 struct sched_param lp;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004298 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004299 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004300
4301 if (!param || pid < 0)
4302 goto out_nounlock;
4303
4304 read_lock(&tasklist_lock);
4305 p = find_process_by_pid(pid);
4306 retval = -ESRCH;
4307 if (!p)
4308 goto out_unlock;
4309
4310 retval = security_task_getscheduler(p);
4311 if (retval)
4312 goto out_unlock;
4313
4314 lp.sched_priority = p->rt_priority;
4315 read_unlock(&tasklist_lock);
4316
4317 /*
4318	 * This one might sleep, so we cannot do it with a spinlock held ...
4319 */
4320 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4321
4322out_nounlock:
4323 return retval;
4324
4325out_unlock:
4326 read_unlock(&tasklist_lock);
4327 return retval;
4328}
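
/*
 * Illustrative user-space sketch (assumes the libc wrappers for these
 * syscalls; nothing here is compiled as part of the kernel):
 *
 *	struct sched_param sp;
 *
 *	if (sched_getparam(0, &sp) == 0)
 *		printf("RT priority: %d\n", sp.sched_priority);
 */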
4329
4330long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4331{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004332 cpumask_t cpus_allowed;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004333 struct task_struct *p;
4334 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004335
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004336 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004337 read_lock(&tasklist_lock);
4338
4339 p = find_process_by_pid(pid);
4340 if (!p) {
4341 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004342 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004343 return -ESRCH;
4344 }
4345
4346 /*
4347 * It is not safe to call set_cpus_allowed with the
4348 * tasklist_lock held. We will bump the task_struct's
4349 * usage count and then drop tasklist_lock.
4350 */
4351 get_task_struct(p);
4352 read_unlock(&tasklist_lock);
4353
4354 retval = -EPERM;
4355 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4356 !capable(CAP_SYS_NICE))
4357 goto out_unlock;
4358
David Quigleye7834f82006-06-23 02:03:59 -07004359 retval = security_task_setscheduler(p, 0, NULL);
4360 if (retval)
4361 goto out_unlock;
4362
Linus Torvalds1da177e2005-04-16 15:20:36 -07004363 cpus_allowed = cpuset_cpus_allowed(p);
4364 cpus_and(new_mask, new_mask, cpus_allowed);
4365 retval = set_cpus_allowed(p, new_mask);
4366
4367out_unlock:
4368 put_task_struct(p);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004369 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004370 return retval;
4371}
4372
4373static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4374 cpumask_t *new_mask)
4375{
4376 if (len < sizeof(cpumask_t)) {
4377 memset(new_mask, 0, sizeof(cpumask_t));
4378 } else if (len > sizeof(cpumask_t)) {
4379 len = sizeof(cpumask_t);
4380 }
4381 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4382}
4383
4384/**
4385 * sys_sched_setaffinity - set the cpu affinity of a process
4386 * @pid: pid of the process
4387 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4388 * @user_mask_ptr: user-space pointer to the new cpu mask
4389 */
4390asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4391 unsigned long __user *user_mask_ptr)
4392{
4393 cpumask_t new_mask;
4394 int retval;
4395
4396 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4397 if (retval)
4398 return retval;
4399
4400 return sched_setaffinity(pid, new_mask);
4401}
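
/*
 * Illustrative user-space sketch (assumes the glibc CPU_* macros and
 * wrapper; nothing here is compiled as part of the kernel): pin the
 * calling thread to CPU 0:
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(0, &set);
 *	if (sched_setaffinity(0, sizeof(set), &set) == -1)
 *		perror("sched_setaffinity");
 */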
4402
4403/*
4404 * Represents all CPUs present in the system.
4405 * In systems capable of hotplug, this map could dynamically grow
4406 * as new CPUs are detected in the system via any platform-specific
4407 * method, such as ACPI, for example.
4408 */
4409
Andi Kleen4cef0c62006-01-11 22:44:57 +01004410cpumask_t cpu_present_map __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004411EXPORT_SYMBOL(cpu_present_map);
4412
4413#ifndef CONFIG_SMP
Andi Kleen4cef0c62006-01-11 22:44:57 +01004414cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004415EXPORT_SYMBOL(cpu_online_map);
4416
Andi Kleen4cef0c62006-01-11 22:44:57 +01004417cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004418EXPORT_SYMBOL(cpu_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004419#endif
4420
4421long sched_getaffinity(pid_t pid, cpumask_t *mask)
4422{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004423 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004424 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004425
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004426 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004427 read_lock(&tasklist_lock);
4428
4429 retval = -ESRCH;
4430 p = find_process_by_pid(pid);
4431 if (!p)
4432 goto out_unlock;
4433
David Quigleye7834f82006-06-23 02:03:59 -07004434 retval = security_task_getscheduler(p);
4435 if (retval)
4436 goto out_unlock;
4437
Jack Steiner2f7016d2006-02-01 03:05:18 -08004438 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004439
4440out_unlock:
4441 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004442 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004443 if (retval)
4444 return retval;
4445
4446 return 0;
4447}
4448
4449/**
4450 * sys_sched_getaffinity - get the cpu affinity of a process
4451 * @pid: pid of the process
4452 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4453 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4454 */
4455asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4456 unsigned long __user *user_mask_ptr)
4457{
4458 int ret;
4459 cpumask_t mask;
4460
4461 if (len < sizeof(cpumask_t))
4462 return -EINVAL;
4463
4464 ret = sched_getaffinity(pid, &mask);
4465 if (ret < 0)
4466 return ret;
4467
4468 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4469 return -EFAULT;
4470
4471 return sizeof(cpumask_t);
4472}
4473
4474/**
4475 * sys_sched_yield - yield the current processor to other threads.
4476 *
Ingo Molnardd41f592007-07-09 18:51:59 +02004477 * This function yields the current CPU to other tasks. If there are no
4478 * other threads running on this CPU then this function will return.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004479 */
4480asmlinkage long sys_sched_yield(void)
4481{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004482 struct rq *rq = this_rq_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004483
4484 schedstat_inc(rq, yld_cnt);
Ingo Molnardd41f592007-07-09 18:51:59 +02004485 if (unlikely(rq->nr_running == 1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004486 schedstat_inc(rq, yld_act_empty);
Ingo Molnardd41f592007-07-09 18:51:59 +02004487 else
4488 current->sched_class->yield_task(rq, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004489
4490 /*
4491 * Since we are going to call schedule() anyway, there's
4492 * no need to preempt or enable interrupts:
4493 */
4494 __release(rq->lock);
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004495 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004496 _raw_spin_unlock(&rq->lock);
4497 preempt_enable_no_resched();
4498
4499 schedule();
4500
4501 return 0;
4502}
4503
Andrew Mortone7b38402006-06-30 01:56:00 -07004504static void __cond_resched(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004505{
Ingo Molnar8e0a43d2006-06-23 02:05:23 -07004506#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4507 __might_sleep(__FILE__, __LINE__);
4508#endif
Ingo Molnar5bbcfd92005-07-07 17:57:04 -07004509 /*
4510 * The BKS might be reacquired before we have dropped
4511 * PREEMPT_ACTIVE, which could trigger a second
4512 * cond_resched() call.
4513 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004514 do {
4515 add_preempt_count(PREEMPT_ACTIVE);
4516 schedule();
4517 sub_preempt_count(PREEMPT_ACTIVE);
4518 } while (need_resched());
4519}
4520
4521int __sched cond_resched(void)
4522{
Ingo Molnar94142322006-12-29 16:48:13 -08004523 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4524 system_state == SYSTEM_RUNNING) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004525 __cond_resched();
4526 return 1;
4527 }
4528 return 0;
4529}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004530EXPORT_SYMBOL(cond_resched);
4531
4532/*
4533 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4534 * call schedule, and on return reacquire the lock.
4535 *
4536 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4537 * operations here to prevent schedule() from being called twice (once via
4538 * spin_unlock(), once by hand).
4539 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004540int cond_resched_lock(spinlock_t *lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004541{
Jan Kara6df3cec2005-06-13 15:52:32 -07004542 int ret = 0;
4543
Linus Torvalds1da177e2005-04-16 15:20:36 -07004544 if (need_lockbreak(lock)) {
4545 spin_unlock(lock);
4546 cpu_relax();
Jan Kara6df3cec2005-06-13 15:52:32 -07004547 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004548 spin_lock(lock);
4549 }
Ingo Molnar94142322006-12-29 16:48:13 -08004550 if (need_resched() && system_state == SYSTEM_RUNNING) {
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004551 spin_release(&lock->dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004552 _raw_spin_unlock(lock);
4553 preempt_enable_no_resched();
4554 __cond_resched();
Jan Kara6df3cec2005-06-13 15:52:32 -07004555 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004556 spin_lock(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004557 }
Jan Kara6df3cec2005-06-13 15:52:32 -07004558 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004559}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004560EXPORT_SYMBOL(cond_resched_lock);
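
/*
 * Typical usage sketch (illustrative only; my_lock, next_object() and
 * process() are made-up names): a long scan under a spinlock can bound
 * its lock hold times like this.  A non-zero return means the lock was
 * dropped and reacquired, so the caller must be able to cope with the
 * protected data having changed in the meantime:
 *
 *	spin_lock(&my_lock);
 *	while ((obj = next_object()) != NULL) {
 *		process(obj);
 *		cond_resched_lock(&my_lock);
 *	}
 *	spin_unlock(&my_lock);
 */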
4561
4562int __sched cond_resched_softirq(void)
4563{
4564 BUG_ON(!in_softirq());
4565
Ingo Molnar94142322006-12-29 16:48:13 -08004566 if (need_resched() && system_state == SYSTEM_RUNNING) {
Thomas Gleixner98d82562007-05-23 13:58:18 -07004567 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004568 __cond_resched();
4569 local_bh_disable();
4570 return 1;
4571 }
4572 return 0;
4573}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004574EXPORT_SYMBOL(cond_resched_softirq);
4575
Linus Torvalds1da177e2005-04-16 15:20:36 -07004576/**
4577 * yield - yield the current processor to other threads.
4578 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004579 * This is a shortcut for kernel-space yielding - it marks the
Linus Torvalds1da177e2005-04-16 15:20:36 -07004580 * thread runnable and calls sys_sched_yield().
4581 */
4582void __sched yield(void)
4583{
4584 set_current_state(TASK_RUNNING);
4585 sys_sched_yield();
4586}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004587EXPORT_SYMBOL(yield);
4588
4589/*
4590 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4591 * that process accounting knows that this is a task in IO wait state.
4592 *
4593 * But don't do that if it is a deliberate, throttling IO wait (this task
4594 * has set its backing_dev_info: the queue against which it should throttle)
4595 */
4596void __sched io_schedule(void)
4597{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004598 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004599
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004600 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004601 atomic_inc(&rq->nr_iowait);
4602 schedule();
4603 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004604 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004605}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004606EXPORT_SYMBOL(io_schedule);
4607
4608long __sched io_schedule_timeout(long timeout)
4609{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004610 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004611 long ret;
4612
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004613 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004614 atomic_inc(&rq->nr_iowait);
4615 ret = schedule_timeout(timeout);
4616 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004617 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004618 return ret;
4619}
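
/*
 * Illustrative sketch (not taken from this file): a wait loop whose
 * sleep time should be accounted as iowait would use io_schedule()
 * instead of schedule(), e.g.:
 *
 *	for (;;) {
 *		set_current_state(TASK_UNINTERRUPTIBLE);
 *		if (condition)
 *			break;
 *		io_schedule();
 *	}
 *	__set_current_state(TASK_RUNNING);
 *
 * where 'condition' stands for whatever the caller is waiting on.
 */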
4620
4621/**
4622 * sys_sched_get_priority_max - return maximum RT priority.
4623 * @policy: scheduling class.
4624 *
4625 * this syscall returns the maximum rt_priority that can be used
4626 * by a given scheduling class.
4627 */
4628asmlinkage long sys_sched_get_priority_max(int policy)
4629{
4630 int ret = -EINVAL;
4631
4632 switch (policy) {
4633 case SCHED_FIFO:
4634 case SCHED_RR:
4635 ret = MAX_USER_RT_PRIO-1;
4636 break;
4637 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004638 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004639 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004640 ret = 0;
4641 break;
4642 }
4643 return ret;
4644}
4645
4646/**
4647 * sys_sched_get_priority_min - return minimum RT priority.
4648 * @policy: scheduling class.
4649 *
4650 * this syscall returns the minimum rt_priority that can be used
4651 * by a given scheduling class.
4652 */
4653asmlinkage long sys_sched_get_priority_min(int policy)
4654{
4655 int ret = -EINVAL;
4656
4657 switch (policy) {
4658 case SCHED_FIFO:
4659 case SCHED_RR:
4660 ret = 1;
4661 break;
4662 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004663 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004664 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004665 ret = 0;
4666 }
4667 return ret;
4668}
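
/*
 * For example, with the default MAX_USER_RT_PRIO of 100 a user-space
 * query (via the libc wrappers) would see:
 *
 *	sched_get_priority_max(SCHED_FIFO)	-> 99
 *	sched_get_priority_min(SCHED_FIFO)	-> 1
 *	sched_get_priority_max(SCHED_OTHER)	-> 0
 *
 * (illustrative values; MAX_USER_RT_PRIO is a compile-time constant)
 */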
4669
4670/**
4671 * sys_sched_rr_get_interval - return the default timeslice of a process.
4672 * @pid: pid of the process.
4673 * @interval: userspace pointer to the timeslice value.
4674 *
4675 * this syscall writes the default timeslice value of a given process
4676 * into the user-space timespec buffer. A value of '0' means infinity.
4677 */
4678asmlinkage
4679long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4680{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004681 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004682 int retval = -EINVAL;
4683 struct timespec t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004684
4685 if (pid < 0)
4686 goto out_nounlock;
4687
4688 retval = -ESRCH;
4689 read_lock(&tasklist_lock);
4690 p = find_process_by_pid(pid);
4691 if (!p)
4692 goto out_unlock;
4693
4694 retval = security_task_getscheduler(p);
4695 if (retval)
4696 goto out_unlock;
4697
Peter Williamsb78709c2006-06-26 16:58:00 +10004698 jiffies_to_timespec(p->policy == SCHED_FIFO ?
Ingo Molnardd41f592007-07-09 18:51:59 +02004699 0 : static_prio_timeslice(p->static_prio), &t);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004700 read_unlock(&tasklist_lock);
4701 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4702out_nounlock:
4703 return retval;
4704out_unlock:
4705 read_unlock(&tasklist_lock);
4706 return retval;
4707}
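
/*
 * Illustrative user-space sketch (libc wrapper assumed):
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
 */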
4708
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004709static const char stat_nam[] = "RSDTtZX";
Ingo Molnar36c8b582006-07-03 00:25:41 -07004710
4711static void show_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004712{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004713 unsigned long free = 0;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004714 unsigned state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004715
Linus Torvalds1da177e2005-04-16 15:20:36 -07004716 state = p->state ? __ffs(p->state) + 1 : 0;
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004717 printk("%-13.13s %c", p->comm,
4718 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
Linus Torvalds1da177e2005-04-16 15:20:36 -07004719#if (BITS_PER_LONG == 32)
4720 if (state == TASK_RUNNING)
4721 printk(" running ");
4722 else
4723 printk(" %08lX ", thread_saved_pc(p));
4724#else
4725 if (state == TASK_RUNNING)
4726 printk(" running task ");
4727 else
4728 printk(" %016lx ", thread_saved_pc(p));
4729#endif
4730#ifdef CONFIG_DEBUG_STACK_USAGE
4731 {
Al Viro10ebffd2005-11-13 16:06:56 -08004732 unsigned long *n = end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004733 while (!*n)
4734 n++;
Al Viro10ebffd2005-11-13 16:06:56 -08004735 free = (unsigned long)n - (unsigned long)end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004736 }
4737#endif
Ingo Molnar35f6f752007-04-06 21:18:06 +02004738 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004739 if (!p->mm)
4740 printk(" (L-TLB)\n");
4741 else
4742 printk(" (NOTLB)\n");
4743
4744 if (state != TASK_RUNNING)
4745 show_stack(p, NULL);
4746}
4747
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004748void show_state_filter(unsigned long state_filter)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004749{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004750 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004751
4752#if (BITS_PER_LONG == 32)
4753 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004754 " free sibling\n");
4755 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004756#else
4757 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004758 " free sibling\n");
4759 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004760#endif
4761 read_lock(&tasklist_lock);
4762 do_each_thread(g, p) {
4763 /*
4764		 * reset the NMI-timeout, listing all tasks on a slow
4765		 * console might take a lot of time:
4766 */
4767 touch_nmi_watchdog();
Ingo Molnar39bc89f2007-04-25 20:50:03 -07004768 if (!state_filter || (p->state & state_filter))
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004769 show_task(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004770 } while_each_thread(g, p);
4771
Jeremy Fitzhardinge04c91672007-05-08 00:28:05 -07004772 touch_all_softlockup_watchdogs();
4773
Ingo Molnardd41f592007-07-09 18:51:59 +02004774#ifdef CONFIG_SCHED_DEBUG
4775 sysrq_sched_debug_show();
4776#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004777 read_unlock(&tasklist_lock);
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004778 /*
4779 * Only show locks if all tasks are dumped:
4780 */
4781 if (state_filter == -1)
4782 debug_show_all_locks();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004783}
4784
Ingo Molnar1df21052007-07-09 18:51:58 +02004785void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4786{
Ingo Molnardd41f592007-07-09 18:51:59 +02004787 idle->sched_class = &idle_sched_class;
Ingo Molnar1df21052007-07-09 18:51:58 +02004788}
4789
Ingo Molnarf340c0d2005-06-28 16:40:42 +02004790/**
4791 * init_idle - set up an idle thread for a given CPU
4792 * @idle: task in question
4793 * @cpu: cpu the idle task belongs to
4794 *
4795 * NOTE: this function does not set the idle thread's NEED_RESCHED
4796 * flag, to make booting more robust.
4797 */
Nick Piggin5c1e1762006-10-03 01:14:04 -07004798void __cpuinit init_idle(struct task_struct *idle, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004799{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004800 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004801 unsigned long flags;
4802
Ingo Molnardd41f592007-07-09 18:51:59 +02004803 __sched_fork(idle);
4804 idle->se.exec_start = sched_clock();
4805
Ingo Molnarb29739f2006-06-27 02:54:51 -07004806 idle->prio = idle->normal_prio = MAX_PRIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004807 idle->cpus_allowed = cpumask_of_cpu(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004808 __set_task_cpu(idle, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004809
4810 spin_lock_irqsave(&rq->lock, flags);
4811 rq->curr = rq->idle = idle;
Nick Piggin4866cde2005-06-25 14:57:23 -07004812#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4813 idle->oncpu = 1;
4814#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004815 spin_unlock_irqrestore(&rq->lock, flags);
4816
4817 /* Set the preempt count _outside_ the spinlocks! */
4818#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
Al Viroa1261f52005-11-13 16:06:55 -08004819 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004820#else
Al Viroa1261f52005-11-13 16:06:55 -08004821 task_thread_info(idle)->preempt_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004822#endif
Ingo Molnardd41f592007-07-09 18:51:59 +02004823 /*
4824 * The idle tasks have their own, simple scheduling class:
4825 */
4826 idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004827}
4828
4829/*
4830 * In a system that switches off the HZ timer nohz_cpu_mask
4831 * indicates which cpus entered this state. This is used
4832 * in the rcu update to wait only for active cpus. For systems
4833 * which do not switch off the HZ timer nohz_cpu_mask should
4834 * always be CPU_MASK_NONE.
4835 */
4836cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4837
Ingo Molnardd41f592007-07-09 18:51:59 +02004838/*
4839 * Increase the granularity value when there are more CPUs,
4840 * because with more CPUs the 'effective latency' as visible
4841 * to users decreases. But the relationship is not linear,
4842 * so pick a second-best guess by going with the log2 of the
4843 * number of CPUs.
4844 *
4845 * This idea comes from the SD scheduler of Con Kolivas:
4846 */
4847static inline void sched_init_granularity(void)
4848{
4849 unsigned int factor = 1 + ilog2(num_online_cpus());
4850 const unsigned long gran_limit = 10000000;
4851
4852 sysctl_sched_granularity *= factor;
4853 if (sysctl_sched_granularity > gran_limit)
4854 sysctl_sched_granularity = gran_limit;
4855
4856 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4857 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4858}
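
/*
 * Worked example (assuming the compiled-in defaults are unchanged):
 * on an 8-CPU machine factor = 1 + ilog2(8) = 4, so the base
 * granularity is multiplied by 4 and then clamped to gran_limit
 * (10 ms); the runtime limit and wakeup granularity follow as 4x
 * and 1/2 of the resulting value.
 */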
4859
Linus Torvalds1da177e2005-04-16 15:20:36 -07004860#ifdef CONFIG_SMP
4861/*
4862 * This is how migration works:
4863 *
Ingo Molnar70b97a72006-07-03 00:25:42 -07004864 * 1) we queue a struct migration_req structure in the source CPU's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004865 * runqueue and wake up that CPU's migration thread.
4866 * 2) we wait for the request's completion => thread blocks.
4867 * 3) migration thread wakes up (implicitly it forces the migrated
4868 * thread off the CPU)
4869 * 4) it gets the migration request and checks whether the migrated
4870 * task is still in the wrong runqueue.
4871 * 5) if it's in the wrong runqueue then the migration thread removes
4872 * it and puts it into the right queue.
4873 * 6) migration thread completes the request.
4874 * 7) we wake up and the migration is done.
4875 */
4876
4877/*
4878 * Change a given task's CPU affinity. Migrate the thread to a
4879 * proper CPU and schedule it away if the CPU it's executing on
4880 * is removed from the allowed bitmask.
4881 *
4882 * NOTE: the caller must have a valid reference to the task, the
4883 * task must not exit() & deallocate itself prematurely. The
4884 * call is not atomic; no spinlocks may be held.
4885 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004886int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004887{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004888 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004889 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004890 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07004891 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004892
4893 rq = task_rq_lock(p, &flags);
4894 if (!cpus_intersects(new_mask, cpu_online_map)) {
4895 ret = -EINVAL;
4896 goto out;
4897 }
4898
4899 p->cpus_allowed = new_mask;
4900 /* Can the task run on the task's current CPU? If so, we're done */
4901 if (cpu_isset(task_cpu(p), new_mask))
4902 goto out;
4903
4904 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4905 /* Need help from migration thread: drop lock and wait. */
4906 task_rq_unlock(rq, &flags);
4907 wake_up_process(rq->migration_thread);
4908 wait_for_completion(&req.done);
4909 tlb_migrate_finish(p->mm);
4910 return 0;
4911 }
4912out:
4913 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004914
Linus Torvalds1da177e2005-04-16 15:20:36 -07004915 return ret;
4916}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004917EXPORT_SYMBOL_GPL(set_cpus_allowed);
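
/*
 * Illustrative sketch (not part of this file's build): a caller holding
 * a valid reference to 'p' could restrict it to CPUs 0 and 1 like this:
 *
 *	cpumask_t mask = CPU_MASK_NONE;
 *
 *	cpu_set(0, mask);
 *	cpu_set(1, mask);
 *	if (set_cpus_allowed(p, mask))
 *		printk(KERN_WARNING "no online CPU in the new mask\n");
 */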
4918
4919/*
4920 * Move a (non-current) task off this cpu, onto the dest cpu. We're doing
4921 * this because either it can't run here any more (set_cpus_allowed()
4922 * away from this CPU, or CPU going down), or because we're
4923 * attempting to rebalance this task on exec (sched_exec).
4924 *
4925 * So we race with normal scheduler movements, but that's OK, as long
4926 * as the task is no longer on this CPU.
Kirill Korotaevefc30812006-06-27 02:54:32 -07004927 *
4928 * Returns non-zero if task was successfully migrated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004929 */
Kirill Korotaevefc30812006-06-27 02:54:32 -07004930static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004931{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004932 struct rq *rq_dest, *rq_src;
Ingo Molnardd41f592007-07-09 18:51:59 +02004933 int ret = 0, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004934
4935 if (unlikely(cpu_is_offline(dest_cpu)))
Kirill Korotaevefc30812006-06-27 02:54:32 -07004936 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004937
4938 rq_src = cpu_rq(src_cpu);
4939 rq_dest = cpu_rq(dest_cpu);
4940
4941 double_rq_lock(rq_src, rq_dest);
4942 /* Already moved. */
4943 if (task_cpu(p) != src_cpu)
4944 goto out;
4945 /* Affinity changed (again). */
4946 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4947 goto out;
4948
Ingo Molnardd41f592007-07-09 18:51:59 +02004949 on_rq = p->se.on_rq;
4950 if (on_rq)
4951 deactivate_task(rq_src, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004952 set_task_cpu(p, dest_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004953 if (on_rq) {
4954 activate_task(rq_dest, p, 0);
4955 check_preempt_curr(rq_dest, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004956 }
Kirill Korotaevefc30812006-06-27 02:54:32 -07004957 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004958out:
4959 double_rq_unlock(rq_src, rq_dest);
Kirill Korotaevefc30812006-06-27 02:54:32 -07004960 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004961}
4962
4963/*
4964 * migration_thread - this is a highprio system thread that performs
4965 * thread migration by bumping thread off CPU then 'pushing' onto
4966 * another runqueue.
4967 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004968static int migration_thread(void *data)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004969{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004970 int cpu = (long)data;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004971 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004972
4973 rq = cpu_rq(cpu);
4974 BUG_ON(rq->migration_thread != current);
4975
4976 set_current_state(TASK_INTERRUPTIBLE);
4977 while (!kthread_should_stop()) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07004978 struct migration_req *req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979 struct list_head *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004980
Christoph Lameter3e1d1d22005-06-24 23:13:50 -07004981 try_to_freeze();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004982
4983 spin_lock_irq(&rq->lock);
4984
4985 if (cpu_is_offline(cpu)) {
4986 spin_unlock_irq(&rq->lock);
4987 goto wait_to_die;
4988 }
4989
4990 if (rq->active_balance) {
4991 active_load_balance(rq, cpu);
4992 rq->active_balance = 0;
4993 }
4994
4995 head = &rq->migration_queue;
4996
4997 if (list_empty(head)) {
4998 spin_unlock_irq(&rq->lock);
4999 schedule();
5000 set_current_state(TASK_INTERRUPTIBLE);
5001 continue;
5002 }
Ingo Molnar70b97a72006-07-03 00:25:42 -07005003 req = list_entry(head->next, struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005004 list_del_init(head->next);
5005
Nick Piggin674311d2005-06-25 14:57:27 -07005006 spin_unlock(&rq->lock);
5007 __migrate_task(req->task, cpu, req->dest_cpu);
5008 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005009
5010 complete(&req->done);
5011 }
5012 __set_current_state(TASK_RUNNING);
5013 return 0;
5014
5015wait_to_die:
5016 /* Wait for kthread_stop */
5017 set_current_state(TASK_INTERRUPTIBLE);
5018 while (!kthread_should_stop()) {
5019 schedule();
5020 set_current_state(TASK_INTERRUPTIBLE);
5021 }
5022 __set_current_state(TASK_RUNNING);
5023 return 0;
5024}
5025
5026#ifdef CONFIG_HOTPLUG_CPU
Kirill Korotaev054b9102006-12-10 02:20:11 -08005027/*
5028 * Figure out where a task on a dead CPU should go, using force if necessary.
5029 * NOTE: interrupts should be disabled by the caller
5030 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005031static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005032{
Kirill Korotaevefc30812006-06-27 02:54:32 -07005033 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005034 cpumask_t mask;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005035 struct rq *rq;
5036 int dest_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005037
Kirill Korotaevefc30812006-06-27 02:54:32 -07005038restart:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005039 /* On same node? */
5040 mask = node_to_cpumask(cpu_to_node(dead_cpu));
Ingo Molnar48f24c42006-07-03 00:25:40 -07005041 cpus_and(mask, mask, p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005042 dest_cpu = any_online_cpu(mask);
5043
5044 /* On any allowed CPU? */
5045 if (dest_cpu == NR_CPUS)
Ingo Molnar48f24c42006-07-03 00:25:40 -07005046 dest_cpu = any_online_cpu(p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005047
5048 /* No more Mr. Nice Guy. */
5049 if (dest_cpu == NR_CPUS) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07005050 rq = task_rq_lock(p, &flags);
5051 cpus_setall(p->cpus_allowed);
5052 dest_cpu = any_online_cpu(p->cpus_allowed);
Kirill Korotaevefc30812006-06-27 02:54:32 -07005053 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005054
5055 /*
5056 * Don't tell them about moving exiting tasks or
5057 * kernel threads (both mm NULL), since they never
5058		 * leave the kernel.
5059 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005060 if (p->mm && printk_ratelimit())
Linus Torvalds1da177e2005-04-16 15:20:36 -07005061 printk(KERN_INFO "process %d (%s) no "
5062 "longer affine to cpu%d\n",
Ingo Molnar48f24c42006-07-03 00:25:40 -07005063 p->pid, p->comm, dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005064 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07005065 if (!__migrate_task(p, dead_cpu, dest_cpu))
Kirill Korotaevefc30812006-06-27 02:54:32 -07005066 goto restart;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005067}
5068
5069/*
5070 * While a dead CPU has no uninterruptible tasks queued at this point,
5071 * it might still have a nonzero ->nr_uninterruptible counter, because
5072 * for performance reasons the counter is not strictly tracking tasks to
5073 * their home CPUs. So we just add the counter to another CPU's counter,
5074 * to keep the global sum constant after CPU-down:
5075 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07005076static void migrate_nr_uninterruptible(struct rq *rq_src)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005077{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005078 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005079 unsigned long flags;
5080
5081 local_irq_save(flags);
5082 double_rq_lock(rq_src, rq_dest);
5083 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5084 rq_src->nr_uninterruptible = 0;
5085 double_rq_unlock(rq_src, rq_dest);
5086 local_irq_restore(flags);
5087}
5088
5089/* Run through task list and migrate tasks from the dead cpu. */
5090static void migrate_live_tasks(int src_cpu)
5091{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005092 struct task_struct *p, *t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005093
5094 write_lock_irq(&tasklist_lock);
5095
Ingo Molnar48f24c42006-07-03 00:25:40 -07005096 do_each_thread(t, p) {
5097 if (p == current)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005098 continue;
5099
Ingo Molnar48f24c42006-07-03 00:25:40 -07005100 if (task_cpu(p) == src_cpu)
5101 move_task_off_dead_cpu(src_cpu, p);
5102 } while_each_thread(t, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005103
5104 write_unlock_irq(&tasklist_lock);
5105}
5106
Ingo Molnardd41f592007-07-09 18:51:59 +02005107/*
5108 * Schedules the idle task to be the next runnable task on the current CPU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005109 * It does so by boosting its priority to the highest possible and adding it to
Ingo Molnar48f24c42006-07-03 00:25:40 -07005110 * the _front_ of the runqueue. Used by CPU offline code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005111 */
5112void sched_idle_next(void)
5113{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005114 int this_cpu = smp_processor_id();
Ingo Molnar70b97a72006-07-03 00:25:42 -07005115 struct rq *rq = cpu_rq(this_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005116 struct task_struct *p = rq->idle;
5117 unsigned long flags;
5118
5119 /* cpu has to be offline */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005120 BUG_ON(cpu_online(this_cpu));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005121
Ingo Molnar48f24c42006-07-03 00:25:40 -07005122 /*
5123	 * Strictly not necessary since the rest of the CPUs are stopped by now
5124 * and interrupts disabled on the current cpu.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005125 */
5126 spin_lock_irqsave(&rq->lock, flags);
5127
Ingo Molnardd41f592007-07-09 18:51:59 +02005128 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005129
5130 /* Add idle task to the _front_ of its priority queue: */
Ingo Molnardd41f592007-07-09 18:51:59 +02005131 activate_idle_task(p, rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005132
5133 spin_unlock_irqrestore(&rq->lock, flags);
5134}
5135
Ingo Molnar48f24c42006-07-03 00:25:40 -07005136/*
5137 * Ensures that the idle task is using init_mm right before its cpu goes
Linus Torvalds1da177e2005-04-16 15:20:36 -07005138 * offline.
5139 */
5140void idle_task_exit(void)
5141{
5142 struct mm_struct *mm = current->active_mm;
5143
5144 BUG_ON(cpu_online(smp_processor_id()));
5145
5146 if (mm != &init_mm)
5147 switch_mm(mm, &init_mm, current);
5148 mmdrop(mm);
5149}
5150
Kirill Korotaev054b9102006-12-10 02:20:11 -08005151/* called under rq->lock with disabled interrupts */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005152static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005153{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005154 struct rq *rq = cpu_rq(dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005155
5156 /* Must be exiting, otherwise would be on tasklist. */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005157 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005158
5159 /* Cannot have done final schedule yet: would have vanished. */
Oleg Nesterovc394cc92006-09-29 02:01:11 -07005160 BUG_ON(p->state == TASK_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005161
Ingo Molnar48f24c42006-07-03 00:25:40 -07005162 get_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005163
5164 /*
5165 * Drop lock around migration; if someone else moves it,
5166 * that's OK. No task can be added to this CPU, so iteration is
5167 * fine.
Kirill Korotaev054b9102006-12-10 02:20:11 -08005168 * NOTE: interrupts should be left disabled --dev@
Linus Torvalds1da177e2005-04-16 15:20:36 -07005169 */
Kirill Korotaev054b9102006-12-10 02:20:11 -08005170 spin_unlock(&rq->lock);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005171 move_task_off_dead_cpu(dead_cpu, p);
Kirill Korotaev054b9102006-12-10 02:20:11 -08005172 spin_lock(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005173
Ingo Molnar48f24c42006-07-03 00:25:40 -07005174 put_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005175}
5176
5177/* release_task() removes task from tasklist, so we won't find dead tasks. */
5178static void migrate_dead_tasks(unsigned int dead_cpu)
5179{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005180 struct rq *rq = cpu_rq(dead_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02005181 struct task_struct *next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005182
Ingo Molnardd41f592007-07-09 18:51:59 +02005183 for ( ; ; ) {
5184 if (!rq->nr_running)
5185 break;
5186 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5187 if (!next)
5188 break;
5189 migrate_dead(dead_cpu, next);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005190 }
5191}
5192#endif /* CONFIG_HOTPLUG_CPU */
5193
5194/*
5195 * migration_call - callback that gets triggered when a CPU is added.
5196 * Here we can start up the necessary migration thread for the new CPU.
5197 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005198static int __cpuinit
5199migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005200{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005201 struct task_struct *p;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005202 int cpu = (long)hcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005203 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005204 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005205
5206 switch (action) {
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005207 case CPU_LOCK_ACQUIRE:
5208 mutex_lock(&sched_hotcpu_mutex);
5209 break;
5210
Linus Torvalds1da177e2005-04-16 15:20:36 -07005211 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005212 case CPU_UP_PREPARE_FROZEN:
Ingo Molnardd41f592007-07-09 18:51:59 +02005213 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005214 if (IS_ERR(p))
5215 return NOTIFY_BAD;
5216 p->flags |= PF_NOFREEZE;
5217 kthread_bind(p, cpu);
5218 /* Must be high prio: stop_machine expects to yield to it. */
5219 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005220 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005221 task_rq_unlock(rq, &flags);
5222 cpu_rq(cpu)->migration_thread = p;
5223 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005224
Linus Torvalds1da177e2005-04-16 15:20:36 -07005225 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005226 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005227		/* Strictly unnecessary, as the first user will wake it. */
5228 wake_up_process(cpu_rq(cpu)->migration_thread);
5229 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005230
Linus Torvalds1da177e2005-04-16 15:20:36 -07005231#ifdef CONFIG_HOTPLUG_CPU
5232 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005233 case CPU_UP_CANCELED_FROZEN:
Heiko Carstensfc75cdf2006-06-25 05:49:10 -07005234 if (!cpu_rq(cpu)->migration_thread)
5235 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005236 /* Unbind it from offline cpu so it can run. Fall thru. */
Heiko Carstensa4c4af72005-11-07 00:58:38 -08005237 kthread_bind(cpu_rq(cpu)->migration_thread,
5238 any_online_cpu(cpu_online_map));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005239 kthread_stop(cpu_rq(cpu)->migration_thread);
5240 cpu_rq(cpu)->migration_thread = NULL;
5241 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005242
Linus Torvalds1da177e2005-04-16 15:20:36 -07005243 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005244 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005245 migrate_live_tasks(cpu);
5246 rq = cpu_rq(cpu);
5247 kthread_stop(rq->migration_thread);
5248 rq->migration_thread = NULL;
5249 /* Idle task back to normal (off runqueue, low prio) */
5250 rq = task_rq_lock(rq->idle, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005251 deactivate_task(rq, rq->idle, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005252 rq->idle->static_prio = MAX_PRIO;
Ingo Molnardd41f592007-07-09 18:51:59 +02005253 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5254 rq->idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005255 migrate_dead_tasks(cpu);
5256 task_rq_unlock(rq, &flags);
5257 migrate_nr_uninterruptible(rq);
5258 BUG_ON(rq->nr_running != 0);
5259
5260 /* No need to migrate the tasks: it was best-effort if
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005261 * they didn't take sched_hotcpu_mutex. Just wake up
Linus Torvalds1da177e2005-04-16 15:20:36 -07005262 * the requestors. */
5263 spin_lock_irq(&rq->lock);
5264 while (!list_empty(&rq->migration_queue)) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07005265 struct migration_req *req;
5266
Linus Torvalds1da177e2005-04-16 15:20:36 -07005267 req = list_entry(rq->migration_queue.next,
Ingo Molnar70b97a72006-07-03 00:25:42 -07005268 struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005269 list_del_init(&req->list);
5270 complete(&req->done);
5271 }
5272 spin_unlock_irq(&rq->lock);
5273 break;
5274#endif
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005275 case CPU_LOCK_RELEASE:
5276 mutex_unlock(&sched_hotcpu_mutex);
5277 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005278 }
5279 return NOTIFY_OK;
5280}
5281
5282/* Register at highest priority so that task migration (migrate_all_tasks)
5283 * happens before everything else.
5284 */
Chandra Seetharaman26c21432006-06-27 02:54:10 -07005285static struct notifier_block __cpuinitdata migration_notifier = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005286 .notifier_call = migration_call,
5287 .priority = 10
5288};
5289
5290int __init migration_init(void)
5291{
5292 void *cpu = (void *)(long)smp_processor_id();
Akinobu Mita07dccf32006-09-29 02:00:22 -07005293 int err;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005294
5295 /* Start one for the boot CPU: */
Akinobu Mita07dccf32006-09-29 02:00:22 -07005296 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5297 BUG_ON(err == NOTIFY_BAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005298 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5299 register_cpu_notifier(&migration_notifier);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005300
Linus Torvalds1da177e2005-04-16 15:20:36 -07005301 return 0;
5302}
5303#endif
5304
5305#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07005306
5307/* Number of possible processor ids */
5308int nr_cpu_ids __read_mostly = NR_CPUS;
5309EXPORT_SYMBOL(nr_cpu_ids);
5310
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005311#undef SCHED_DOMAIN_DEBUG
Linus Torvalds1da177e2005-04-16 15:20:36 -07005312#ifdef SCHED_DOMAIN_DEBUG
5313static void sched_domain_debug(struct sched_domain *sd, int cpu)
5314{
5315 int level = 0;
5316
Nick Piggin41c7ce92005-06-25 14:57:24 -07005317 if (!sd) {
5318 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5319 return;
5320 }
5321
Linus Torvalds1da177e2005-04-16 15:20:36 -07005322 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5323
5324 do {
5325 int i;
5326 char str[NR_CPUS];
5327 struct sched_group *group = sd->groups;
5328 cpumask_t groupmask;
5329
5330 cpumask_scnprintf(str, NR_CPUS, sd->span);
5331 cpus_clear(groupmask);
5332
5333 printk(KERN_DEBUG);
5334 for (i = 0; i < level + 1; i++)
5335 printk(" ");
5336 printk("domain %d: ", level);
5337
5338 if (!(sd->flags & SD_LOAD_BALANCE)) {
5339 printk("does not load-balance\n");
5340 if (sd->parent)
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005341 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5342 " has parent");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005343 break;
5344 }
5345
5346 printk("span %s\n", str);
5347
5348 if (!cpu_isset(cpu, sd->span))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005349 printk(KERN_ERR "ERROR: domain->span does not contain "
5350 "CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005351 if (!cpu_isset(cpu, group->cpumask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005352 printk(KERN_ERR "ERROR: domain->groups does not contain"
5353 " CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005354
5355 printk(KERN_DEBUG);
5356 for (i = 0; i < level + 2; i++)
5357 printk(" ");
5358 printk("groups:");
5359 do {
5360 if (!group) {
5361 printk("\n");
5362 printk(KERN_ERR "ERROR: group is NULL\n");
5363 break;
5364 }
5365
Eric Dumazet5517d862007-05-08 00:32:57 -07005366 if (!group->__cpu_power) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005367 printk("\n");
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005368 printk(KERN_ERR "ERROR: domain->cpu_power not "
5369 "set\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005370 }
5371
5372 if (!cpus_weight(group->cpumask)) {
5373 printk("\n");
5374 printk(KERN_ERR "ERROR: empty group\n");
5375 }
5376
5377 if (cpus_intersects(groupmask, group->cpumask)) {
5378 printk("\n");
5379 printk(KERN_ERR "ERROR: repeated CPUs\n");
5380 }
5381
5382 cpus_or(groupmask, groupmask, group->cpumask);
5383
5384 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5385 printk(" %s", str);
5386
5387 group = group->next;
5388 } while (group != sd->groups);
5389 printk("\n");
5390
5391 if (!cpus_equal(sd->span, groupmask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005392 printk(KERN_ERR "ERROR: groups don't span "
5393 "domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005394
5395 level++;
5396 sd = sd->parent;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005397 if (!sd)
5398 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005399
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005400 if (!cpus_subset(groupmask, sd->span))
5401 printk(KERN_ERR "ERROR: parent span is not a superset "
5402 "of domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005403
5404 } while (sd);
5405}
5406#else
Ingo Molnar48f24c42006-07-03 00:25:40 -07005407# define sched_domain_debug(sd, cpu) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005408#endif
5409
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005410static int sd_degenerate(struct sched_domain *sd)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005411{
5412 if (cpus_weight(sd->span) == 1)
5413 return 1;
5414
5415 /* Following flags need at least 2 groups */
5416 if (sd->flags & (SD_LOAD_BALANCE |
5417 SD_BALANCE_NEWIDLE |
5418 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005419 SD_BALANCE_EXEC |
5420 SD_SHARE_CPUPOWER |
5421 SD_SHARE_PKG_RESOURCES)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005422 if (sd->groups != sd->groups->next)
5423 return 0;
5424 }
5425
5426 /* Following flags don't use groups */
5427 if (sd->flags & (SD_WAKE_IDLE |
5428 SD_WAKE_AFFINE |
5429 SD_WAKE_BALANCE))
5430 return 0;
5431
5432 return 1;
5433}
5434
Ingo Molnar48f24c42006-07-03 00:25:40 -07005435static int
5436sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005437{
5438 unsigned long cflags = sd->flags, pflags = parent->flags;
5439
5440 if (sd_degenerate(parent))
5441 return 1;
5442
5443 if (!cpus_equal(sd->span, parent->span))
5444 return 0;
5445
5446 /* Does parent contain flags not in child? */
5447 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5448 if (cflags & SD_WAKE_AFFINE)
5449 pflags &= ~SD_WAKE_BALANCE;
5450 /* Flags needing groups don't count if only 1 group in parent */
5451 if (parent->groups == parent->groups->next) {
5452 pflags &= ~(SD_LOAD_BALANCE |
5453 SD_BALANCE_NEWIDLE |
5454 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005455 SD_BALANCE_EXEC |
5456 SD_SHARE_CPUPOWER |
5457 SD_SHARE_PKG_RESOURCES);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005458 }
5459 if (~cflags & pflags)
5460 return 0;
5461
5462 return 1;
5463}
5464
Linus Torvalds1da177e2005-04-16 15:20:36 -07005465/*
5466 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5467 * hold the hotplug lock.
5468 */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005469static void cpu_attach_domain(struct sched_domain *sd, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005470{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005471 struct rq *rq = cpu_rq(cpu);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005472 struct sched_domain *tmp;
5473
5474 /* Remove the sched domains which do not contribute to scheduling. */
5475 for (tmp = sd; tmp; tmp = tmp->parent) {
5476 struct sched_domain *parent = tmp->parent;
5477 if (!parent)
5478 break;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005479 if (sd_parent_degenerate(tmp, parent)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005480 tmp->parent = parent->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005481 if (parent->parent)
5482 parent->parent->child = tmp;
5483 }
Suresh Siddha245af2c2005-06-25 14:57:25 -07005484 }
5485
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005486 if (sd && sd_degenerate(sd)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005487 sd = sd->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005488 if (sd)
5489 sd->child = NULL;
5490 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005491
5492 sched_domain_debug(sd, cpu);
5493
Nick Piggin674311d2005-06-25 14:57:27 -07005494 rcu_assign_pointer(rq->sd, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005495}
5496
5497/* cpus with isolated domains */
Tim Chen67af63a2006-12-22 01:07:50 -08005498static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005499
5500/* Setup the mask of cpus configured for isolated domains */
5501static int __init isolated_cpu_setup(char *str)
5502{
5503 int ints[NR_CPUS], i;
5504
5505 str = get_options(str, ARRAY_SIZE(ints), ints);
5506 cpus_clear(cpu_isolated_map);
5507 for (i = 1; i <= ints[0]; i++)
5508 if (ints[i] < NR_CPUS)
5509 cpu_set(ints[i], cpu_isolated_map);
5510 return 1;
5511}
5512
5513__setup ("isolcpus=", isolated_cpu_setup);
5514
5515/*
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005516 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5517 * to a function which identifies what group(along with sched group) a CPU
5518 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
5519 * (due to the fact that we keep track of groups covered with a cpumask_t).
Linus Torvalds1da177e2005-04-16 15:20:36 -07005520 *
5521 * init_sched_build_groups will build a circular linked list of the groups
5522 * covered by the given span, and will set each group's ->cpumask correctly,
5523 * and ->cpu_power to 0.
5524 */
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005525static void
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005526init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5527 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5528 struct sched_group **sg))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005529{
5530 struct sched_group *first = NULL, *last = NULL;
5531 cpumask_t covered = CPU_MASK_NONE;
5532 int i;
5533
5534 for_each_cpu_mask(i, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005535 struct sched_group *sg;
5536 int group = group_fn(i, cpu_map, &sg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005537 int j;
5538
5539 if (cpu_isset(i, covered))
5540 continue;
5541
5542 sg->cpumask = CPU_MASK_NONE;
Eric Dumazet5517d862007-05-08 00:32:57 -07005543 sg->__cpu_power = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005544
5545 for_each_cpu_mask(j, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005546 if (group_fn(j, cpu_map, NULL) != group)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005547 continue;
5548
5549 cpu_set(j, covered);
5550 cpu_set(j, sg->cpumask);
5551 }
5552 if (!first)
5553 first = sg;
5554 if (last)
5555 last->next = sg;
5556 last = sg;
5557 }
5558 last->next = first;
5559}
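/*
 * Usage sketch (taken from the callers further down in this file): each
 * topology level passes its span plus a cpu_to_*_group() helper, e.g.
 *
 *	init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
 *	init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
 *
 * The helper reports which group a CPU falls into and, via its sched_group
 * pointer argument, which per-cpu group object to link into the circular
 * list.
 */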
5560
John Hawkes9c1cfda2005-09-06 15:18:14 -07005561#define SD_NODES_PER_DOMAIN 16
Linus Torvalds1da177e2005-04-16 15:20:36 -07005562
John Hawkes9c1cfda2005-09-06 15:18:14 -07005563#ifdef CONFIG_NUMA
akpm@osdl.org198e2f12006-01-12 01:05:30 -08005564
John Hawkes9c1cfda2005-09-06 15:18:14 -07005565/**
5566 * find_next_best_node - find the next node to include in a sched_domain
5567 * @node: node whose sched_domain we're building
5568 * @used_nodes: nodes already in the sched_domain
5569 *
5570 * Find the next node to include in a given scheduling domain. Simply
5571 * finds the closest node not already in the @used_nodes map.
5572 *
5573 * Should use nodemask_t.
5574 */
5575static int find_next_best_node(int node, unsigned long *used_nodes)
5576{
5577 int i, n, val, min_val, best_node = 0;
5578
5579 min_val = INT_MAX;
5580
5581 for (i = 0; i < MAX_NUMNODES; i++) {
5582 /* Start at @node */
5583 n = (node + i) % MAX_NUMNODES;
5584
5585 if (!nr_cpus_node(n))
5586 continue;
5587
5588 /* Skip already used nodes */
5589 if (test_bit(n, used_nodes))
5590 continue;
5591
5592 /* Simple min distance search */
5593 val = node_distance(node, n);
5594
5595 if (val < min_val) {
5596 min_val = val;
5597 best_node = n;
5598 }
5599 }
5600
5601 set_bit(best_node, used_nodes);
5602 return best_node;
5603}
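/*
 * Worked example (hypothetical distances, not from the original source):
 * starting from node 0 with used_nodes = {0} and
 * node_distance(0, {1,2,3}) = {20, 40, 60}, the scan above picks node 1
 * (distance 20), marks it used and returns it; the next call would pick
 * node 2, and so on, always choosing the closest node not yet used.
 */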
5604
5605/**
5606 * sched_domain_node_span - get a cpumask for a node's sched_domain
5607 * @node: node whose cpumask we're constructing
5609 *
5610 * Given a node, construct a good cpumask for its sched_domain to span. It
5611 * should be one that prevents unnecessary balancing, but also spreads tasks
5612 * out optimally.
5613 */
5614static cpumask_t sched_domain_node_span(int node)
5615{
John Hawkes9c1cfda2005-09-06 15:18:14 -07005616 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005617 cpumask_t span, nodemask;
5618 int i;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005619
5620 cpus_clear(span);
5621 bitmap_zero(used_nodes, MAX_NUMNODES);
5622
5623 nodemask = node_to_cpumask(node);
5624 cpus_or(span, span, nodemask);
5625 set_bit(node, used_nodes);
5626
5627 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5628 int next_node = find_next_best_node(node, used_nodes);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005629
John Hawkes9c1cfda2005-09-06 15:18:14 -07005630 nodemask = node_to_cpumask(next_node);
5631 cpus_or(span, span, nodemask);
5632 }
5633
5634 return span;
5635}
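/*
 * Sketch (derived from the code above): with SD_NODES_PER_DOMAIN == 16 the
 * span starts as the node's own cpumask and then ORs in the CPUs of up to
 * 15 further nodes, each chosen by find_next_best_node(), so the NODE-level
 * domain of a CPU covers its own node plus its nearest neighbours.
 */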
5636#endif
5637
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005638int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005639
John Hawkes9c1cfda2005-09-06 15:18:14 -07005640/*
Ingo Molnar48f24c42006-07-03 00:25:40 -07005641 * SMT sched-domains:
John Hawkes9c1cfda2005-09-06 15:18:14 -07005642 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005643#ifdef CONFIG_SCHED_SMT
5644static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005645static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005646
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005647static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5648 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005649{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005650 if (sg)
5651 *sg = &per_cpu(sched_group_cpus, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005652 return cpu;
5653}
5654#endif
5655
Ingo Molnar48f24c42006-07-03 00:25:40 -07005656/*
5657 * multi-core sched-domains:
5658 */
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005659#ifdef CONFIG_SCHED_MC
5660static DEFINE_PER_CPU(struct sched_domain, core_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005661static DEFINE_PER_CPU(struct sched_group, sched_group_core);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005662#endif
5663
5664#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005665static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5666 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005667{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005668 int group;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005669 cpumask_t mask = cpu_sibling_map[cpu];
5670 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005671 group = first_cpu(mask);
5672 if (sg)
5673 *sg = &per_cpu(sched_group_core, group);
5674 return group;
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005675}
5676#elif defined(CONFIG_SCHED_MC)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005677static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5678 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005679{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005680 if (sg)
5681 *sg = &per_cpu(sched_group_core, cpu);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005682 return cpu;
5683}
5684#endif
5685
Linus Torvalds1da177e2005-04-16 15:20:36 -07005686static DEFINE_PER_CPU(struct sched_domain, phys_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005687static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005688
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005689static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5690 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005691{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005692 int group;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005693#ifdef CONFIG_SCHED_MC
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005694 cpumask_t mask = cpu_coregroup_map(cpu);
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005695 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005696 group = first_cpu(mask);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005697#elif defined(CONFIG_SCHED_SMT)
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005698 cpumask_t mask = cpu_sibling_map[cpu];
5699 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005700 group = first_cpu(mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005701#else
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005702 group = cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005703#endif
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005704 if (sg)
5705 *sg = &per_cpu(sched_group_phys, group);
5706 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005707}
5708
5709#ifdef CONFIG_NUMA
John Hawkes9c1cfda2005-09-06 15:18:14 -07005710/*
5711 * init_sched_build_groups() can't handle what we want to do with node
5712 * groups, so roll our own. Now each node has its own list of groups which
5713 * gets dynamically allocated.
5714 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005715static DEFINE_PER_CPU(struct sched_domain, node_domains);
John Hawkesd1b55132005-09-06 15:18:14 -07005716static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
John Hawkes9c1cfda2005-09-06 15:18:14 -07005717
5718static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005719static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005720
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005721static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5722 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005723{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005724 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5725 int group;
5726
5727 cpus_and(nodemask, nodemask, *cpu_map);
5728 group = first_cpu(nodemask);
5729
5730 if (sg)
5731 *sg = &per_cpu(sched_group_allnodes, group);
5732 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005733}
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005734
Siddha, Suresh B08069032006-03-27 01:15:23 -08005735static void init_numa_sched_groups_power(struct sched_group *group_head)
5736{
5737 struct sched_group *sg = group_head;
5738 int j;
5739
5740 if (!sg)
5741 return;
5742next_sg:
5743 for_each_cpu_mask(j, sg->cpumask) {
5744 struct sched_domain *sd;
5745
5746 sd = &per_cpu(phys_domains, j);
5747 if (j != first_cpu(sd->groups->cpumask)) {
5748 /*
5749 * Only add "power" once for each
5750 * physical package.
5751 */
5752 continue;
5753 }
5754
Eric Dumazet5517d862007-05-08 00:32:57 -07005755 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
Siddha, Suresh B08069032006-03-27 01:15:23 -08005756 }
5757 sg = sg->next;
5758 if (sg != group_head)
5759 goto next_sg;
5760}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005761#endif
5762
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005763#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005764/* Free memory allocated for various sched_group structures */
5765static void free_sched_groups(const cpumask_t *cpu_map)
5766{
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005767 int cpu, i;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005768
5769 for_each_cpu_mask(cpu, *cpu_map) {
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005770 struct sched_group **sched_group_nodes
5771 = sched_group_nodes_bycpu[cpu];
5772
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005773 if (!sched_group_nodes)
5774 continue;
5775
5776 for (i = 0; i < MAX_NUMNODES; i++) {
5777 cpumask_t nodemask = node_to_cpumask(i);
5778 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5779
5780 cpus_and(nodemask, nodemask, *cpu_map);
5781 if (cpus_empty(nodemask))
5782 continue;
5783
5784 if (sg == NULL)
5785 continue;
5786 sg = sg->next;
5787next_sg:
5788 oldsg = sg;
5789 sg = sg->next;
5790 kfree(oldsg);
5791 if (oldsg != sched_group_nodes[i])
5792 goto next_sg;
5793 }
5794 kfree(sched_group_nodes);
5795 sched_group_nodes_bycpu[cpu] = NULL;
5796 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005797}
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005798#else
5799static void free_sched_groups(const cpumask_t *cpu_map)
5800{
5801}
5802#endif
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005803
Linus Torvalds1da177e2005-04-16 15:20:36 -07005804/*
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005805 * Initialize sched groups cpu_power.
5806 *
5807 * cpu_power indicates the capacity of a sched group, which is used while
5808 * distributing the load between different sched groups in a sched domain.
5809 * Typically cpu_power for all the groups in a sched domain will be the same
5810 * unless there are asymmetries in the topology. If there are asymmetries, the
5811 * group having more cpu_power will pick up more load compared to the group
5812 * having less cpu_power.
5813 *
5814 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5815 * the maximum number of tasks a group can handle in the presence of other idle
5816 * or lightly loaded groups in the same sched domain.
5817 */
5818static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5819{
5820 struct sched_domain *child;
5821 struct sched_group *group;
5822
5823 WARN_ON(!sd || !sd->groups);
5824
5825 if (cpu != first_cpu(sd->groups->cpumask))
5826 return;
5827
5828 child = sd->child;
5829
Eric Dumazet5517d862007-05-08 00:32:57 -07005830 sd->groups->__cpu_power = 0;
5831
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005832 /*
5833 * For the perf policy, if the groups in the child domain share resources
5834 * (for example cores sharing some portions of the cache hierarchy
5835 * or SMT), then set this domain's group cpu_power such that each group
5836 * can handle only one task when there are other idle groups in the
5837 * same sched domain.
5838 */
5839 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5840 (child->flags &
5841 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
Eric Dumazet5517d862007-05-08 00:32:57 -07005842 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005843 return;
5844 }
5845
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005846 /*
5847 * Add the cpu_power of each child group to this group's cpu_power.
5848 */
5849 group = child->groups;
5850 do {
Eric Dumazet5517d862007-05-08 00:32:57 -07005851 sg_inc_cpu_power(sd->groups, group->__cpu_power);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005852 group = group->next;
5853 } while (group != child->groups);
5854}
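/*
 * Worked example (illustrative, not from the original source): take an MC
 * domain over one core with two SMT siblings.  The SMT child has
 * SD_SHARE_CPUPOWER set, so with the default performance policy the first
 * branch above caps the MC group at SCHED_LOAD_SCALE: the whole core
 * advertises room for only one task while there are idle groups elsewhere.
 * If SD_POWERSAVINGS_BALANCE is set on the MC domain instead, the second
 * branch sums the two sibling groups, giving 2 * SCHED_LOAD_SCALE and
 * letting the balancer pack two tasks onto the core.
 */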
5855
5856/*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005857 * Build sched domains for a given set of cpus and attach the sched domains
5858 * to the individual cpus
Linus Torvalds1da177e2005-04-16 15:20:36 -07005859 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005860static int build_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005861{
5862 int i;
John Hawkesd1b55132005-09-06 15:18:14 -07005863#ifdef CONFIG_NUMA
5864 struct sched_group **sched_group_nodes = NULL;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005865 int sd_allnodes = 0;
John Hawkesd1b55132005-09-06 15:18:14 -07005866
5867 /*
5868 * Allocate the per-node list of sched groups
5869 */
Ingo Molnardd41f592007-07-09 18:51:59 +02005870 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
Srivatsa Vaddagirid3a5aa92006-06-27 02:54:39 -07005871 GFP_KERNEL);
John Hawkesd1b55132005-09-06 15:18:14 -07005872 if (!sched_group_nodes) {
5873 printk(KERN_WARNING "Can not alloc sched group node list\n");
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005874 return -ENOMEM;
John Hawkesd1b55132005-09-06 15:18:14 -07005875 }
5876 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5877#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005878
5879 /*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005880 * Set up domains for cpus specified by the cpu_map.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005881 */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005882 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005883 struct sched_domain *sd = NULL, *p;
5884 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5885
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005886 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005887
5888#ifdef CONFIG_NUMA
Ingo Molnardd41f592007-07-09 18:51:59 +02005889 if (cpus_weight(*cpu_map) >
5890 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
John Hawkes9c1cfda2005-09-06 15:18:14 -07005891 sd = &per_cpu(allnodes_domains, i);
5892 *sd = SD_ALLNODES_INIT;
5893 sd->span = *cpu_map;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005894 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005895 p = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005896 sd_allnodes = 1;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005897 } else
5898 p = NULL;
5899
Linus Torvalds1da177e2005-04-16 15:20:36 -07005900 sd = &per_cpu(node_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005901 *sd = SD_NODE_INIT;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005902 sd->span = sched_domain_node_span(cpu_to_node(i));
5903 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005904 if (p)
5905 p->child = sd;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005906 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005907#endif
5908
5909 p = sd;
5910 sd = &per_cpu(phys_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005911 *sd = SD_CPU_INIT;
5912 sd->span = nodemask;
5913 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005914 if (p)
5915 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005916 cpu_to_phys_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005917
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005918#ifdef CONFIG_SCHED_MC
5919 p = sd;
5920 sd = &per_cpu(core_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005921 *sd = SD_MC_INIT;
5922 sd->span = cpu_coregroup_map(i);
5923 cpus_and(sd->span, sd->span, *cpu_map);
5924 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005925 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005926 cpu_to_core_group(i, cpu_map, &sd->groups);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005927#endif
5928
Linus Torvalds1da177e2005-04-16 15:20:36 -07005929#ifdef CONFIG_SCHED_SMT
5930 p = sd;
5931 sd = &per_cpu(cpu_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005932 *sd = SD_SIBLING_INIT;
5933 sd->span = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005934 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005935 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005936 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005937 cpu_to_cpu_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005938#endif
5939 }
5940
5941#ifdef CONFIG_SCHED_SMT
5942 /* Set up CPU (sibling) groups */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005943 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005944 cpumask_t this_sibling_map = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005945 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005946 if (i != first_cpu(this_sibling_map))
5947 continue;
5948
Ingo Molnardd41f592007-07-09 18:51:59 +02005949 init_sched_build_groups(this_sibling_map, cpu_map,
5950 &cpu_to_cpu_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005951 }
5952#endif
5953
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005954#ifdef CONFIG_SCHED_MC
5955 /* Set up multi-core groups */
5956 for_each_cpu_mask(i, *cpu_map) {
5957 cpumask_t this_core_map = cpu_coregroup_map(i);
5958 cpus_and(this_core_map, this_core_map, *cpu_map);
5959 if (i != first_cpu(this_core_map))
5960 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02005961 init_sched_build_groups(this_core_map, cpu_map,
5962 &cpu_to_core_group);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005963 }
5964#endif
5965
Linus Torvalds1da177e2005-04-16 15:20:36 -07005966 /* Set up physical groups */
5967 for (i = 0; i < MAX_NUMNODES; i++) {
5968 cpumask_t nodemask = node_to_cpumask(i);
5969
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005970 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005971 if (cpus_empty(nodemask))
5972 continue;
5973
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005974 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005975 }
5976
5977#ifdef CONFIG_NUMA
5978 /* Set up node groups */
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005979 if (sd_allnodes)
Ingo Molnardd41f592007-07-09 18:51:59 +02005980 init_sched_build_groups(*cpu_map, cpu_map,
5981 &cpu_to_allnodes_group);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005982
5983 for (i = 0; i < MAX_NUMNODES; i++) {
5984 /* Set up node groups */
5985 struct sched_group *sg, *prev;
5986 cpumask_t nodemask = node_to_cpumask(i);
5987 cpumask_t domainspan;
5988 cpumask_t covered = CPU_MASK_NONE;
5989 int j;
5990
5991 cpus_and(nodemask, nodemask, *cpu_map);
John Hawkesd1b55132005-09-06 15:18:14 -07005992 if (cpus_empty(nodemask)) {
5993 sched_group_nodes[i] = NULL;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005994 continue;
John Hawkesd1b55132005-09-06 15:18:14 -07005995 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005996
5997 domainspan = sched_domain_node_span(i);
5998 cpus_and(domainspan, domainspan, *cpu_map);
5999
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07006000 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006001 if (!sg) {
6002 printk(KERN_WARNING "Can not alloc domain group for "
6003 "node %d\n", i);
6004 goto error;
6005 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006006 sched_group_nodes[i] = sg;
6007 for_each_cpu_mask(j, nodemask) {
6008 struct sched_domain *sd;
6009 sd = &per_cpu(node_domains, j);
6010 sd->groups = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006011 }
Eric Dumazet5517d862007-05-08 00:32:57 -07006012 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006013 sg->cpumask = nodemask;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006014 sg->next = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006015 cpus_or(covered, covered, nodemask);
6016 prev = sg;
6017
6018 for (j = 0; j < MAX_NUMNODES; j++) {
6019 cpumask_t tmp, notcovered;
6020 int n = (i + j) % MAX_NUMNODES;
6021
6022 cpus_complement(notcovered, covered);
6023 cpus_and(tmp, notcovered, *cpu_map);
6024 cpus_and(tmp, tmp, domainspan);
6025 if (cpus_empty(tmp))
6026 break;
6027
6028 nodemask = node_to_cpumask(n);
6029 cpus_and(tmp, tmp, nodemask);
6030 if (cpus_empty(tmp))
6031 continue;
6032
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07006033 sg = kmalloc_node(sizeof(struct sched_group),
6034 GFP_KERNEL, i);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006035 if (!sg) {
6036 printk(KERN_WARNING
6037 "Can not alloc domain group for node %d\n", j);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006038 goto error;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006039 }
Eric Dumazet5517d862007-05-08 00:32:57 -07006040 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006041 sg->cpumask = tmp;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006042 sg->next = prev->next;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006043 cpus_or(covered, covered, tmp);
6044 prev->next = sg;
6045 prev = sg;
6046 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006047 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006048#endif
6049
6050 /* Calculate CPU power for physical packages and nodes */
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006051#ifdef CONFIG_SCHED_SMT
6052 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006053 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6054
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006055 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006056 }
6057#endif
6058#ifdef CONFIG_SCHED_MC
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006059 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006060 struct sched_domain *sd = &per_cpu(core_domains, i);
6061
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006062 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006063 }
6064#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006065
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006066 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006067 struct sched_domain *sd = &per_cpu(phys_domains, i);
6068
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006069 init_sched_groups_power(i, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006070 }
6071
John Hawkes9c1cfda2005-09-06 15:18:14 -07006072#ifdef CONFIG_NUMA
Siddha, Suresh B08069032006-03-27 01:15:23 -08006073 for (i = 0; i < MAX_NUMNODES; i++)
6074 init_numa_sched_groups_power(sched_group_nodes[i]);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006075
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006076 if (sd_allnodes) {
6077 struct sched_group *sg;
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07006078
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006079 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07006080 init_numa_sched_groups_power(sg);
6081 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006082#endif
6083
Linus Torvalds1da177e2005-04-16 15:20:36 -07006084 /* Attach the domains */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006085 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006086 struct sched_domain *sd;
6087#ifdef CONFIG_SCHED_SMT
6088 sd = &per_cpu(cpu_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08006089#elif defined(CONFIG_SCHED_MC)
6090 sd = &per_cpu(core_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006091#else
6092 sd = &per_cpu(phys_domains, i);
6093#endif
6094 cpu_attach_domain(sd, i);
6095 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006096
6097 return 0;
6098
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006099#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006100error:
6101 free_sched_groups(cpu_map);
6102 return -ENOMEM;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006103#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006104}
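/*
 * Resulting hierarchy (sketch, derived from the code above): for each CPU
 * in *cpu_map the domains are stacked child -> parent as
 *
 *	cpu_domains (SMT siblings, if CONFIG_SCHED_SMT)
 *	  -> core_domains (cores in a package, if CONFIG_SCHED_MC)
 *	    -> phys_domains (the CPU's node-local physical mask)
 *	      -> node_domains / allnodes_domains (CONFIG_NUMA only)
 *
 * and cpu_attach_domain() installs the lowest configured level as rq->sd,
 * after degenerate levels have been pruned.
 */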
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006105/*
6106 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6107 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006108static int arch_init_sched_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006109{
6110 cpumask_t cpu_default_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006111 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006112
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006113 /*
6114 * Setup mask for cpus without special case scheduling requirements.
6115 * For now this just excludes isolated cpus, but could be used to
6116 * exclude other special cases in the future.
6117 */
6118 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6119
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006120 err = build_sched_domains(&cpu_default_map);
6121
6122 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006123}
6124
6125static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006126{
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006127 free_sched_groups(cpu_map);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006128}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006129
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006130/*
6131 * Detach sched domains from a group of cpus specified in cpu_map
6132 * These cpus will now be attached to the NULL domain
6133 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08006134static void detach_destroy_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006135{
6136 int i;
6137
6138 for_each_cpu_mask(i, *cpu_map)
6139 cpu_attach_domain(NULL, i);
6140 synchronize_sched();
6141 arch_destroy_sched_domains(cpu_map);
6142}
6143
6144/*
6145 * Partition sched domains as specified by the cpumasks below.
6146 * This attaches all cpus from the cpumasks to the NULL domain,
6147 * waits for an RCU quiescent period, recalculates sched
6148 * domain information and then attaches them back to the
6149 * correct sched domains.
6150 * Call with the hotplug lock held.
6151 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006152int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006153{
6154 cpumask_t change_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006155 int err = 0;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006156
6157 cpus_and(*partition1, *partition1, cpu_online_map);
6158 cpus_and(*partition2, *partition2, cpu_online_map);
6159 cpus_or(change_map, *partition1, *partition2);
6160
6161 /* Detach sched domains from all of the affected cpus */
6162 detach_destroy_domains(&change_map);
6163 if (!cpus_empty(*partition1))
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006164 err = build_sched_domains(partition1);
6165 if (!err && !cpus_empty(*partition2))
6166 err = build_sched_domains(partition2);
6167
6168 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006169}
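/*
 * Usage sketch (hypothetical masks, not from the original source): a caller
 * such as the cpuset code splits the online CPUs into two exclusive sets
 * and rebuilds the domains for each while holding the hotplug lock:
 *
 *	cpumask_t a = CPU_MASK_NONE, b = CPU_MASK_NONE;
 *
 *	cpu_set(0, a); cpu_set(1, a);
 *	cpu_set(2, b); cpu_set(3, b);
 *	partition_sched_domains(&a, &b);
 *
 * Both masks are ANDed with cpu_online_map above, so offline CPUs in either
 * partition are simply ignored.
 */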
6170
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006171#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6172int arch_reinit_sched_domains(void)
6173{
6174 int err;
6175
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006176 mutex_lock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006177 detach_destroy_domains(&cpu_online_map);
6178 err = arch_init_sched_domains(&cpu_online_map);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006179 mutex_unlock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006180
6181 return err;
6182}
6183
6184static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6185{
6186 int ret;
6187
6188 if (buf[0] != '0' && buf[0] != '1')
6189 return -EINVAL;
6190
6191 if (smt)
6192 sched_smt_power_savings = (buf[0] == '1');
6193 else
6194 sched_mc_power_savings = (buf[0] == '1');
6195
6196 ret = arch_reinit_sched_domains();
6197
6198 return ret ? ret : count;
6199}
6200
6201int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6202{
6203 int err = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006204
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006205#ifdef CONFIG_SCHED_SMT
6206 if (smt_capable())
6207 err = sysfs_create_file(&cls->kset.kobj,
6208 &attr_sched_smt_power_savings.attr);
6209#endif
6210#ifdef CONFIG_SCHED_MC
6211 if (!err && mc_capable())
6212 err = sysfs_create_file(&cls->kset.kobj,
6213 &attr_sched_mc_power_savings.attr);
6214#endif
6215 return err;
6216}
6217#endif
6218
6219#ifdef CONFIG_SCHED_MC
6220static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6221{
6222 return sprintf(page, "%u\n", sched_mc_power_savings);
6223}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006224static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6225 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006226{
6227 return sched_power_savings_store(buf, count, 0);
6228}
6229SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6230 sched_mc_power_savings_store);
6231#endif
6232
6233#ifdef CONFIG_SCHED_SMT
6234static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6235{
6236 return sprintf(page, "%u\n", sched_smt_power_savings);
6237}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006238static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6239 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006240{
6241 return sched_power_savings_store(buf, count, 1);
6242}
6243SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6244 sched_smt_power_savings_store);
6245#endif
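/*
 * Example (assumed sysfs path, not part of this file): the attributes above
 * are created by sched_create_sysfs_power_savings_entries() under the cpu
 * sysdev class, so on a typical system they appear as
 *
 *	/sys/devices/system/cpu/sched_mc_power_savings
 *	/sys/devices/system/cpu/sched_smt_power_savings
 *
 * Writing '1' (or '0') to either file goes through
 * sched_power_savings_store(), which flips the corresponding flag and calls
 * arch_reinit_sched_domains() to rebuild the domain trees.
 */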
6246
Linus Torvalds1da177e2005-04-16 15:20:36 -07006247/*
6248 * Force a reinitialization of the sched domains hierarchy. The domains
6249 * and groups cannot be updated in place without racing with the balancing
Nick Piggin41c7ce92005-06-25 14:57:24 -07006250 * code, so we temporarily attach all running cpus to the NULL domain
Linus Torvalds1da177e2005-04-16 15:20:36 -07006251 * which will prevent rebalancing while the sched domains are recalculated.
6252 */
6253static int update_sched_domains(struct notifier_block *nfb,
6254 unsigned long action, void *hcpu)
6255{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006256 switch (action) {
6257 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006258 case CPU_UP_PREPARE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006259 case CPU_DOWN_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006260 case CPU_DOWN_PREPARE_FROZEN:
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006261 detach_destroy_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006262 return NOTIFY_OK;
6263
6264 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006265 case CPU_UP_CANCELED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006266 case CPU_DOWN_FAILED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006267 case CPU_DOWN_FAILED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006268 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006269 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006270 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006271 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006272 /*
6273 * Fall through and re-initialise the domains.
6274 */
6275 break;
6276 default:
6277 return NOTIFY_DONE;
6278 }
6279
6280 /* The hotplug lock is already held by cpu_up/cpu_down */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006281 arch_init_sched_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006282
6283 return NOTIFY_OK;
6284}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006285
6286void __init sched_init_smp(void)
6287{
Nick Piggin5c1e1762006-10-03 01:14:04 -07006288 cpumask_t non_isolated_cpus;
6289
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006290 mutex_lock(&sched_hotcpu_mutex);
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006291 arch_init_sched_domains(&cpu_online_map);
Nathan Lynche5e56732007-01-10 23:15:28 -08006292 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006293 if (cpus_empty(non_isolated_cpus))
6294 cpu_set(smp_processor_id(), non_isolated_cpus);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006295 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006296 /* XXX: Theoretical race here - CPU may be hotplugged now */
6297 hotcpu_notifier(update_sched_domains, 0);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006298
6299 /* Move init over to a non-isolated CPU */
6300 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6301 BUG();
Ingo Molnardd41f592007-07-09 18:51:59 +02006302 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006303}
6304#else
6305void __init sched_init_smp(void)
6306{
Ingo Molnardd41f592007-07-09 18:51:59 +02006307 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006308}
6309#endif /* CONFIG_SMP */
6310
6311int in_sched_functions(unsigned long addr)
6312{
6313 /* Linker adds these: start and end of __sched functions */
6314 extern char __sched_text_start[], __sched_text_end[];
Ingo Molnar48f24c42006-07-03 00:25:40 -07006315
Linus Torvalds1da177e2005-04-16 15:20:36 -07006316 return in_lock_functions(addr) ||
6317 (addr >= (unsigned long)__sched_text_start
6318 && addr < (unsigned long)__sched_text_end);
6319}
6320
Ingo Molnardd41f592007-07-09 18:51:59 +02006321static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6322{
6323 cfs_rq->tasks_timeline = RB_ROOT;
6324 cfs_rq->fair_clock = 1;
6325#ifdef CONFIG_FAIR_GROUP_SCHED
6326 cfs_rq->rq = rq;
6327#endif
6328}
6329
Linus Torvalds1da177e2005-04-16 15:20:36 -07006330void __init sched_init(void)
6331{
Ingo Molnardd41f592007-07-09 18:51:59 +02006332 u64 now = sched_clock();
Christoph Lameter476f3532007-05-06 14:48:58 -07006333 int highest_cpu = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006334 int i, j;
6335
6336 /*
6337 * Link up the scheduling class hierarchy:
6338 */
6339 rt_sched_class.next = &fair_sched_class;
6340 fair_sched_class.next = &idle_sched_class;
6341 idle_sched_class.next = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006342
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08006343 for_each_possible_cpu(i) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006344 struct rt_prio_array *array;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006345 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006346
6347 rq = cpu_rq(i);
6348 spin_lock_init(&rq->lock);
Ingo Molnarfcb99372006-07-03 00:25:10 -07006349 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
Nick Piggin78979862005-06-25 14:57:13 -07006350 rq->nr_running = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006351 rq->clock = 1;
6352 init_cfs_rq(&rq->cfs, rq);
6353#ifdef CONFIG_FAIR_GROUP_SCHED
6354 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6355 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6356#endif
6357 rq->ls.load_update_last = now;
6358 rq->ls.load_update_start = now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006359
Ingo Molnardd41f592007-07-09 18:51:59 +02006360 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6361 rq->cpu_load[j] = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006362#ifdef CONFIG_SMP
Nick Piggin41c7ce92005-06-25 14:57:24 -07006363 rq->sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006364 rq->active_balance = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006365 rq->next_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006366 rq->push_cpu = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07006367 rq->cpu = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006368 rq->migration_thread = NULL;
6369 INIT_LIST_HEAD(&rq->migration_queue);
6370#endif
6371 atomic_set(&rq->nr_iowait, 0);
6372
Ingo Molnardd41f592007-07-09 18:51:59 +02006373 array = &rq->rt.active;
6374 for (j = 0; j < MAX_RT_PRIO; j++) {
6375 INIT_LIST_HEAD(array->queue + j);
6376 __clear_bit(j, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006377 }
Christoph Lameter476f3532007-05-06 14:48:58 -07006378 highest_cpu = i;
Ingo Molnardd41f592007-07-09 18:51:59 +02006379 /* delimiter for bitsearch: */
6380 __set_bit(MAX_RT_PRIO, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006381 }
6382
Peter Williams2dd73a42006-06-27 02:54:34 -07006383 set_load_weight(&init_task);
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006384
Christoph Lameterc9819f42006-12-10 02:20:25 -08006385#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07006386 nr_cpu_ids = highest_cpu + 1;
Christoph Lameterc9819f42006-12-10 02:20:25 -08006387 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6388#endif
6389
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006390#ifdef CONFIG_RT_MUTEXES
6391 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6392#endif
6393
Linus Torvalds1da177e2005-04-16 15:20:36 -07006394 /*
6395 * The boot idle thread does lazy MMU switching as well:
6396 */
6397 atomic_inc(&init_mm.mm_count);
6398 enter_lazy_tlb(&init_mm, current);
6399
6400 /*
6401 * Make us the idle thread. Technically, schedule() should not be
6402 * called from this thread; however, somewhere below it might be,
6403 * but because we are the idle thread, we just pick up running again
6404 * when this runqueue becomes "idle".
6405 */
6406 init_idle(current, smp_processor_id());
Ingo Molnardd41f592007-07-09 18:51:59 +02006407 /*
6408 * During early bootup we pretend to be a normal task:
6409 */
6410 current->sched_class = &fair_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006411}
6412
6413#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6414void __might_sleep(char *file, int line)
6415{
Ingo Molnar48f24c42006-07-03 00:25:40 -07006416#ifdef in_atomic
Linus Torvalds1da177e2005-04-16 15:20:36 -07006417 static unsigned long prev_jiffy; /* ratelimiting */
6418
6419 if ((in_atomic() || irqs_disabled()) &&
6420 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6421 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6422 return;
6423 prev_jiffy = jiffies;
Ingo Molnar91368d72006-03-23 03:00:54 -08006424 printk(KERN_ERR "BUG: sleeping function called from invalid"
Linus Torvalds1da177e2005-04-16 15:20:36 -07006425 " context at %s:%d\n", file, line);
6426 printk("in_atomic():%d, irqs_disabled():%d\n",
6427 in_atomic(), irqs_disabled());
Peter Zijlstraa4c410f2006-12-06 20:37:21 -08006428 debug_show_held_locks(current);
Ingo Molnar3117df02006-12-13 00:34:43 -08006429 if (irqs_disabled())
6430 print_irqtrace_events(current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006431 dump_stack();
6432 }
6433#endif
6434}
6435EXPORT_SYMBOL(__might_sleep);
6436#endif
6437
6438#ifdef CONFIG_MAGIC_SYSRQ
6439void normalize_rt_tasks(void)
6440{
Ingo Molnara0f98a12007-06-17 18:37:45 +02006441 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006442 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006443 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02006444 int on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006445
6446 read_lock_irq(&tasklist_lock);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006447 do_each_thread(g, p) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006448 p->se.fair_key = 0;
6449 p->se.wait_runtime = 0;
6450 p->se.wait_start_fair = 0;
6451 p->se.wait_start = 0;
6452 p->se.exec_start = 0;
6453 p->se.sleep_start = 0;
6454 p->se.sleep_start_fair = 0;
6455 p->se.block_start = 0;
6456 task_rq(p)->cfs.fair_clock = 0;
6457 task_rq(p)->clock = 0;
6458
6459 if (!rt_task(p)) {
6460 /*
6461 * Renice negative nice level userspace
6462 * tasks back to 0:
6463 */
6464 if (TASK_NICE(p) < 0 && p->mm)
6465 set_user_nice(p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006466 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02006467 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006468
Ingo Molnarb29739f2006-06-27 02:54:51 -07006469 spin_lock_irqsave(&p->pi_lock, flags);
6470 rq = __task_rq_lock(p);
Ingo Molnardd41f592007-07-09 18:51:59 +02006471#ifdef CONFIG_SMP
6472 /*
6473 * Do not touch the migration thread:
6474 */
6475 if (p == rq->migration_thread)
6476 goto out_unlock;
6477#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006478
Ingo Molnardd41f592007-07-09 18:51:59 +02006479 on_rq = p->se.on_rq;
6480 if (on_rq)
6481 deactivate_task(task_rq(p), p, 0);
6482 __setscheduler(rq, p, SCHED_NORMAL, 0);
6483 if (on_rq) {
6484 activate_task(task_rq(p), p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006485 resched_task(rq->curr);
6486 }
Ingo Molnardd41f592007-07-09 18:51:59 +02006487#ifdef CONFIG_SMP
6488 out_unlock:
6489#endif
Ingo Molnarb29739f2006-06-27 02:54:51 -07006490 __task_rq_unlock(rq);
6491 spin_unlock_irqrestore(&p->pi_lock, flags);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006492 } while_each_thread(g, p);
6493
Linus Torvalds1da177e2005-04-16 15:20:36 -07006494 read_unlock_irq(&tasklist_lock);
6495}
6496
6497#endif /* CONFIG_MAGIC_SYSRQ */
Linus Torvalds1df5c102005-09-12 07:59:21 -07006498
6499#ifdef CONFIG_IA64
6500/*
6501 * These functions are only useful for the IA64 MCA handling.
6502 *
6503 * They can only be called when the whole system has been
6504 * stopped - every CPU needs to be quiescent, and no scheduling
6505 * activity can take place. Using them for anything else would
6506 * be a serious bug, and as a result, they aren't even visible
6507 * under any other configuration.
6508 */
6509
6510/**
6511 * curr_task - return the current task for a given cpu.
6512 * @cpu: the processor in question.
6513 *
6514 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6515 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006516struct task_struct *curr_task(int cpu)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006517{
6518 return cpu_curr(cpu);
6519}
6520
6521/**
6522 * set_curr_task - set the current task for a given cpu.
6523 * @cpu: the processor in question.
6524 * @p: the task pointer to set.
6525 *
6526 * Description: This function must only be used when non-maskable interrupts
6527 * are serviced on a separate stack. It allows the architecture to switch the
6528 * notion of the current task on a cpu in a non-blocking manner. This function
6529 * must be called with all CPUs synchronized and interrupts disabled; the
6530 * caller must save the original value of the current task (see
6531 * curr_task() above) and restore that value before reenabling interrupts and
6532 * re-starting the system.
6533 *
6534 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6535 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006536void set_curr_task(int cpu, struct task_struct *p)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006537{
6538 cpu_curr(cpu) = p;
6539}
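/*
 * Usage sketch (illustrative only, based on the description above): an IA64
 * MCA handler that needs its own task current on a stopped CPU would follow
 * the save/switch/restore pattern spelled out in set_curr_task()'s comment:
 *
 *	struct task_struct *prev = curr_task(cpu);
 *
 *	set_curr_task(cpu, mca_task);	mca_task: hypothetical handler task
 *	... handle the machine check on the separate stack ...
 *	set_curr_task(cpu, prev);	restore before re-enabling interrupts
 *
 * The whole system must be stopped for the duration, as stressed above.
 */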
6540
6541#endif