/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>

#include <asm/tlb.h>
#include <asm/unistd.h>

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is the default implementation.
 * Architectures and sub-architectures can override this.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	return (unsigned long long)jiffies * (1000000000 / HZ);
}

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
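/*
 * For illustration only (assuming the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140, which are defined elsewhere): NICE_TO_PRIO(-20) == 100,
 * NICE_TO_PRIO(0) == 120 and NICE_TO_PRIO(19) == 139, and PRIO_TO_NICE()
 * simply inverts that mapping, so PRIO_TO_NICE(NICE_TO_PRIO(nice)) == nice.
 */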

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
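/*
 * Worked example, assuming HZ == 1000 (not guaranteed by this file):
 * one jiffy is then 1,000,000 ns, so NS_TO_JIFFIES(2500000) == 2 and
 * JIFFIES_TO_NS(2) == 2000000. Note that these are integer operations,
 * so sub-jiffy remainders are simply truncated.
 */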
93
Ingo Molnar6aa645e2007-07-09 18:51:58 +020094#define NICE_0_LOAD SCHED_LOAD_SCALE
95#define NICE_0_SHIFT SCHED_LOAD_SHIFT
96
Linus Torvalds1da177e2005-04-16 15:20:36 -070097/*
98 * These are the 'tuning knobs' of the scheduler:
99 *
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
103 */
104#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105#define DEF_TIMESLICE (100 * HZ / 1000)
106#define ON_RUNQUEUE_WEIGHT 30
107#define CHILD_PENALTY 95
108#define PARENT_PENALTY 100
109#define EXIT_WEIGHT 3
110#define PRIO_BONUS_RATIO 25
111#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
112#define INTERACTIVE_DELTA 2
113#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
114#define STARVATION_LIMIT (MAX_SLEEP_AVG)
115#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
116
117/*
118 * If a task is 'interactive' then we reinsert it in the active
119 * array after it has expired its current timeslice. (it will not
120 * continue to run immediately, it will still roundrobin with
121 * other interactive tasks.)
122 *
123 * This part scales the interactivity limit depending on niceness.
124 *
125 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
126 * Here are a few examples of different nice levels:
127 *
128 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
129 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
130 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
131 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
132 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
133 *
134 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
135 * priority range a task can explore, a value of '1' means the
136 * task is rated interactive.)
137 *
138 * Ie. nice +19 tasks can never get 'interactive' enough to be
139 * reinserted into the active array. And only heavily CPU-hog nice -20
140 * tasks will be expired. Default nice 0 tasks are somewhere between,
141 * it takes some effort for them to get interactive, but it's not
142 * too hard.
143 */
144
145#define CURRENT_BONUS(p) \
146 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
147 MAX_SLEEP_AVG)
148
149#define GRANULARITY (10 * HZ / 1000 ? : 1)
150
151#ifdef CONFIG_SMP
152#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
153 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
154 num_online_cpus())
155#else
156#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
157 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
158#endif
159
160#define SCALE(v1,v1_max,v2_max) \
161 (v1) * (v2_max) / (v1_max)
162
163#define DELTA(p) \
Martin Andersson013d3862006-03-27 01:15:18 -0800164 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
165 INTERACTIVE_DELTA)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700166
167#define TASK_INTERACTIVE(p) \
168 ((p)->prio <= (p)->static_prio - DELTA(p))
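/*
 * Sketch of the numbers above, assuming MAX_USER_PRIO == 40 (i.e. the
 * usual MAX_PRIO/MAX_RT_PRIO values) so that MAX_BONUS == 10: for a
 * nice-0 task DELTA(p) == 20*10/40 - 20*10/40 + INTERACTIVE_DELTA == 2,
 * so it is rated interactive once its dynamic prio drops at least 2
 * below its static prio; for nice -20, DELTA(p) == -3, which widens the
 * interactive window, matching the tables in the comment above.
 */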

#define INTERACTIVE_SLEEP(p) \
	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))

#define TASK_PREEMPTS_CURR(p, rq) \
	((p)->prio < (rq)->curr->prio)

#define SCALE_PRIO(x, prio) \
	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
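/*
 * Rough numbers for the above, assuming HZ == 1000 and the usual
 * MAX_PRIO == 140 / MAX_USER_PRIO == 40: a nice 0 task (static_prio 120)
 * gets SCALE_PRIO(DEF_TIMESLICE, 120) == 100*(140-120)/20 == 100ms,
 * nice -20 gets SCALE_PRIO(DEF_TIMESLICE*4, 100) == 400*40/20 == 800ms,
 * and nice +19 gets 100*1/20 == 5ms (== MIN_TIMESLICE here), matching
 * the range quoted in task_timeslice()'s comment below.
 */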

#ifdef CONFIG_SMP
/*
 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 */
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
	return reciprocal_divide(load, sg->reciprocal_cpu_power);
}
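/*
 * Illustrative only: with two CPUs each contributing SCHED_LOAD_SCALE,
 * __cpu_power would be 2*SCHED_LOAD_SCALE and sg_div_cpu_power(sg, load)
 * should behave like load / (2*SCHED_LOAD_SCALE), except that the divide
 * is performed as a multiply by the precomputed reciprocal_cpu_power
 * (see reciprocal_value()/reciprocal_divide() from linux/reciprocal_div.h,
 * included above).
 */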

/*
 * Each time a sched group cpu_power is changed,
 * we must compute its reciprocal value
 */
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
	sg->__cpu_power += val;
	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
#endif

/*
 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 * to time slice values: [800ms ... 100ms ... 5ms]
 *
 * The higher a thread's priority, the bigger timeslices
 * it gets during one round of execution. But even the lowest
 * priority thread gets MIN_TIMESLICE worth of execution time.
 */

static inline unsigned int task_timeslice(struct task_struct *p)
{
	return static_prio_timeslice(p->static_prio);
}

static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct load_stat {
	struct load_weight load;
	u64 load_update_start, load_update_last;
	unsigned long delta_fair, delta_exec, delta_stat;
};

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	s64 fair_clock;
	u64 exec_clock;
	s64 wait_runtime;
	u64 sleeper_bonus;
	unsigned long wait_runtime_overruns, wait_runtime_underruns;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;
	struct rb_node *rb_load_balance_curr;
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e. when none are currently running).
	 */
	struct sched_entity *curr;
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	int rt_load_balance_idx;
	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
};

/*
 * The prio-array type of the old scheduler:
 */
struct prio_array {
	unsigned int nr_active;
	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_PRIO];
};

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	spinlock_t lock;	/* runqueue lock */

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	unsigned long raw_weighted_load;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
	unsigned char in_nohz_recently;
#endif
	struct load_stat ls;	/* capture load from *all* tasks on this cpu */
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
#ifdef CONFIG_FAIR_GROUP_SCHED
	struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
#endif
	struct rt_rq rt;

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	unsigned long expired_timestamp;
	unsigned long long most_recent_timestamp;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	struct prio_array *active, *expired, arrays[2];
	int best_expired_prio;

	u64 clock, prev_clock_raw;
	s64 clock_max_delta;

	unsigned int clock_warps, clock_overflows;
	unsigned int clock_unstable_events;

	struct sched_class *load_balance_class;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct sched_domain *sd;

	/* For active balancing */
	int active_balance;
	int push_cpu;
	int cpu;		/* cpu of this runqueue */

	struct task_struct *migration_thread;
	struct list_head migration_queue;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;

	/* sys_sched_yield() stats */
	unsigned long yld_exp_empty;
	unsigned long yld_act_empty;
	unsigned long yld_both_empty;
	unsigned long yld_cnt;

	/* schedule() stats */
	unsigned long sched_switch;
	unsigned long sched_cnt;
	unsigned long sched_goidle;

	/* try_to_wake_up() stats */
	unsigned long ttwu_cnt;
	unsigned long ttwu_local;
#endif
	struct lock_class_key rq_lock_key;
};

static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);

static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
	rq->curr->sched_class->check_preempt_curr(rq, p);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

/*
 * Per-runqueue clock, as fine-grained as the platform can give us:
 */
static unsigned long long __rq_clock(struct rq *rq)
{
	u64 prev_raw = rq->prev_clock_raw;
	u64 now = sched_clock();
	s64 delta = now - prev_raw;
	u64 clock = rq->clock;

	/*
	 * Protect against sched_clock() occasionally going backwards:
	 */
	if (unlikely(delta < 0)) {
		clock++;
		rq->clock_warps++;
	} else {
		/*
		 * Catch too large forward jumps too:
		 */
		if (unlikely(delta > 2*TICK_NSEC)) {
			clock++;
			rq->clock_overflows++;
		} else {
			if (unlikely(delta > rq->clock_max_delta))
				rq->clock_max_delta = delta;
			clock += delta;
		}
	}

	rq->prev_clock_raw = now;
	rq->clock = clock;

	return clock;
}

static inline unsigned long long rq_clock(struct rq *rq)
{
	int this_cpu = smp_processor_id();

	if (this_cpu == cpu_of(rq))
		return __rq_clock(rq);

	return rq->clock;
}
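/*
 * Behavioural sketch of the clamping above: if sched_clock() steps
 * backwards by, say, 100ns, rq->clock still advances by 1 (and
 * clock_warps is bumped); if it jumps forward by more than 2*TICK_NSEC,
 * the jump is likewise collapsed to +1 (and clock_overflows is bumped).
 * The net effect is that rq->clock stays monotonic and advances by at
 * most 2*TICK_NSEC per update.
 */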

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
{
	p->se.cfs_rq = &task_rq(p)->cfs;
}
#else
static inline void set_task_cfs_rq(struct task_struct *p)
{
}
#endif

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->oncpu;
#else
	return rq->curr == p;
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	spin_unlock_irq(&rq->lock);
#else
	spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called with interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

repeat_lock_task:
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock(&rq->lock);
		goto repeat_lock_task;
	}
	return rq;
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct rq *rq;

repeat_lock_task:
	local_irq_save(*flags);
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock_irqrestore(&rq->lock, *flags);
		goto repeat_lock_task;
	}
	return rq;
}

static inline void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static inline struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	spin_lock(&rq->lock);

	return rq;
}

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP

#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif

static void resched_task(struct task_struct *p)
{
	int cpu;

	assert_spin_locked(&task_rq(p)->lock);

	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
		return;

	set_tsk_thread_flag(p, TIF_NEED_RESCHED);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

static void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	spin_unlock_irqrestore(&rq->lock, flags);
}
#else
static inline void resched_task(struct task_struct *p)
{
	assert_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}
#endif

static u64 div64_likely32(u64 dividend, unsigned long divisor)
{
#if BITS_PER_LONG == 32
	if (likely(dividend <= 0xffffffffULL))
		return (u32)dividend / divisor;
	do_div(dividend, divisor);

	return dividend;
#else
	return dividend / divisor;
#endif
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

static inline unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (unlikely(!lw->inv_weight))
		lw->inv_weight = WMULT_CONST / lw->weight;

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST)) {
		tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
				>> (WMULT_SHIFT/2);
	} else {
		tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
	}

	return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
}
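/*
 * Back-of-the-envelope example (illustrative values, not from this file):
 * with delta_exec == 1000000 (1ms), weight == NICE_0_LOAD and a runqueue
 * load of lw->weight == 2*NICE_0_LOAD, inv_weight becomes roughly
 * 2^32/(2*NICE_0_LOAD) and the result comes out to about 500000 - i.e.
 * the delta is scaled by weight/lw->weight using a multiply and shift
 * instead of a 64-bit divide, then clipped to sysctl_sched_runtime_limit.
 */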

static inline unsigned long
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
{
	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}

static void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static void __update_curr_load(struct rq *rq, struct load_stat *ls)
{
	if (rq->curr != rq->idle && ls->load.weight) {
		ls->delta_exec += ls->delta_stat;
		ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
		ls->delta_stat = 0;
	}
}

/*
 * Update delta_exec, delta_fair fields for rq.
 *
 * delta_fair clock advances at a rate inversely proportional to
 * total load (rq->ls.load.weight) on the runqueue, while
 * delta_exec advances at the same rate as wall-clock (provided
 * cpu is not idle).
 *
 * delta_exec / delta_fair is a measure of the (smoothened) load on this
 * runqueue over any given interval. This (smoothened) load is used
 * during load balance.
 *
 * This function is called /before/ updating rq->ls.load
 * and when switching tasks.
 */
static void update_curr_load(struct rq *rq, u64 now)
{
	struct load_stat *ls = &rq->ls;
	u64 start;

	start = ls->load_update_start;
	ls->load_update_start = now;
	ls->delta_stat += now - start;
	/*
	 * Stagger updates to ls->delta_fair. Very frequent updates
	 * can be expensive.
	 */
	if (ls->delta_stat >= sysctl_sched_stat_granularity)
		__update_curr_load(rq, ls);
}

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

/*
 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 * If static_prio_timeslice() is ever changed to break this assumption then
 * this code will need modification
 */
#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
#define load_weight(lp) \
	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(prio) \
	load_weight(static_prio_timeslice(prio))
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))

#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1 << 31)

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage.
 */
static const int prio_to_weight[40] = {
/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
/* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
/*   0 */ NICE_0_LOAD /* 1024 */,
/*   1 */   819,   655,   524,   419,   336,   268,   215,   172,   137,
/*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
};

static const u32 prio_to_wmult[40] = {
	48356,   60446,   75558,   94446,  118058,  147573,
	184467,  230589,  288233,  360285,  450347,
	562979,  703746,  879575, 1099582, 1374389,
	1717986, 2147483, 2684354, 3355443, 4194304,
	5244160, 6557201, 8196502, 10250518, 12782640,
	16025997, 19976592, 24970740, 31350126, 39045157,
	49367440, 61356675, 76695844, 95443717, 119304647,
	148102320, 186737708, 238609294, 286331153,
};
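/*
 * The two tables are meant to be consistent with each other: each
 * prio_to_wmult[] entry is approximately 2^32 / prio_to_weight[] for the
 * same nice level (e.g. nice 0: 4194304 == 2^32 / 1024), which is what
 * lets calc_delta_mine() replace the division by a task's weight with a
 * multiply by the precomputed inverse followed by a 32-bit shift.
 */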

static inline void
inc_load(struct rq *rq, const struct task_struct *p, u64 now)
{
	update_curr_load(rq, now);
	update_load_add(&rq->ls.load, p->se.load.weight);
}

static inline void
dec_load(struct rq *rq, const struct task_struct *p, u64 now)
{
	update_curr_load(rq, now);
	update_load_sub(&rq->ls.load, p->se.load.weight);
}

static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
	rq->nr_running++;
	inc_load(rq, p, now);
}

static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
	rq->nr_running--;
	dec_load(rq, p, now);
}

static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);

/*
 * runqueue iterator, to support SMP load-balancing between different
 * scheduling classes, without having to expose their internal data
 * structures to the load-balancing proper:
 */
struct rq_iterator {
	void *arg;
	struct task_struct *(*start)(void *);
	struct task_struct *(*next)(void *);
};

static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_nr_move, unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned, unsigned long *load_moved,
		      int this_best_prio, int best_prio, int best_prio_seen,
		      struct rq_iterator *iterator);

#include "sched_stats.h"
#include "sched_rt.c"
#include "sched_fair.c"
#include "sched_idletask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif

#define sched_class_highest (&rt_sched_class)

static void set_load_weight(struct task_struct *p)
{
	task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
	p->se.wait_runtime = 0;

	if (task_has_rt_policy(p)) {
		p->se.load.weight = prio_to_weight[0] * 2;
		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
		return;
	}

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		p->se.load.weight = WEIGHT_IDLEPRIO;
		p->se.load.inv_weight = WMULT_IDLEPRIO;
		return;
	}

	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
}

static void
enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
{
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, wakeup, now);
	p->se.on_rq = 1;
}

static void
dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
{
	p->sched_class->dequeue_task(rq, p, sleep, now);
	p->se.on_rq = 0;
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/*
 * activate_task - move a task to the runqueue.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, wakeup, now);
	inc_nr_running(p, rq, now);
}

/*
 * activate_idle_task - move idle task to the _front_ of runqueue.
 */
static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, 0, now);
	inc_nr_running(p, rq, now);
}

/*
 * deactivate_task - remove a task from the runqueue.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, sleep, now);
	dec_nr_running(p, rq, now);
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->ls.load.weight;
}

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_SMP
	task_thread_info(p)->cpu = cpu;
	set_task_cfs_rq(p);
#endif
}

#ifdef CONFIG_SMP

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
	int old_cpu = task_cpu(p);
	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
	u64 clock_offset, fair_clock_offset;

	clock_offset = old_rq->clock - new_rq->clock;
	fair_clock_offset = old_rq->cfs.fair_clock -
			 new_rq->cfs.fair_clock;
	if (p->se.wait_start)
		p->se.wait_start -= clock_offset;
	if (p->se.wait_start_fair)
		p->se.wait_start_fair -= fair_clock_offset;
	if (p->se.sleep_start)
		p->se.sleep_start -= clock_offset;
	if (p->se.block_start)
		p->se.block_start -= clock_offset;
	if (p->se.sleep_start_fair)
		p->se.sleep_start_fair -= fair_clock_offset;

	__set_task_cpu(p, new_cpu);
}

struct migration_req {
	struct list_head list;

	struct task_struct *task;
	int dest_cpu;

	struct completion done;
};

/*
 * The task's runqueue lock must be held.
 * Returns true if you have to wait for migration thread.
 */
static int
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
	struct rq *rq = task_rq(p);

	/*
	 * If the task is not on a runqueue (and not running), then
	 * it is sufficient to simply update the task's cpu field.
	 */
	if (!p->se.on_rq && !task_running(rq, p)) {
		set_task_cpu(p, dest_cpu);
		return 0;
	}

	init_completion(&req->done);
	req->task = p;
	req->dest_cpu = dest_cpu;
	list_add(&req->list, &rq->migration_queue);

	return 1;
}

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
void wait_task_inactive(struct task_struct *p)
{
	unsigned long flags;
	int running, on_rq;
	struct rq *rq;

repeat:
	/*
	 * We do the initial early heuristics without holding
	 * any task-queue locks at all. We'll only try to get
	 * the runqueue lock when things look like they will
	 * work out!
	 */
	rq = task_rq(p);

	/*
	 * If the task is actively running on another CPU
	 * still, just relax and busy-wait without holding
	 * any locks.
	 *
	 * NOTE! Since we don't hold any locks, it's not
	 * even sure that "rq" stays as the right runqueue!
	 * But we don't care, since "task_running()" will
	 * return false if the runqueue has changed and p
	 * is actually now running somewhere else!
	 */
	while (task_running(rq, p))
		cpu_relax();

	/*
	 * Ok, time to look more closely! We need the rq
	 * lock now, to be *sure*. If we're wrong, we'll
	 * just go back and repeat.
	 */
	rq = task_rq_lock(p, &flags);
	running = task_running(rq, p);
	on_rq = p->se.on_rq;
	task_rq_unlock(rq, &flags);

	/*
	 * Was it really running after all now that we
	 * checked with the proper locks actually held?
	 *
	 * Oops. Go back and try again..
	 */
	if (unlikely(running)) {
		cpu_relax();
		goto repeat;
	}

	/*
	 * It's not enough that it's not actively running,
	 * it must be off the runqueue _entirely_, and not
	 * preempted!
	 *
	 * So if it was still runnable (but just not actively
	 * running right now), it's preempted, and we should
	 * yield - it could be a while.
	 */
	if (unlikely(on_rq)) {
		yield();
		goto repeat;
	}

	/*
	 * Ahh, all good. It wasn't running, and it wasn't
	 * runnable, which means that it will never become
	 * running in the future either. We're all done!
	 */
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}

/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static inline unsigned long source_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0)
		return total;

	return min(rq->cpu_load[type-1], total);
}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */
static inline unsigned long target_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0)
		return total;

	return max(rq->cpu_load[type-1], total);
}

/*
 * Return the average load per task on the cpu's run queue
 */
static inline unsigned long cpu_avg_load_per_task(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);
	unsigned long n = rq->nr_running;

	return n ? total / n : SCHED_LOAD_SCALE;
}
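/*
 * Example of the weighted-load view used above (illustrative values):
 * three runnable nice-0 tasks give weighted_cpuload() == 3*NICE_0_LOAD,
 * so cpu_avg_load_per_task() == NICE_0_LOAD. An empty runqueue reports
 * SCHED_LOAD_SCALE rather than dividing by zero, which gives the
 * wake-affine checks in try_to_wake_up() below a non-zero per-task
 * estimate to compare against.
 */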

/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
	unsigned long min_load = ULONG_MAX, this_load = 0;
	int load_idx = sd->forkexec_idx;
	int imbalance = 100 + (sd->imbalance_pct-100)/2;

	do {
		unsigned long load, avg_load;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
			goto nextgroup;

		local_group = cpu_isset(this_cpu, group->cpumask);

		/* Tally up the load of all CPUs in the group */
		avg_load = 0;

		for_each_cpu_mask(i, group->cpumask) {
			/* Bias balancing toward cpus of our domain */
			if (local_group)
				load = source_load(i, load_idx);
			else
				load = target_load(i, load_idx);

			avg_load += load;
		}

		/* Adjust by relative CPU power of the group */
		avg_load = sg_div_cpu_power(group,
				avg_load * SCHED_LOAD_SCALE);

		if (local_group) {
			this_load = avg_load;
			this = group;
		} else if (avg_load < min_load) {
			min_load = avg_load;
			idlest = group;
		}
nextgroup:
		group = group->next;
	} while (group != sd->groups);

	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;
	return idlest;
}

/*
 * find_idlest_cpu - find the idlest cpu among the cpus in group.
 */
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
	cpumask_t tmp;
	unsigned long load, min_load = ULONG_MAX;
	int idlest = -1;
	int i;

	/* Traverse only the allowed CPUs */
	cpus_and(tmp, group->cpumask, p->cpus_allowed);

	for_each_cpu_mask(i, tmp) {
		load = weighted_cpuload(i);

		if (load < min_load || (load == min_load && i == this_cpu)) {
			min_load = load;
			idlest = i;
		}
	}

	return idlest;
}

/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, i.e. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
static int sched_balance_self(int cpu, int flag)
{
	struct task_struct *t = current;
	struct sched_domain *tmp, *sd = NULL;

	for_each_domain(cpu, tmp) {
		/*
		 * If power savings logic is enabled for a domain, stop there.
		 */
		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
			break;
		if (tmp->flags & flag)
			sd = tmp;
	}

	while (sd) {
		cpumask_t span;
		struct sched_group *group;
		int new_cpu, weight;

		if (!(sd->flags & flag)) {
			sd = sd->child;
			continue;
		}

		span = sd->span;
		group = find_idlest_group(sd, t, cpu);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, t, cpu);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		sd = NULL;
		weight = cpus_weight(span);
		for_each_domain(cpu, tmp) {
			if (weight <= cpus_weight(tmp->span))
				break;
			if (tmp->flags & flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}

	return cpu;
}

#endif /* CONFIG_SMP */

/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available. The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	cpumask_t tmp;
	struct sched_domain *sd;
	int i;

	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must be also busy (in most cases) as they didn't already
	 * pick up the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This will avoid the checks and cache miss
	 * penalties associated with that.
	 */
	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
		return cpu;

	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_IDLE) {
			cpus_and(tmp, sd->span, p->cpus_allowed);
			for_each_cpu_mask(i, tmp) {
				if (idle_cpu(i))
					return i;
			}
		}
		else
			break;
	}
	return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif

/***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * returns failure only if the task is already active.
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
	int cpu, this_cpu, success = 0;
	unsigned long flags;
	long old_state;
	struct rq *rq;
#ifdef CONFIG_SMP
	struct sched_domain *sd, *this_sd = NULL;
	unsigned long load, this_load;
	int new_cpu;
#endif

	rq = task_rq_lock(p, &flags);
	old_state = p->state;
	if (!(old_state & state))
		goto out;

	if (p->se.on_rq)
		goto out_running;

	cpu = task_cpu(p);
	this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
	if (unlikely(task_running(rq, p)))
		goto out_activate;

	new_cpu = cpu;

	schedstat_inc(rq, ttwu_cnt);
	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		goto out_set_cpu;
	}

	for_each_domain(this_cpu, sd) {
		if (cpu_isset(cpu, sd->span)) {
			schedstat_inc(sd, ttwu_wake_remote);
			this_sd = sd;
			break;
		}
	}

	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		goto out_set_cpu;

	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
	if (this_sd) {
		int idx = this_sd->wake_idx;
		unsigned int imbalance;

		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;

		load = source_load(cpu, idx);
		this_load = target_load(this_cpu, idx);

		new_cpu = this_cpu; /* Wake to this CPU if we can */

		if (this_sd->flags & SD_WAKE_AFFINE) {
			unsigned long tl = this_load;
			unsigned long tl_per_task;

			tl_per_task = cpu_avg_load_per_task(this_cpu);

			/*
			 * If sync wakeup then subtract the (maximum possible)
			 * effect of the currently running task from the load
			 * of the current CPU:
			 */
			if (sync)
				tl -= current->se.load.weight;

			if ((tl <= load &&
				tl + target_load(cpu, idx) <= tl_per_task) ||
			       100*(tl + p->se.load.weight) <= imbalance*load) {
				/*
				 * This domain has SD_WAKE_AFFINE and
				 * p is cache cold in this domain, and
				 * there is no bad imbalance.
				 */
				schedstat_inc(this_sd, ttwu_move_affine);
				goto out_set_cpu;
			}
		}

		/*
		 * Start passive balancing when half the imbalance_pct
		 * limit is reached.
		 */
		if (this_sd->flags & SD_WAKE_BALANCE) {
			if (imbalance*this_load <= 100*load) {
				schedstat_inc(this_sd, ttwu_move_balance);
				goto out_set_cpu;
			}
		}
	}

	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
	new_cpu = wake_idle(new_cpu, p);
	if (new_cpu != cpu) {
		set_task_cpu(p, new_cpu);
		task_rq_unlock(rq, &flags);
		/* might preempt at this point */
		rq = task_rq_lock(p, &flags);
1566 old_state = p->state;
1567 if (!(old_state & state))
1568 goto out;
Ingo Molnardd41f592007-07-09 18:51:59 +02001569 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001570 goto out_running;
1571
1572 this_cpu = smp_processor_id();
1573 cpu = task_cpu(p);
1574 }
1575
1576out_activate:
1577#endif /* CONFIG_SMP */
Ingo Molnardd41f592007-07-09 18:51:59 +02001578 activate_task(rq, p, 1);
Ingo Molnard79fc0f2005-09-10 00:26:12 -07001579 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001580 * Sync wakeups (i.e. those types of wakeups where the waker
1581 * has indicated that it will leave the CPU in short order)
1582 * don't trigger a preemption if the woken-up task will run on
1583 * this cpu. (In this case the 'I will reschedule' promise of
1584 * the waker guarantees that the freshly woken task is going
1585 * to be considered on this CPU.)
1586 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001587 if (!sync || cpu != this_cpu)
1588 check_preempt_curr(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589 success = 1;
1590
1591out_running:
1592 p->state = TASK_RUNNING;
1593out:
1594 task_rq_unlock(rq, &flags);
1595
1596 return success;
1597}
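/*
 * Illustrative example of the affine-wakeup test above (the numbers
 * are made up): with imbalance_pct == 125 the threshold becomes
 * imbalance = 100 + (125 - 100)/2 == 112.  If the task's previous CPU
 * has load == 2048, this CPU has tl == 1024 after the sync adjustment,
 * and the task's se.load.weight == 1024, then 100*(tl + weight) ==
 * 204800 <= imbalance*load == 229376, so the wakeup is pulled over to
 * this_cpu (counted as ttwu_move_affine).
 */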
1598
Ingo Molnar36c8b582006-07-03 00:25:41 -07001599int fastcall wake_up_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001600{
1601 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1602 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1603}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001604EXPORT_SYMBOL(wake_up_process);
1605
Ingo Molnar36c8b582006-07-03 00:25:41 -07001606int fastcall wake_up_state(struct task_struct *p, unsigned int state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001607{
1608 return try_to_wake_up(p, state, 0);
1609}
1610
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611/*
1612 * Perform scheduler-related setup for a newly forked process p.
1613 * p is forked by current.
Ingo Molnardd41f592007-07-09 18:51:59 +02001614 *
1615 * __sched_fork() is the basic setup used by init_idle() too:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001616 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001617static void __sched_fork(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001618{
Ingo Molnardd41f592007-07-09 18:51:59 +02001619 p->se.wait_start_fair = 0;
1620 p->se.wait_start = 0;
1621 p->se.exec_start = 0;
1622 p->se.sum_exec_runtime = 0;
1623 p->se.delta_exec = 0;
1624 p->se.delta_fair_run = 0;
1625 p->se.delta_fair_sleep = 0;
1626 p->se.wait_runtime = 0;
1627 p->se.sum_wait_runtime = 0;
1628 p->se.sum_sleep_runtime = 0;
1629 p->se.sleep_start = 0;
1630 p->se.sleep_start_fair = 0;
1631 p->se.block_start = 0;
1632 p->se.sleep_max = 0;
1633 p->se.block_max = 0;
1634 p->se.exec_max = 0;
1635 p->se.wait_max = 0;
1636 p->se.wait_runtime_overruns = 0;
1637 p->se.wait_runtime_underruns = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001638
Ingo Molnardd41f592007-07-09 18:51:59 +02001639 INIT_LIST_HEAD(&p->run_list);
1640 p->se.on_rq = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001641
Linus Torvalds1da177e2005-04-16 15:20:36 -07001642 /*
1643 * We mark the process as running here, but have not actually
1644 * inserted it onto the runqueue yet. This guarantees that
1645 * nobody will actually run it, and a signal or other external
1646 * event cannot wake it up and insert it on the runqueue either.
1647 */
1648 p->state = TASK_RUNNING;
Ingo Molnardd41f592007-07-09 18:51:59 +02001649}
1650
1651/*
1652 * fork()/clone()-time setup:
1653 */
1654void sched_fork(struct task_struct *p, int clone_flags)
1655{
1656 int cpu = get_cpu();
1657
1658 __sched_fork(p);
1659
1660#ifdef CONFIG_SMP
1661 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1662#endif
1663 __set_task_cpu(p, cpu);
Ingo Molnarb29739f2006-06-27 02:54:51 -07001664
1665 /*
1666 * Make sure we do not leak PI boosting priority to the child:
1667 */
1668 p->prio = current->normal_prio;
1669
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001670#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
Ingo Molnardd41f592007-07-09 18:51:59 +02001671 if (likely(sched_info_on()))
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001672 memset(&p->sched_info, 0, sizeof(p->sched_info));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001673#endif
Chen, Kenneth Wd6077cb2006-02-14 13:53:10 -08001674#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
Nick Piggin4866cde2005-06-25 14:57:23 -07001675 p->oncpu = 0;
1676#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001677#ifdef CONFIG_PREEMPT
Nick Piggin4866cde2005-06-25 14:57:23 -07001678 /* Want to start with kernel preemption disabled. */
Al Viroa1261f52005-11-13 16:06:55 -08001679 task_thread_info(p)->preempt_count = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001680#endif
Nick Piggin476d1392005-06-25 14:57:29 -07001681 put_cpu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001682}
1683
1684/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001685 * After fork, the child runs first (the default). If set to 0, the
1686 * parent will (try to) run first.
1687 */
1688unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
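/*
 * Usage note (assuming this knob is wired up in kernel/sysctl.c as
 * "sched_child_runs_first"): it can then be toggled at runtime, e.g.
 * writing 0 to /proc/sys/kernel/sched_child_runs_first makes the
 * parent (try to) run first after fork.
 */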
1689
1690/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001691 * wake_up_new_task - wake up a newly created task for the first time.
1692 *
1693 * This function will do some initial scheduler statistics housekeeping
1694 * that must be done for every newly created context, then puts the task
1695 * on the runqueue and wakes it.
1696 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001697void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698{
1699 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02001700 struct rq *rq;
1701 int this_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001702
1703 rq = task_rq_lock(p, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001704 BUG_ON(p->state != TASK_RUNNING);
Ingo Molnardd41f592007-07-09 18:51:59 +02001705 this_cpu = smp_processor_id(); /* parent's CPU */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001706
1707 p->prio = effective_prio(p);
1708
Ingo Molnardd41f592007-07-09 18:51:59 +02001709 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1710 task_cpu(p) != this_cpu || !current->se.on_rq) {
1711 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001712 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001713 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02001714 * Let the scheduling class do new task startup
1715 * management (if any):
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001717 p->sched_class->task_new(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001718 }
Ingo Molnardd41f592007-07-09 18:51:59 +02001719 check_preempt_curr(rq, p);
1720 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001721}
1722
Linus Torvalds1da177e2005-04-16 15:20:36 -07001723/**
Nick Piggin4866cde2005-06-25 14:57:23 -07001724 * prepare_task_switch - prepare to switch tasks
1725 * @rq: the runqueue preparing to switch
1726 * @next: the task we are going to switch to.
1727 *
1728 * This is called with the rq lock held and interrupts off. It must
1729 * be paired with a subsequent finish_task_switch after the context
1730 * switch.
1731 *
1732 * prepare_task_switch sets up locking and calls architecture specific
1733 * hooks.
1734 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001735static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07001736{
1737 prepare_lock_switch(rq, next);
1738 prepare_arch_switch(next);
1739}
1740
1741/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001742 * finish_task_switch - clean up after a task-switch
Jeff Garzik344baba2005-09-07 01:15:17 -04001743 * @rq: runqueue associated with task-switch
Linus Torvalds1da177e2005-04-16 15:20:36 -07001744 * @prev: the thread we just switched away from.
1745 *
Nick Piggin4866cde2005-06-25 14:57:23 -07001746 * finish_task_switch must be called after the context switch, paired
1747 * with a prepare_task_switch call before the context switch.
1748 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1749 * and do any other architecture-specific cleanup actions.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001750 *
1751 * Note that we may have delayed dropping an mm in context_switch(). If
1752 * so, we finish that here outside of the runqueue lock. (Doing it
1753 * with the lock held can cause deadlocks; see schedule() for
1754 * details.)
1755 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001756static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001757 __releases(rq->lock)
1758{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001759 struct mm_struct *mm = rq->prev_mm;
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001760 long prev_state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001761
1762 rq->prev_mm = NULL;
1763
1764 /*
1765 * A task struct has one reference for its use as "current".
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001766 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001767 * schedule one last time. The schedule call will never return, and
1768 * the scheduled task must drop that reference.
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001769 * The test for TASK_DEAD must occur while the runqueue locks are
Linus Torvalds1da177e2005-04-16 15:20:36 -07001770 * still held, otherwise prev could be scheduled on another cpu, die
1771 * there before we look at prev->state, and then the reference would
1772 * be dropped twice.
1773 * Manfred Spraul <manfred@colorfullife.com>
1774 */
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001775 prev_state = prev->state;
Nick Piggin4866cde2005-06-25 14:57:23 -07001776 finish_arch_switch(prev);
1777 finish_lock_switch(rq, prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001778 if (mm)
1779 mmdrop(mm);
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001780 if (unlikely(prev_state == TASK_DEAD)) {
bibo maoc6fd91f2006-03-26 01:38:20 -08001781 /*
1782 * Remove function-return probe instances associated with this
1783 * task and put them back on the free list.
1784 */
1785 kprobe_flush_task(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001786 put_task_struct(prev);
bibo maoc6fd91f2006-03-26 01:38:20 -08001787 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001788}
1789
1790/**
1791 * schedule_tail - first thing a freshly forked thread must call.
1792 * @prev: the thread we just switched away from.
1793 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001794asmlinkage void schedule_tail(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001795 __releases(rq->lock)
1796{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001797 struct rq *rq = this_rq();
1798
Nick Piggin4866cde2005-06-25 14:57:23 -07001799 finish_task_switch(rq, prev);
1800#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1801 /* In this case, finish_task_switch does not reenable preemption */
1802 preempt_enable();
1803#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001804 if (current->set_child_tid)
1805 put_user(current->pid, current->set_child_tid);
1806}
1807
1808/*
1809 * context_switch - switch to the new MM and the new
1810 * thread's register state.
1811 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001812static inline void
Ingo Molnar70b97a72006-07-03 00:25:42 -07001813context_switch(struct rq *rq, struct task_struct *prev,
Ingo Molnar36c8b582006-07-03 00:25:41 -07001814 struct task_struct *next)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815{
Ingo Molnardd41f592007-07-09 18:51:59 +02001816 struct mm_struct *mm, *oldmm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817
Ingo Molnardd41f592007-07-09 18:51:59 +02001818 prepare_task_switch(rq, next);
1819 mm = next->mm;
1820 oldmm = prev->active_mm;
Zachary Amsden9226d122007-02-13 13:26:21 +01001821 /*
1822 * For paravirt, this is coupled with an exit in switch_to to
1823 * combine the page table reload and the switch backend into
1824 * one hypercall.
1825 */
1826 arch_enter_lazy_cpu_mode();
1827
Ingo Molnardd41f592007-07-09 18:51:59 +02001828 if (unlikely(!mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001829 next->active_mm = oldmm;
1830 atomic_inc(&oldmm->mm_count);
1831 enter_lazy_tlb(oldmm, next);
1832 } else
1833 switch_mm(oldmm, mm, next);
1834
Ingo Molnardd41f592007-07-09 18:51:59 +02001835 if (unlikely(!prev->mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836 prev->active_mm = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001837 rq->prev_mm = oldmm;
1838 }
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001839 /*
1840 * The runqueue lock will be released by the next
1841 * task (which is an invalid locking op, but in the case
1842 * of the scheduler it's an obvious special case), so we
1843 * do an early lockdep release here:
1844 */
1845#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07001846 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001847#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848
1849 /* Here we just switch the register state and the stack. */
1850 switch_to(prev, next, prev);
1851
Ingo Molnardd41f592007-07-09 18:51:59 +02001852 barrier();
1853 /*
1854 * this_rq must be evaluated again because prev may have moved
1855 * CPUs since it called schedule(), thus the 'rq' on its stack
1856 * frame will be invalid.
1857 */
1858 finish_task_switch(this_rq(), prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001859}
1860
1861/*
1862 * nr_running, nr_uninterruptible and nr_context_switches:
1863 *
1864 * externally visible scheduler statistics: current number of runnable
1865 * threads, current number of uninterruptible-sleeping threads, total
1866 * number of context switches performed since bootup.
1867 */
1868unsigned long nr_running(void)
1869{
1870 unsigned long i, sum = 0;
1871
1872 for_each_online_cpu(i)
1873 sum += cpu_rq(i)->nr_running;
1874
1875 return sum;
1876}
1877
1878unsigned long nr_uninterruptible(void)
1879{
1880 unsigned long i, sum = 0;
1881
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001882 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001883 sum += cpu_rq(i)->nr_uninterruptible;
1884
1885 /*
1886 * Since we read the counters locklessly, the sum might be slightly
1887 * inaccurate. Do not allow it to go below zero though:
1888 */
1889 if (unlikely((long)sum < 0))
1890 sum = 0;
1891
1892 return sum;
1893}
1894
1895unsigned long long nr_context_switches(void)
1896{
Steven Rostedtcc94abf2006-06-27 02:54:31 -07001897 int i;
1898 unsigned long long sum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001899
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001900 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001901 sum += cpu_rq(i)->nr_switches;
1902
1903 return sum;
1904}
1905
1906unsigned long nr_iowait(void)
1907{
1908 unsigned long i, sum = 0;
1909
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001910 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001911 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1912
1913 return sum;
1914}
1915
Jack Steinerdb1b1fe2006-03-31 02:31:21 -08001916unsigned long nr_active(void)
1917{
1918 unsigned long i, running = 0, uninterruptible = 0;
1919
1920 for_each_online_cpu(i) {
1921 running += cpu_rq(i)->nr_running;
1922 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1923 }
1924
1925 if (unlikely((long)uninterruptible < 0))
1926 uninterruptible = 0;
1927
1928 return running + uninterruptible;
1929}
1930
Linus Torvalds1da177e2005-04-16 15:20:36 -07001931/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001932 * Update rq->cpu_load[] statistics. This function is usually called every
1933 * scheduler tick (i.e. once per TICK_NSEC).
Ingo Molnar48f24c42006-07-03 00:25:40 -07001934 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001935static void update_cpu_load(struct rq *this_rq)
Ingo Molnar48f24c42006-07-03 00:25:40 -07001936{
Ingo Molnardd41f592007-07-09 18:51:59 +02001937 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1938 unsigned long total_load = this_rq->ls.load.weight;
1939 unsigned long this_load = total_load;
1940 struct load_stat *ls = &this_rq->ls;
1941 u64 now = __rq_clock(this_rq);
1942 int i, scale;
1943
1944 this_rq->nr_load_updates++;
1945 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1946 goto do_avg;
1947
1948 /* Update delta_fair/delta_exec fields first */
1949 update_curr_load(this_rq, now);
1950
1951 fair_delta64 = ls->delta_fair + 1;
1952 ls->delta_fair = 0;
1953
1954 exec_delta64 = ls->delta_exec + 1;
1955 ls->delta_exec = 0;
1956
1957 sample_interval64 = now - ls->load_update_last;
1958 ls->load_update_last = now;
1959
1960 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1961 sample_interval64 = TICK_NSEC;
1962
1963 if (exec_delta64 > sample_interval64)
1964 exec_delta64 = sample_interval64;
1965
1966 idle_delta64 = sample_interval64 - exec_delta64;
1967
1968 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1969 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1970
1971 this_load = (unsigned long)tmp64;
1972
1973do_avg:
1974
1975 /* Update our load: */
1976 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1977 unsigned long old_load, new_load;
1978
1979 /* scale is effectively 1 << i now, and >> i divides by scale */
1980
1981 old_load = this_rq->cpu_load[i];
1982 new_load = this_load;
1983
1984 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1985 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07001986}
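/*
 * The loop above is an exponential moving average: each cpu_load[i]
 * tracks this_load, with a larger index decaying more slowly (scale
 * is 2^i).  A minimal stand-alone sketch of the same arithmetic,
 * kept out of the build (illustrative only, not used anywhere):
 */
#if 0
static unsigned long cpu_load_ema(unsigned long old_load,
				  unsigned long new_load, int i)
{
	unsigned long scale = 1UL << i;

	/* same update as the loop in update_cpu_load() */
	return (old_load * (scale - 1) + new_load) >> i;
}
#endif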
1987
Ingo Molnardd41f592007-07-09 18:51:59 +02001988#ifdef CONFIG_SMP
1989
Ingo Molnar48f24c42006-07-03 00:25:40 -07001990/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001991 * double_rq_lock - safely lock two runqueues
1992 *
1993 * Note this does not disable interrupts like task_rq_lock,
1994 * you need to do so manually before calling.
1995 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001996static void double_rq_lock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001997 __acquires(rq1->lock)
1998 __acquires(rq2->lock)
1999{
Kirill Korotaev054b9102006-12-10 02:20:11 -08002000 BUG_ON(!irqs_disabled());
Linus Torvalds1da177e2005-04-16 15:20:36 -07002001 if (rq1 == rq2) {
2002 spin_lock(&rq1->lock);
2003 __acquire(rq2->lock); /* Fake it out ;) */
2004 } else {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002005 if (rq1 < rq2) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002006 spin_lock(&rq1->lock);
2007 spin_lock(&rq2->lock);
2008 } else {
2009 spin_lock(&rq2->lock);
2010 spin_lock(&rq1->lock);
2011 }
2012 }
2013}
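/*
 * The address comparison above is the usual lock-ordering trick: every
 * path that takes two rq locks takes the lower-addressed one first, so
 * two CPUs locking the same pair of runqueues can never deadlock by
 * each holding one lock while waiting for the other.  A minimal sketch
 * of the same pattern for an arbitrary pair of spinlocks (illustrative
 * only, kept out of the build):
 */
#if 0
static void double_lock_ordered(spinlock_t *a, spinlock_t *b)
{
	if (a == b) {
		spin_lock(a);
	} else if (a < b) {
		spin_lock(a);
		spin_lock(b);
	} else {
		spin_lock(b);
		spin_lock(a);
	}
}
#endif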
2014
2015/*
2016 * double_rq_unlock - safely unlock two runqueues
2017 *
2018 * Note this does not restore interrupts like task_rq_unlock,
2019 * you need to do so manually after calling.
2020 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002021static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002022 __releases(rq1->lock)
2023 __releases(rq2->lock)
2024{
2025 spin_unlock(&rq1->lock);
2026 if (rq1 != rq2)
2027 spin_unlock(&rq2->lock);
2028 else
2029 __release(rq2->lock);
2030}
2031
2032/*
2033 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2034 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002035static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002036 __releases(this_rq->lock)
2037 __acquires(busiest->lock)
2038 __acquires(this_rq->lock)
2039{
Kirill Korotaev054b9102006-12-10 02:20:11 -08002040 if (unlikely(!irqs_disabled())) {
2041 /* printk() doesn't work well under rq->lock */
2042 spin_unlock(&this_rq->lock);
2043 BUG_ON(1);
2044 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002045 if (unlikely(!spin_trylock(&busiest->lock))) {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002046 if (busiest < this_rq) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002047 spin_unlock(&this_rq->lock);
2048 spin_lock(&busiest->lock);
2049 spin_lock(&this_rq->lock);
2050 } else
2051 spin_lock(&busiest->lock);
2052 }
2053}
2054
2055/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002056 * If dest_cpu is allowed for this process, migrate the task to it.
2057 * If the task cannot simply be assigned to dest_cpu right away,
2058 * migrate_task() queues a migration request; we then wake the
2059 * migration thread and wait for the request to complete.
2060 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07002061static void sched_migrate_task(struct task_struct *p, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002062{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002063 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002064 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002065 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002066
2067 rq = task_rq_lock(p, &flags);
2068 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2069 || unlikely(cpu_is_offline(dest_cpu)))
2070 goto out;
2071
2072 /* force the process onto the specified CPU */
2073 if (migrate_task(p, dest_cpu, &req)) {
2074 /* Need to wait for migration thread (might exit: take ref). */
2075 struct task_struct *mt = rq->migration_thread;
Ingo Molnar36c8b582006-07-03 00:25:41 -07002076
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077 get_task_struct(mt);
2078 task_rq_unlock(rq, &flags);
2079 wake_up_process(mt);
2080 put_task_struct(mt);
2081 wait_for_completion(&req.done);
Ingo Molnar36c8b582006-07-03 00:25:41 -07002082
Linus Torvalds1da177e2005-04-16 15:20:36 -07002083 return;
2084 }
2085out:
2086 task_rq_unlock(rq, &flags);
2087}
2088
2089/*
Nick Piggin476d1392005-06-25 14:57:29 -07002090 * sched_exec - execve() is a valuable balancing opportunity, because at
2091 * this point the task has the smallest effective memory and cache footprint.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002092 */
2093void sched_exec(void)
2094{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002095 int new_cpu, this_cpu = get_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002096 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097 put_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002098 if (new_cpu != this_cpu)
2099 sched_migrate_task(current, new_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002100}
2101
2102/*
2103 * pull_task - move a task from a remote runqueue to the local runqueue.
2104 * Both runqueues must be locked.
2105 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002106static void pull_task(struct rq *src_rq, struct task_struct *p,
2107 struct rq *this_rq, int this_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002108{
Ingo Molnardd41f592007-07-09 18:51:59 +02002109 deactivate_task(src_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002110 set_task_cpu(p, this_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02002111 activate_task(this_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002112 /*
2113 * Note that idle threads have a prio of MAX_PRIO, so this test
2114 * is always true for them.
2115 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002116 check_preempt_curr(this_rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002117}
2118
2119/*
2120 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2121 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08002122static
Ingo Molnar70b97a72006-07-03 00:25:42 -07002123int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002124 struct sched_domain *sd, enum cpu_idle_type idle,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07002125 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002126{
2127 /*
2128 * We do not migrate tasks that are:
2129 * 1) running (obviously), or
2130 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2131 * 3) are cache-hot on their current CPU.
2132 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133 if (!cpu_isset(this_cpu, p->cpus_allowed))
2134 return 0;
Nick Piggin81026792005-06-25 14:57:07 -07002135 *all_pinned = 0;
2136
2137 if (task_running(rq, p))
2138 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002139
2140 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02002141 * Aggressive migration if too many balance attempts have failed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002142 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002143 if (sd->nr_balance_failed > sd->cache_nice_tries)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002144 return 1;
2145
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146 return 1;
2147}
2148
Ingo Molnardd41f592007-07-09 18:51:59 +02002149static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2150 unsigned long max_nr_move, unsigned long max_load_move,
2151 struct sched_domain *sd, enum cpu_idle_type idle,
2152 int *all_pinned, unsigned long *load_moved,
2153 int this_best_prio, int best_prio, int best_prio_seen,
2154 struct rq_iterator *iterator)
2155{
2156 int pulled = 0, pinned = 0, skip_for_load;
2157 struct task_struct *p;
2158 long rem_load_move = max_load_move;
2159
2160 if (max_nr_move == 0 || max_load_move == 0)
2161 goto out;
2162
2163 pinned = 1;
2164
2165 /*
2166 * Start the load-balancing iterator:
2167 */
2168 p = iterator->start(iterator->arg);
2169next:
2170 if (!p)
2171 goto out;
2172 /*
2173 * To help distribute high-priority tasks across CPUs, we don't
2174 * skip a task if it will be the highest-priority task (i.e. smallest
2175 * prio value) on its new queue, regardless of its load weight.
2176 */
2177 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2178 SCHED_LOAD_SCALE_FUZZ;
2179 if (skip_for_load && p->prio < this_best_prio)
2180 skip_for_load = !best_prio_seen && p->prio == best_prio;
2181 if (skip_for_load ||
2182 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2183
2184 best_prio_seen |= p->prio == best_prio;
2185 p = iterator->next(iterator->arg);
2186 goto next;
2187 }
2188
2189 pull_task(busiest, p, this_rq, this_cpu);
2190 pulled++;
2191 rem_load_move -= p->se.load.weight;
2192
2193 /*
2194 * We only want to steal up to the prescribed number of tasks
2195 * and the prescribed amount of weighted load.
2196 */
2197 if (pulled < max_nr_move && rem_load_move > 0) {
2198 if (p->prio < this_best_prio)
2199 this_best_prio = p->prio;
2200 p = iterator->next(iterator->arg);
2201 goto next;
2202 }
2203out:
2204 /*
2205 * Right now, this is the only place pull_task() is called,
2206 * so we can safely collect pull_task() stats here rather than
2207 * inside pull_task().
2208 */
2209 schedstat_add(sd, lb_gained[idle], pulled);
2210
2211 if (all_pinned)
2212 *all_pinned = pinned;
2213 *load_moved = max_load_move - rem_load_move;
2214 return pulled;
2215}
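/*
 * Note on the skip_for_load test above: a task is skipped when its
 * load weight is more than roughly twice the load we still want to
 * move (weight/2 > rem_load_move + fuzz), unless it would become the
 * highest-priority task on the destination queue.  This avoids
 * overshooting the requested imbalance while still spreading
 * high-priority tasks across CPUs.
 */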
Ingo Molnar48f24c42006-07-03 00:25:40 -07002216
Linus Torvalds1da177e2005-04-16 15:20:36 -07002217/*
Peter Williams2dd73a42006-06-27 02:54:34 -07002218 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2219 * load from busiest to this_rq, as part of a balancing operation within
2220 * "domain". Returns the number of tasks moved.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221 *
2222 * Called with both runqueues locked.
2223 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002224static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002225 unsigned long max_nr_move, unsigned long max_load_move,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002226 struct sched_domain *sd, enum cpu_idle_type idle,
Peter Williams2dd73a42006-06-27 02:54:34 -07002227 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228{
Ingo Molnardd41f592007-07-09 18:51:59 +02002229 struct sched_class *class = sched_class_highest;
2230 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2231 long rem_load_move = max_load_move;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002232
Ingo Molnardd41f592007-07-09 18:51:59 +02002233 do {
2234 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2235 max_nr_move, (unsigned long)rem_load_move,
2236 sd, idle, all_pinned, &load_moved);
2237 total_nr_moved += nr_moved;
2238 max_nr_move -= nr_moved;
2239 rem_load_move -= load_moved;
2240 class = class->next;
2241 } while (class && max_nr_move && rem_load_move > 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002242
Ingo Molnardd41f592007-07-09 18:51:59 +02002243 return total_nr_moved;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002244}
2245
2246/*
2247 * find_busiest_group finds and returns the busiest CPU group within the
Ingo Molnar48f24c42006-07-03 00:25:40 -07002248 * domain. It calculates and returns the amount of weighted load which
2249 * should be moved to restore balance via the imbalance parameter.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002250 */
2251static struct sched_group *
2252find_busiest_group(struct sched_domain *sd, int this_cpu,
Ingo Molnardd41f592007-07-09 18:51:59 +02002253 unsigned long *imbalance, enum cpu_idle_type idle,
2254 int *sd_idle, cpumask_t *cpus, int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002255{
2256 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2257 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002258 unsigned long max_pull;
Peter Williams2dd73a42006-06-27 02:54:34 -07002259 unsigned long busiest_load_per_task, busiest_nr_running;
2260 unsigned long this_load_per_task, this_nr_running;
Nick Piggin78979862005-06-25 14:57:13 -07002261 int load_idx;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002262#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2263 int power_savings_balance = 1;
2264 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2265 unsigned long min_nr_running = ULONG_MAX;
2266 struct sched_group *group_min = NULL, *group_leader = NULL;
2267#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268
2269 max_load = this_load = total_load = total_pwr = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002270 busiest_load_per_task = busiest_nr_running = 0;
2271 this_load_per_task = this_nr_running = 0;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002272 if (idle == CPU_NOT_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002273 load_idx = sd->busy_idx;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002274 else if (idle == CPU_NEWLY_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002275 load_idx = sd->newidle_idx;
2276 else
2277 load_idx = sd->idle_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002278
2279 do {
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002280 unsigned long load, group_capacity;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002281 int local_group;
2282 int i;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002283 unsigned int balance_cpu = -1, first_idle_cpu = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002284 unsigned long sum_nr_running, sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002285
2286 local_group = cpu_isset(this_cpu, group->cpumask);
2287
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002288 if (local_group)
2289 balance_cpu = first_cpu(group->cpumask);
2290
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291 /* Tally up the load of all CPUs in the group */
Peter Williams2dd73a42006-06-27 02:54:34 -07002292 sum_weighted_load = sum_nr_running = avg_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002293
2294 for_each_cpu_mask(i, group->cpumask) {
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002295 struct rq *rq;
2296
2297 if (!cpu_isset(i, *cpus))
2298 continue;
2299
2300 rq = cpu_rq(i);
Peter Williams2dd73a42006-06-27 02:54:34 -07002301
Nick Piggin5969fe02005-09-10 00:26:19 -07002302 if (*sd_idle && !idle_cpu(i))
2303 *sd_idle = 0;
2304
Linus Torvalds1da177e2005-04-16 15:20:36 -07002305 /* Bias balancing toward cpus of our domain */
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002306 if (local_group) {
2307 if (idle_cpu(i) && !first_idle_cpu) {
2308 first_idle_cpu = 1;
2309 balance_cpu = i;
2310 }
2311
Nick Piggina2000572006-02-10 01:51:02 -08002312 load = target_load(i, load_idx);
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002313 } else
Nick Piggina2000572006-02-10 01:51:02 -08002314 load = source_load(i, load_idx);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002315
2316 avg_load += load;
Peter Williams2dd73a42006-06-27 02:54:34 -07002317 sum_nr_running += rq->nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002318 sum_weighted_load += weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002319 }
2320
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002321 /*
2322 * Only the first idle cpu, or else the first cpu (the busiest) in
2323 * this sched group, is eligible to do load balancing at this domain
2324 * and above.
2325 */
2326 if (local_group && balance_cpu != this_cpu && balance) {
2327 *balance = 0;
2328 goto ret;
2329 }
2330
Linus Torvalds1da177e2005-04-16 15:20:36 -07002331 total_load += avg_load;
Eric Dumazet5517d862007-05-08 00:32:57 -07002332 total_pwr += group->__cpu_power;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002333
2334 /* Adjust by relative CPU power of the group */
Eric Dumazet5517d862007-05-08 00:32:57 -07002335 avg_load = sg_div_cpu_power(group,
2336 avg_load * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002337
Eric Dumazet5517d862007-05-08 00:32:57 -07002338 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002339
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340 if (local_group) {
2341 this_load = avg_load;
2342 this = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002343 this_nr_running = sum_nr_running;
2344 this_load_per_task = sum_weighted_load;
2345 } else if (avg_load > max_load &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002346 sum_nr_running > group_capacity) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347 max_load = avg_load;
2348 busiest = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002349 busiest_nr_running = sum_nr_running;
2350 busiest_load_per_task = sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002351 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002352
2353#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2354 /*
2355 * Busy processors will not participate in power savings
2356 * balance.
2357 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002358 if (idle == CPU_NOT_IDLE ||
2359 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2360 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002361
2362 /*
2363 * If the local group is idle or completely loaded, there is
2364 * no need to do power savings balance at this domain.
2365 */
2366 if (local_group && (this_nr_running >= group_capacity ||
2367 !this_nr_running))
2368 power_savings_balance = 0;
2369
Ingo Molnardd41f592007-07-09 18:51:59 +02002370 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002371 * If a group is already running at full capacity or idle,
2372 * don't include that group in power savings calculations
Ingo Molnardd41f592007-07-09 18:51:59 +02002373 */
2374 if (!power_savings_balance || sum_nr_running >= group_capacity
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002375 || !sum_nr_running)
Ingo Molnardd41f592007-07-09 18:51:59 +02002376 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002377
Ingo Molnardd41f592007-07-09 18:51:59 +02002378 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002379 * Calculate the group which has the least non-idle load.
Ingo Molnardd41f592007-07-09 18:51:59 +02002380 * This is the group from which we need to pick up load
2381 * to save power.
2382 */
2383 if ((sum_nr_running < min_nr_running) ||
2384 (sum_nr_running == min_nr_running &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002385 first_cpu(group->cpumask) <
2386 first_cpu(group_min->cpumask))) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002387 group_min = group;
2388 min_nr_running = sum_nr_running;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002389 min_load_per_task = sum_weighted_load /
2390 sum_nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002391 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002392
Ingo Molnardd41f592007-07-09 18:51:59 +02002393 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002394 * Calculate the group which is nearly at its
Ingo Molnardd41f592007-07-09 18:51:59 +02002395 * capacity but still has some room to pick up load
2396 * from another group and save more power.
2397 */
2398 if (sum_nr_running <= group_capacity - 1) {
2399 if (sum_nr_running > leader_nr_running ||
2400 (sum_nr_running == leader_nr_running &&
2401 first_cpu(group->cpumask) >
2402 first_cpu(group_leader->cpumask))) {
2403 group_leader = group;
2404 leader_nr_running = sum_nr_running;
2405 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07002406 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002407group_next:
2408#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002409 group = group->next;
2410 } while (group != sd->groups);
2411
Peter Williams2dd73a42006-06-27 02:54:34 -07002412 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002413 goto out_balanced;
2414
2415 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2416
2417 if (this_load >= avg_load ||
2418 100*max_load <= sd->imbalance_pct*this_load)
2419 goto out_balanced;
2420
Peter Williams2dd73a42006-06-27 02:54:34 -07002421 busiest_load_per_task /= busiest_nr_running;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002422 /*
2423 * We're trying to get all the cpus to the average_load, so we don't
2424 * want to push ourselves above the average load, nor do we wish to
2425 * reduce the max loaded cpu below the average load, as either of these
2426 * actions would just result in more rebalancing later, and ping-pong
2427 * tasks around. Thus we look for the minimum possible imbalance.
2428 * Negative imbalances (*we* are more loaded than anyone else) will
2429 * be counted as no imbalance for these purposes -- we can't fix that
2430 * by pulling tasks to us. Be careful of negative numbers as they'll
2431 * appear as very large values with unsigned longs.
2432 */
Peter Williams2dd73a42006-06-27 02:54:34 -07002433 if (max_load <= busiest_load_per_task)
2434 goto out_balanced;
2435
2436 /*
2437 * In the presence of smp nice balancing, certain scenarios can have
2438 * max load less than avg load (as we skip the groups at or below
2439 * their cpu_power while calculating max_load).
2440 */
2441 if (max_load < avg_load) {
2442 *imbalance = 0;
2443 goto small_imbalance;
2444 }
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002445
2446 /* Don't want to pull so many tasks that a group would go idle */
Peter Williams2dd73a42006-06-27 02:54:34 -07002447 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002448
Linus Torvalds1da177e2005-04-16 15:20:36 -07002449 /* How much load to actually move to equalise the imbalance */
Eric Dumazet5517d862007-05-08 00:32:57 -07002450 *imbalance = min(max_pull * busiest->__cpu_power,
2451 (avg_load - this_load) * this->__cpu_power)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002452 / SCHED_LOAD_SCALE;
2453
Peter Williams2dd73a42006-06-27 02:54:34 -07002454 /*
2455 * If *imbalance is less than the average load per runnable task,
2456 * there is no guarantee that any tasks will be moved, so we have to
2457 * think about bumping its value to force at least one task to be
2458 * moved.
2459 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002460 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07002461 unsigned long tmp, pwr_now, pwr_move;
Peter Williams2dd73a42006-06-27 02:54:34 -07002462 unsigned int imbn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002463
Peter Williams2dd73a42006-06-27 02:54:34 -07002464small_imbalance:
2465 pwr_move = pwr_now = 0;
2466 imbn = 2;
2467 if (this_nr_running) {
2468 this_load_per_task /= this_nr_running;
2469 if (busiest_load_per_task > this_load_per_task)
2470 imbn = 1;
2471 } else
2472 this_load_per_task = SCHED_LOAD_SCALE;
2473
Ingo Molnardd41f592007-07-09 18:51:59 +02002474 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2475 busiest_load_per_task * imbn) {
Peter Williams2dd73a42006-06-27 02:54:34 -07002476 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002477 return busiest;
2478 }
2479
2480 /*
2481 * OK, we don't have enough imbalance to justify moving tasks;
2482 * however, we may be able to increase total CPU power used by
2483 * moving them.
2484 */
2485
Eric Dumazet5517d862007-05-08 00:32:57 -07002486 pwr_now += busiest->__cpu_power *
2487 min(busiest_load_per_task, max_load);
2488 pwr_now += this->__cpu_power *
2489 min(this_load_per_task, this_load);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002490 pwr_now /= SCHED_LOAD_SCALE;
2491
2492 /* Amount of load we'd subtract */
Eric Dumazet5517d862007-05-08 00:32:57 -07002493 tmp = sg_div_cpu_power(busiest,
2494 busiest_load_per_task * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002495 if (max_load > tmp)
Eric Dumazet5517d862007-05-08 00:32:57 -07002496 pwr_move += busiest->__cpu_power *
Peter Williams2dd73a42006-06-27 02:54:34 -07002497 min(busiest_load_per_task, max_load - tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002498
2499 /* Amount of load we'd add */
Eric Dumazet5517d862007-05-08 00:32:57 -07002500 if (max_load * busiest->__cpu_power <
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08002501 busiest_load_per_task * SCHED_LOAD_SCALE)
Eric Dumazet5517d862007-05-08 00:32:57 -07002502 tmp = sg_div_cpu_power(this,
2503 max_load * busiest->__cpu_power);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504 else
Eric Dumazet5517d862007-05-08 00:32:57 -07002505 tmp = sg_div_cpu_power(this,
2506 busiest_load_per_task * SCHED_LOAD_SCALE);
2507 pwr_move += this->__cpu_power *
2508 min(this_load_per_task, this_load + tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002509 pwr_move /= SCHED_LOAD_SCALE;
2510
2511 /* Move if we gain throughput */
2512 if (pwr_move <= pwr_now)
2513 goto out_balanced;
2514
Peter Williams2dd73a42006-06-27 02:54:34 -07002515 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516 }
2517
Linus Torvalds1da177e2005-04-16 15:20:36 -07002518 return busiest;
2519
2520out_balanced:
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002521#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002522 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002523 goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002524
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002525 if (this == group_leader && group_leader != group_min) {
2526 *imbalance = min_load_per_task;
2527 return group_min;
2528 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002529#endif
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002530ret:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002531 *imbalance = 0;
2532 return NULL;
2533}
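/*
 * Illustrative numbers for the imbalance calculation above (made up,
 * assuming SCHED_LOAD_SCALE == 1024 and both groups at __cpu_power ==
 * 1024): avg_load == 1536, this_load == 1024, max_load == 2048 and
 * busiest_load_per_task == 1024 give max_pull == min(512, 1024) == 512,
 * so *imbalance == min(512*1024, 512*1024)/1024 == 512; moving 512
 * units of load brings both the busiest group and this group to the
 * average instead of ping-ponging the whole difference.
 */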
2534
2535/*
2536 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2537 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002538static struct rq *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002539find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002540 unsigned long imbalance, cpumask_t *cpus)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002541{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002542 struct rq *busiest = NULL, *rq;
Peter Williams2dd73a42006-06-27 02:54:34 -07002543 unsigned long max_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002544 int i;
2545
2546 for_each_cpu_mask(i, group->cpumask) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002547 unsigned long wl;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002548
2549 if (!cpu_isset(i, *cpus))
2550 continue;
2551
Ingo Molnar48f24c42006-07-03 00:25:40 -07002552 rq = cpu_rq(i);
Ingo Molnardd41f592007-07-09 18:51:59 +02002553 wl = weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002554
Ingo Molnardd41f592007-07-09 18:51:59 +02002555 if (rq->nr_running == 1 && wl > imbalance)
Peter Williams2dd73a42006-06-27 02:54:34 -07002556 continue;
2557
Ingo Molnardd41f592007-07-09 18:51:59 +02002558 if (wl > max_load) {
2559 max_load = wl;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002560 busiest = rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002561 }
2562 }
2563
2564 return busiest;
2565}
2566
2567/*
Nick Piggin77391d72005-06-25 14:57:30 -07002568 * Max backoff if we encounter pinned tasks. The exact value is pretty
2569 * arbitrary, as long as it is large enough.
2570 */
2571#define MAX_PINNED_INTERVAL 512
2572
Ingo Molnar48f24c42006-07-03 00:25:40 -07002573static inline unsigned long minus_1_or_zero(unsigned long n)
2574{
2575 return n > 0 ? n - 1 : 0;
2576}
2577
Nick Piggin77391d72005-06-25 14:57:30 -07002578/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002579 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2580 * tasks if there is an imbalance.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002581 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002582static int load_balance(int this_cpu, struct rq *this_rq,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002583 struct sched_domain *sd, enum cpu_idle_type idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002584 int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002585{
Ingo Molnar48f24c42006-07-03 00:25:40 -07002586 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002587 struct sched_group *group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002588 unsigned long imbalance;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002589 struct rq *busiest;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002590 cpumask_t cpus = CPU_MASK_ALL;
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002591 unsigned long flags;
Nick Piggin5969fe02005-09-10 00:26:19 -07002592
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002593 /*
2594 * When power savings policy is enabled for the parent domain, an idle
2595 * sibling can pick up load irrespective of busy siblings. In this case,
Ingo Molnardd41f592007-07-09 18:51:59 +02002596 * let the state of idle sibling percolate up as CPU_IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002597 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002598 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002599 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002600 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002601 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602
Linus Torvalds1da177e2005-04-16 15:20:36 -07002603 schedstat_inc(sd, lb_cnt[idle]);
2604
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002605redo:
2606 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002607 &cpus, balance);
2608
Chen, Kenneth W06066712006-12-10 02:20:35 -08002609 if (*balance == 0)
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002610 goto out_balanced;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002611
Linus Torvalds1da177e2005-04-16 15:20:36 -07002612 if (!group) {
2613 schedstat_inc(sd, lb_nobusyg[idle]);
2614 goto out_balanced;
2615 }
2616
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002617 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002618 if (!busiest) {
2619 schedstat_inc(sd, lb_nobusyq[idle]);
2620 goto out_balanced;
2621 }
2622
Nick Piggindb935db2005-06-25 14:57:11 -07002623 BUG_ON(busiest == this_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002624
2625 schedstat_add(sd, lb_imbalance[idle], imbalance);
2626
2627 nr_moved = 0;
2628 if (busiest->nr_running > 1) {
2629 /*
2630 * Attempt to move tasks. If find_busiest_group has found
2631 * an imbalance but busiest->nr_running <= 1, the group is
2632 * still unbalanced. nr_moved simply stays zero, so it is
2633 * correctly treated as an imbalance.
2634 */
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002635 local_irq_save(flags);
Nick Piggine17224b2005-09-10 00:26:18 -07002636 double_rq_lock(this_rq, busiest);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002637 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002638 minus_1_or_zero(busiest->nr_running),
2639 imbalance, sd, idle, &all_pinned);
Nick Piggine17224b2005-09-10 00:26:18 -07002640 double_rq_unlock(this_rq, busiest);
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002641 local_irq_restore(flags);
Nick Piggin81026792005-06-25 14:57:07 -07002642
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002643 /*
2644 * some other cpu did the load balance for us.
2645 */
2646 if (nr_moved && this_cpu != smp_processor_id())
2647 resched_cpu(this_cpu);
2648
Nick Piggin81026792005-06-25 14:57:07 -07002649 /* All tasks on this runqueue were pinned by CPU affinity */
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002650 if (unlikely(all_pinned)) {
2651 cpu_clear(cpu_of(busiest), cpus);
2652 if (!cpus_empty(cpus))
2653 goto redo;
Nick Piggin81026792005-06-25 14:57:07 -07002654 goto out_balanced;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002655 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002656 }
Nick Piggin81026792005-06-25 14:57:07 -07002657
Linus Torvalds1da177e2005-04-16 15:20:36 -07002658 if (!nr_moved) {
2659 schedstat_inc(sd, lb_failed[idle]);
2660 sd->nr_balance_failed++;
2661
2662 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002663
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002664 spin_lock_irqsave(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002665
2666 /* don't kick the migration_thread if the curr
2667 * task on the busiest cpu can't be moved to this_cpu
2668 */
2669 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002670 spin_unlock_irqrestore(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002671 all_pinned = 1;
2672 goto out_one_pinned;
2673 }
2674
Linus Torvalds1da177e2005-04-16 15:20:36 -07002675 if (!busiest->active_balance) {
2676 busiest->active_balance = 1;
2677 busiest->push_cpu = this_cpu;
Nick Piggin81026792005-06-25 14:57:07 -07002678 active_balance = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002679 }
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002680 spin_unlock_irqrestore(&busiest->lock, flags);
Nick Piggin81026792005-06-25 14:57:07 -07002681 if (active_balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002682 wake_up_process(busiest->migration_thread);
2683
2684 /*
2685 * We've kicked active balancing, reset the failure
2686 * counter.
2687 */
Nick Piggin39507452005-06-25 14:57:09 -07002688 sd->nr_balance_failed = sd->cache_nice_tries+1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002689 }
Nick Piggin81026792005-06-25 14:57:07 -07002690 } else
Linus Torvalds1da177e2005-04-16 15:20:36 -07002691 sd->nr_balance_failed = 0;
2692
Nick Piggin81026792005-06-25 14:57:07 -07002693 if (likely(!active_balance)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002694 /* We were unbalanced, so reset the balancing interval */
2695 sd->balance_interval = sd->min_interval;
Nick Piggin81026792005-06-25 14:57:07 -07002696 } else {
2697 /*
2698 * If we've begun active balancing, start to back off. This
2699 * case may not be covered by the all_pinned logic if there
2700 * is only 1 task on the busy runqueue (because we don't call
2701 * move_tasks).
2702 */
2703 if (sd->balance_interval < sd->max_interval)
2704 sd->balance_interval *= 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002705 }
2706
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002707 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002708 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002709 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002710 return nr_moved;
2711
2712out_balanced:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002713 schedstat_inc(sd, lb_balanced[idle]);
2714
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002715 sd->nr_balance_failed = 0;
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002716
2717out_one_pinned:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002718 /* tune up the balancing interval */
Nick Piggin77391d72005-06-25 14:57:30 -07002719 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2720 (sd->balance_interval < sd->max_interval))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002721 sd->balance_interval *= 2;
2722
Ingo Molnar48f24c42006-07-03 00:25:40 -07002723 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002724 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002725 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002726 return 0;
2727}
2728
2729/*
2730 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2731 * tasks if there is an imbalance.
2732 *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002733 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
Linus Torvalds1da177e2005-04-16 15:20:36 -07002734 * this_rq is locked.
2735 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07002736static int
Ingo Molnar70b97a72006-07-03 00:25:42 -07002737load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002738{
2739 struct sched_group *group;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002740 struct rq *busiest = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002741 unsigned long imbalance;
2742 int nr_moved = 0;
Nick Piggin5969fe02005-09-10 00:26:19 -07002743 int sd_idle = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002744 cpumask_t cpus = CPU_MASK_ALL;
Nick Piggin5969fe02005-09-10 00:26:19 -07002745
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002746 /*
2747 * When power savings policy is enabled for the parent domain, idle
2748 * sibling can pick up load irrespective of busy siblings. In this case,
2749 * let the state of idle sibling percolate up as IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002750 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002751 */
2752 if (sd->flags & SD_SHARE_CPUPOWER &&
2753 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002754 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002755
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002756 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002757redo:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002758 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002759 &sd_idle, &cpus, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002760 if (!group) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002761 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002762 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002763 }
2764
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002765 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002766 &cpus);
Nick Piggindb935db2005-06-25 14:57:11 -07002767 if (!busiest) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002768 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002769 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002770 }
2771
Nick Piggindb935db2005-06-25 14:57:11 -07002772 BUG_ON(busiest == this_rq);
2773
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002774 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002775
2776 nr_moved = 0;
2777 if (busiest->nr_running > 1) {
2778 /* Attempt to move tasks */
2779 double_lock_balance(this_rq, busiest);
2780 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002781 minus_1_or_zero(busiest->nr_running),
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002782 imbalance, sd, CPU_NEWLY_IDLE, NULL);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002783 spin_unlock(&busiest->lock);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002784
2785 if (!nr_moved) {
2786 cpu_clear(cpu_of(busiest), cpus);
2787 if (!cpus_empty(cpus))
2788 goto redo;
2789 }
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002790 }
2791
Nick Piggin5969fe02005-09-10 00:26:19 -07002792 if (!nr_moved) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002793 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002794 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2795 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002796 return -1;
2797 } else
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002798 sd->nr_balance_failed = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002799
Linus Torvalds1da177e2005-04-16 15:20:36 -07002800 return nr_moved;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002801
2802out_balanced:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002803 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
Ingo Molnar48f24c42006-07-03 00:25:40 -07002804 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002805 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002806 return -1;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002807 sd->nr_balance_failed = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002808
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002809 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810}
2811
2812/*
2813 * idle_balance is called by schedule() if this_cpu is about to become
2814 * idle. Attempts to pull tasks from other CPUs.
2815 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002816static void idle_balance(int this_cpu, struct rq *this_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817{
2818 struct sched_domain *sd;
Ingo Molnardd41f592007-07-09 18:51:59 +02002819 int pulled_task = -1;
2820 unsigned long next_balance = jiffies + HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002821
2822 for_each_domain(this_cpu, sd) {
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002823 unsigned long interval;
2824
2825 if (!(sd->flags & SD_LOAD_BALANCE))
2826 continue;
2827
2828 if (sd->flags & SD_BALANCE_NEWIDLE)
Ingo Molnar48f24c42006-07-03 00:25:40 -07002829			/* If we've pulled tasks over, stop searching: */
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002830 pulled_task = load_balance_newidle(this_cpu,
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002831 this_rq, sd);
2832
2833 interval = msecs_to_jiffies(sd->balance_interval);
2834 if (time_after(next_balance, sd->last_balance + interval))
2835 next_balance = sd->last_balance + interval;
2836 if (pulled_task)
2837 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002838 }
Ingo Molnardd41f592007-07-09 18:51:59 +02002839 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002840 /*
2841 * We are going idle. next_balance may be set based on
2842 * a busy processor. So reset next_balance.
2843 */
2844 this_rq->next_balance = next_balance;
Ingo Molnardd41f592007-07-09 18:51:59 +02002845 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002846}
2847
2848/*
2849 * active_load_balance is run by migration threads. It pushes running tasks
2850 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2851 * running on each physical CPU where possible, and avoids physical /
2852 * logical imbalances.
2853 *
2854 * Called with busiest_rq locked.
2855 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002856static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002857{
Nick Piggin39507452005-06-25 14:57:09 -07002858 int target_cpu = busiest_rq->push_cpu;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002859 struct sched_domain *sd;
2860 struct rq *target_rq;
Nick Piggin39507452005-06-25 14:57:09 -07002861
Ingo Molnar48f24c42006-07-03 00:25:40 -07002862 /* Is there any task to move? */
Nick Piggin39507452005-06-25 14:57:09 -07002863 if (busiest_rq->nr_running <= 1)
Nick Piggin39507452005-06-25 14:57:09 -07002864 return;
2865
2866 target_rq = cpu_rq(target_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002867
2868 /*
Nick Piggin39507452005-06-25 14:57:09 -07002869 * This condition is "impossible", if it occurs
2870 * we need to fix it. Originally reported by
2871 * Bjorn Helgaas on a 128-cpu setup.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002872 */
Nick Piggin39507452005-06-25 14:57:09 -07002873 BUG_ON(busiest_rq == target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002874
Nick Piggin39507452005-06-25 14:57:09 -07002875 /* move a task from busiest_rq to target_rq */
2876 double_lock_balance(busiest_rq, target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002877
Nick Piggin39507452005-06-25 14:57:09 -07002878 /* Search for an sd spanning us and the target CPU. */
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002879 for_each_domain(target_cpu, sd) {
Nick Piggin39507452005-06-25 14:57:09 -07002880 if ((sd->flags & SD_LOAD_BALANCE) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07002881 cpu_isset(busiest_cpu, sd->span))
Nick Piggin39507452005-06-25 14:57:09 -07002882 break;
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002883 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002884
Ingo Molnar48f24c42006-07-03 00:25:40 -07002885 if (likely(sd)) {
2886 schedstat_inc(sd, alb_cnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002887
Ingo Molnar48f24c42006-07-03 00:25:40 -07002888 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002889 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002890 NULL))
2891 schedstat_inc(sd, alb_pushed);
2892 else
2893 schedstat_inc(sd, alb_failed);
2894 }
Nick Piggin39507452005-06-25 14:57:09 -07002895 spin_unlock(&target_rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002896}
2897
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002898#ifdef CONFIG_NO_HZ
2899static struct {
2900 atomic_t load_balancer;
2901 cpumask_t cpu_mask;
2902} nohz ____cacheline_aligned = {
2903 .load_balancer = ATOMIC_INIT(-1),
2904 .cpu_mask = CPU_MASK_NONE,
2905};
2906
Christoph Lameter7835b982006-12-10 02:20:22 -08002907/*
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002908 * This routine will try to nominate the ilb (idle load balancing)
2909 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2910 * load balancing on behalf of all those cpus. If all the cpus in the system
2911 * go into this tickless mode, then there will be no ilb owner (as there is
2912 * no need for one) and all the cpus will sleep till the next wakeup event
2913 * arrives...
Christoph Lameter7835b982006-12-10 02:20:22 -08002914 *
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002915 * For the ilb owner, tick is not stopped. And this tick will be used
2916 * for idle load balancing. ilb owner will still be part of
2917 * nohz.cpu_mask.
2918 *
2919 * While stopping the tick, this cpu will become the ilb owner if there
2920 * is no other owner. And will be the owner till that cpu becomes busy
2921 * or if all cpus in the system stop their ticks at which point
2922 * there is no need for ilb owner.
2923 *
2924 * When the ilb owner becomes busy, it nominates another owner during the
2925 * next busy scheduler_tick()
2926 */
2927int select_nohz_load_balancer(int stop_tick)
2928{
2929 int cpu = smp_processor_id();
2930
2931 if (stop_tick) {
2932 cpu_set(cpu, nohz.cpu_mask);
2933 cpu_rq(cpu)->in_nohz_recently = 1;
2934
2935 /*
2936 * If we are going offline and still the leader, give up!
2937 */
2938 if (cpu_is_offline(cpu) &&
2939 atomic_read(&nohz.load_balancer) == cpu) {
2940 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2941 BUG();
2942 return 0;
2943 }
2944
2945 /* time for ilb owner also to sleep */
2946 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2947 if (atomic_read(&nohz.load_balancer) == cpu)
2948 atomic_set(&nohz.load_balancer, -1);
2949 return 0;
2950 }
2951
2952 if (atomic_read(&nohz.load_balancer) == -1) {
2953 /* make me the ilb owner */
2954 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2955 return 1;
2956 } else if (atomic_read(&nohz.load_balancer) == cpu)
2957 return 1;
2958 } else {
2959 if (!cpu_isset(cpu, nohz.cpu_mask))
2960 return 0;
2961
2962 cpu_clear(cpu, nohz.cpu_mask);
2963
2964 if (atomic_read(&nohz.load_balancer) == cpu)
2965 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2966 BUG();
2967 }
2968 return 0;
2969}
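/*
 * Illustrative sketch (not part of the scheduler): how a NO_HZ tick-stop
 * path is expected to consult select_nohz_load_balancer(). The real
 * caller lives in the tick/timer code and may differ; both helper names
 * below are hypothetical and only select_nohz_load_balancer() is real.
 */
static inline void example_nohz_tick_stop(void)
{
	/* About to stop this CPU's tick... */
	if (select_nohz_load_balancer(1)) {
		/*
		 * We are (or just became) the idle load balancing owner:
		 * keep the tick running so we can balance on behalf of
		 * the other tickless CPUs.
		 */
		return;
	}
	/* ...otherwise the tick can be stopped here. */
}

static inline void example_nohz_tick_restart(void)
{
	/* The CPU is busy again: give up any ilb ownership we held. */
	select_nohz_load_balancer(0);
}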
2970#endif
2971
2972static DEFINE_SPINLOCK(balancing);
2973
2974/*
Christoph Lameter7835b982006-12-10 02:20:22 -08002975 * It checks each scheduling domain to see if it is due to be balanced,
2976 * and initiates a balancing operation if so.
2977 *
2978 * Balancing parameters are set up in arch_init_sched_domains.
2979 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002980static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
Christoph Lameter7835b982006-12-10 02:20:22 -08002981{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002982 int balance = 1;
2983 struct rq *rq = cpu_rq(cpu);
Christoph Lameter7835b982006-12-10 02:20:22 -08002984 unsigned long interval;
2985 struct sched_domain *sd;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002986 /* Earliest time when we have to do rebalance again */
Christoph Lameterc9819f42006-12-10 02:20:25 -08002987 unsigned long next_balance = jiffies + 60*HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002988
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002989 for_each_domain(cpu, sd) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002990 if (!(sd->flags & SD_LOAD_BALANCE))
2991 continue;
2992
2993 interval = sd->balance_interval;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002994 if (idle != CPU_IDLE)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002995 interval *= sd->busy_factor;
2996
2997 /* scale ms to jiffies */
2998 interval = msecs_to_jiffies(interval);
2999 if (unlikely(!interval))
3000 interval = 1;
Ingo Molnardd41f592007-07-09 18:51:59 +02003001 if (interval > HZ*NR_CPUS/10)
3002 interval = HZ*NR_CPUS/10;
3003
Linus Torvalds1da177e2005-04-16 15:20:36 -07003004
Christoph Lameter08c183f2006-12-10 02:20:29 -08003005 if (sd->flags & SD_SERIALIZE) {
3006 if (!spin_trylock(&balancing))
3007 goto out;
3008 }
3009
Christoph Lameterc9819f42006-12-10 02:20:25 -08003010 if (time_after_eq(jiffies, sd->last_balance + interval)) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003011 if (load_balance(cpu, rq, sd, idle, &balance)) {
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07003012 /*
3013 * We've pulled tasks over so either we're no
Nick Piggin5969fe02005-09-10 00:26:19 -07003014 * longer idle, or one of our SMT siblings is
3015 * not idle.
3016 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02003017 idle = CPU_NOT_IDLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003018 }
Christoph Lameter1bd77f22006-12-10 02:20:27 -08003019 sd->last_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003020 }
Christoph Lameter08c183f2006-12-10 02:20:29 -08003021 if (sd->flags & SD_SERIALIZE)
3022 spin_unlock(&balancing);
3023out:
Christoph Lameterc9819f42006-12-10 02:20:25 -08003024 if (time_after(next_balance, sd->last_balance + interval))
3025 next_balance = sd->last_balance + interval;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08003026
3027 /*
3028 * Stop the load balance at this level. There is another
3029 * CPU in our sched group which is doing load balancing more
3030 * actively.
3031 */
3032 if (!balance)
3033 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003034 }
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003035 rq->next_balance = next_balance;
3036}
3037
3038/*
3039 * run_rebalance_domains is triggered when needed from the scheduler tick.
3040 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3041 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3042 */
3043static void run_rebalance_domains(struct softirq_action *h)
3044{
Ingo Molnardd41f592007-07-09 18:51:59 +02003045 int this_cpu = smp_processor_id();
3046 struct rq *this_rq = cpu_rq(this_cpu);
3047 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3048 CPU_IDLE : CPU_NOT_IDLE;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003049
Ingo Molnardd41f592007-07-09 18:51:59 +02003050 rebalance_domains(this_cpu, idle);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003051
3052#ifdef CONFIG_NO_HZ
3053 /*
3054 * If this cpu is the owner for idle load balancing, then do the
3055 * balancing on behalf of the other idle cpus whose ticks are
3056 * stopped.
3057 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003058 if (this_rq->idle_at_tick &&
3059 atomic_read(&nohz.load_balancer) == this_cpu) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003060 cpumask_t cpus = nohz.cpu_mask;
3061 struct rq *rq;
3062 int balance_cpu;
3063
Ingo Molnardd41f592007-07-09 18:51:59 +02003064 cpu_clear(this_cpu, cpus);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003065 for_each_cpu_mask(balance_cpu, cpus) {
3066 /*
3067 * If this cpu gets work to do, stop the load balancing
3068 * work being done for other cpus. Next load
3069 * balancing owner will pick it up.
3070 */
3071 if (need_resched())
3072 break;
3073
Ingo Molnardd41f592007-07-09 18:51:59 +02003074			rebalance_domains(balance_cpu, CPU_IDLE);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003075
3076 rq = cpu_rq(balance_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003077 if (time_after(this_rq->next_balance, rq->next_balance))
3078 this_rq->next_balance = rq->next_balance;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003079 }
3080 }
3081#endif
3082}
3083
3084/*
3085 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3086 *
3087 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3088 * idle load balancing owner or decide to stop the periodic load balancing,
3089 * if the whole system is idle.
3090 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003091static inline void trigger_load_balance(struct rq *rq, int cpu)
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003092{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003093#ifdef CONFIG_NO_HZ
3094 /*
3095 * If we were in the nohz mode recently and busy at the current
3096 * scheduler tick, then check if we need to nominate new idle
3097 * load balancer.
3098 */
3099 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3100 rq->in_nohz_recently = 0;
3101
3102 if (atomic_read(&nohz.load_balancer) == cpu) {
3103 cpu_clear(cpu, nohz.cpu_mask);
3104 atomic_set(&nohz.load_balancer, -1);
3105 }
3106
3107 if (atomic_read(&nohz.load_balancer) == -1) {
3108 /*
3109 * simple selection for now: Nominate the
3110 * first cpu in the nohz list to be the next
3111 * ilb owner.
3112 *
3113 * TBD: Traverse the sched domains and nominate
3114 * the nearest cpu in the nohz.cpu_mask.
3115 */
3116 int ilb = first_cpu(nohz.cpu_mask);
3117
3118 if (ilb != NR_CPUS)
3119 resched_cpu(ilb);
3120 }
3121 }
3122
3123 /*
3124 * If this cpu is idle and doing idle load balancing for all the
3125 * cpus with ticks stopped, is it time for that to stop?
3126 */
3127 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3128 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3129 resched_cpu(cpu);
3130 return;
3131 }
3132
3133 /*
3134 * If this cpu is idle and the idle load balancing is done by
3135	 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3136 */
3137 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3138 cpu_isset(cpu, nohz.cpu_mask))
3139 return;
3140#endif
3141 if (time_after_eq(jiffies, rq->next_balance))
3142 raise_softirq(SCHED_SOFTIRQ);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003143}
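/*
 * Illustrative sketch: the SCHED_SOFTIRQ raised above still needs a
 * handler registered somewhere; in this kernel that happens at init
 * time elsewhere in this file. The helper below is hypothetical and
 * only shown to make the trigger/handler pairing explicit (it assumes
 * the three-argument open_softirq() of this era).
 */
static inline void example_register_sched_softirq(void)
{
	/* run_rebalance_domains() then runs in softirq context. */
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
}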
Ingo Molnardd41f592007-07-09 18:51:59 +02003144
3145#else /* CONFIG_SMP */
3146
Linus Torvalds1da177e2005-04-16 15:20:36 -07003147/*
3148 * on UP we do not need to balance between CPUs:
3149 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07003150static inline void idle_balance(int cpu, struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003151{
3152}
Ingo Molnardd41f592007-07-09 18:51:59 +02003153
3154/* Avoid "used but not defined" warning on UP */
3155static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3156 unsigned long max_nr_move, unsigned long max_load_move,
3157 struct sched_domain *sd, enum cpu_idle_type idle,
3158 int *all_pinned, unsigned long *load_moved,
3159 int this_best_prio, int best_prio, int best_prio_seen,
3160 struct rq_iterator *iterator)
3161{
3162 *load_moved = 0;
3163
3164 return 0;
3165}
3166
Linus Torvalds1da177e2005-04-16 15:20:36 -07003167#endif
3168
Linus Torvalds1da177e2005-04-16 15:20:36 -07003169DEFINE_PER_CPU(struct kernel_stat, kstat);
3170
3171EXPORT_PER_CPU_SYMBOL(kstat);
3172
3173/*
Ingo Molnar41b86e92007-07-09 18:51:58 +02003174 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3175 * that have not yet been banked in case the task is currently running.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003176 */
Ingo Molnar41b86e92007-07-09 18:51:58 +02003177unsigned long long task_sched_runtime(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003178{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003179 unsigned long flags;
Ingo Molnar41b86e92007-07-09 18:51:58 +02003180 u64 ns, delta_exec;
3181 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003182
Ingo Molnar41b86e92007-07-09 18:51:58 +02003183 rq = task_rq_lock(p, &flags);
3184 ns = p->se.sum_exec_runtime;
3185 if (rq->curr == p) {
3186 delta_exec = rq_clock(rq) - p->se.exec_start;
3187 if ((s64)delta_exec > 0)
3188 ns += delta_exec;
3189 }
3190 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07003191
Linus Torvalds1da177e2005-04-16 15:20:36 -07003192 return ns;
3193}
3194
3195/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07003196 * Account user cpu time to a process.
3197 * @p: the process that the cpu time gets accounted to
3198 * @hardirq_offset: the offset to subtract from hardirq_count()
3199 * @cputime: the cpu time spent in user space since the last update
3200 */
3201void account_user_time(struct task_struct *p, cputime_t cputime)
3202{
3203 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3204 cputime64_t tmp;
3205
3206 p->utime = cputime_add(p->utime, cputime);
3207
3208 /* Add user time to cpustat. */
3209 tmp = cputime_to_cputime64(cputime);
3210 if (TASK_NICE(p) > 0)
3211 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3212 else
3213 cpustat->user = cputime64_add(cpustat->user, tmp);
3214}
3215
3216/*
3217 * Account system cpu time to a process.
3218 * @p: the process that the cpu time gets accounted to
3219 * @hardirq_offset: the offset to subtract from hardirq_count()
3220 * @cputime: the cpu time spent in kernel space since the last update
3221 */
3222void account_system_time(struct task_struct *p, int hardirq_offset,
3223 cputime_t cputime)
3224{
3225 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003226 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003227 cputime64_t tmp;
3228
3229 p->stime = cputime_add(p->stime, cputime);
3230
3231 /* Add system time to cpustat. */
3232 tmp = cputime_to_cputime64(cputime);
3233 if (hardirq_count() - hardirq_offset)
3234 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3235 else if (softirq_count())
3236 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3237 else if (p != rq->idle)
3238 cpustat->system = cputime64_add(cpustat->system, tmp);
3239 else if (atomic_read(&rq->nr_iowait) > 0)
3240 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3241 else
3242 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3243 /* Account for system time used */
3244 acct_update_integrals(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003245}
3246
3247/*
3248 * Account for involuntary wait time.
3249 * @p: the process from which the cpu time has been stolen
3250 * @steal: the cpu time spent in involuntary wait
3251 */
3252void account_steal_time(struct task_struct *p, cputime_t steal)
3253{
3254 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3255 cputime64_t tmp = cputime_to_cputime64(steal);
Ingo Molnar70b97a72006-07-03 00:25:42 -07003256 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003257
3258 if (p == rq->idle) {
3259 p->stime = cputime_add(p->stime, steal);
3260 if (atomic_read(&rq->nr_iowait) > 0)
3261 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3262 else
3263 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3264 } else
3265 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3266}
3267
Christoph Lameter7835b982006-12-10 02:20:22 -08003268/*
3269 * This function gets called by the timer code, with HZ frequency.
3270 * We call it with interrupts disabled.
3271 *
3272 * It also gets called by the fork code, when changing the parent's
3273 * timeslices.
3274 */
3275void scheduler_tick(void)
3276{
Christoph Lameter7835b982006-12-10 02:20:22 -08003277 int cpu = smp_processor_id();
3278 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003279 struct task_struct *curr = rq->curr;
Christoph Lameter7835b982006-12-10 02:20:22 -08003280
Ingo Molnardd41f592007-07-09 18:51:59 +02003281 spin_lock(&rq->lock);
3282 if (curr != rq->idle) /* FIXME: needed? */
3283 curr->sched_class->task_tick(rq, curr);
3284 update_cpu_load(rq);
3285 spin_unlock(&rq->lock);
3286
Christoph Lametere418e1c2006-12-10 02:20:23 -08003287#ifdef CONFIG_SMP
Ingo Molnardd41f592007-07-09 18:51:59 +02003288 rq->idle_at_tick = idle_cpu(cpu);
3289 trigger_load_balance(rq, cpu);
Christoph Lametere418e1c2006-12-10 02:20:23 -08003290#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003291}
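/*
 * Illustrative sketch (simplified, hedged): how the periodic tick
 * typically reaches the accounting hooks and scheduler_tick(). The real
 * code lives in the timer code as update_process_times(); this copy only
 * shows the ordering and is not a drop-in replacement.
 */
static inline void example_tick_path(int user_tick)
{
	struct task_struct *p = current;

	/* Charge this tick to user or system time... */
	if (user_tick)
		account_user_time(p, jiffies_to_cputime(1));
	else
		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));

	/* ...then let the scheduler do its per-tick work. */
	scheduler_tick();
}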
3292
Linus Torvalds1da177e2005-04-16 15:20:36 -07003293#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3294
3295void fastcall add_preempt_count(int val)
3296{
3297 /*
3298 * Underflow?
3299 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003300 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3301 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003302 preempt_count() += val;
3303 /*
3304 * Spinlock count overflowing soon?
3305 */
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08003306 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3307 PREEMPT_MASK - 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003308}
3309EXPORT_SYMBOL(add_preempt_count);
3310
3311void fastcall sub_preempt_count(int val)
3312{
3313 /*
3314 * Underflow?
3315 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003316 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3317 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003318 /*
3319 * Is the spinlock portion underflowing?
3320 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003321 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3322 !(preempt_count() & PREEMPT_MASK)))
3323 return;
3324
Linus Torvalds1da177e2005-04-16 15:20:36 -07003325 preempt_count() -= val;
3326}
3327EXPORT_SYMBOL(sub_preempt_count);
3328
3329#endif
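/*
 * Illustrative sketch: the add/sub_preempt_count() pair above backs the
 * usual preempt_disable()/preempt_enable() nesting, e.g. when a stable
 * CPU number is needed. The helper name is hypothetical.
 */
static inline int example_this_cpu_nr(void)
{
	int cpu;

	preempt_disable();		/* maps to add_preempt_count(1) under DEBUG_PREEMPT */
	cpu = smp_processor_id();	/* safe: no migration until we re-enable */
	preempt_enable();		/* maps to sub_preempt_count(1), may reschedule */

	return cpu;			/* note: may already be stale for the caller */
}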
3330
3331/*
Ingo Molnardd41f592007-07-09 18:51:59 +02003332 * Print scheduling while atomic bug:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003333 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003334static noinline void __schedule_bug(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003335{
Ingo Molnardd41f592007-07-09 18:51:59 +02003336 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3337 prev->comm, preempt_count(), prev->pid);
3338 debug_show_held_locks(prev);
3339 if (irqs_disabled())
3340 print_irqtrace_events(prev);
3341 dump_stack();
3342}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003343
Ingo Molnardd41f592007-07-09 18:51:59 +02003344/*
3345 * Various schedule()-time debugging checks and statistics:
3346 */
3347static inline void schedule_debug(struct task_struct *prev)
3348{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003349 /*
3350 * Test if we are atomic. Since do_exit() needs to call into
3351 * schedule() atomically, we ignore that path for now.
3352 * Otherwise, whine if we are scheduling when we should not be.
3353 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003354 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3355 __schedule_bug(prev);
3356
Linus Torvalds1da177e2005-04-16 15:20:36 -07003357 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3358
Ingo Molnardd41f592007-07-09 18:51:59 +02003359 schedstat_inc(this_rq(), sched_cnt);
3360}
3361
3362/*
3363 * Pick up the highest-prio task:
3364 */
3365static inline struct task_struct *
3366pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3367{
3368 struct sched_class *class;
3369 struct task_struct *p;
3370
3371 /*
3372 * Optimization: we know that if all tasks are in
3373 * the fair class we can call that function directly:
3374 */
3375 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3376 p = fair_sched_class.pick_next_task(rq, now);
3377 if (likely(p))
3378 return p;
3379 }
3380
3381 class = sched_class_highest;
3382 for ( ; ; ) {
3383 p = class->pick_next_task(rq, now);
3384 if (p)
3385 return p;
3386 /*
3387 * Will never be NULL as the idle class always
3388 * returns a non-NULL p:
3389 */
3390 class = class->next;
3391 }
3392}
3393
3394/*
3395 * schedule() is the main scheduler function.
3396 */
3397asmlinkage void __sched schedule(void)
3398{
3399 struct task_struct *prev, *next;
3400 long *switch_count;
3401 struct rq *rq;
3402 u64 now;
3403 int cpu;
3404
Linus Torvalds1da177e2005-04-16 15:20:36 -07003405need_resched:
3406 preempt_disable();
Ingo Molnardd41f592007-07-09 18:51:59 +02003407 cpu = smp_processor_id();
3408 rq = cpu_rq(cpu);
3409 rcu_qsctr_inc(cpu);
3410 prev = rq->curr;
3411 switch_count = &prev->nivcsw;
3412
Linus Torvalds1da177e2005-04-16 15:20:36 -07003413 release_kernel_lock(prev);
3414need_resched_nonpreemptible:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003415
Ingo Molnardd41f592007-07-09 18:51:59 +02003416 schedule_debug(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003417
3418 spin_lock_irq(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003419 clear_tsk_need_resched(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003420
Ingo Molnardd41f592007-07-09 18:51:59 +02003421 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3422 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3423 unlikely(signal_pending(prev)))) {
3424 prev->state = TASK_RUNNING;
3425 } else {
3426 deactivate_task(rq, prev, 1);
3427 }
3428 switch_count = &prev->nvcsw;
3429 }
3430
3431 if (unlikely(!rq->nr_running))
3432 idle_balance(cpu, rq);
3433
3434 now = __rq_clock(rq);
3435 prev->sched_class->put_prev_task(rq, prev, now);
3436 next = pick_next_task(rq, prev, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003437
3438 sched_info_switch(prev, next);
Ingo Molnardd41f592007-07-09 18:51:59 +02003439
Linus Torvalds1da177e2005-04-16 15:20:36 -07003440 if (likely(prev != next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003441 rq->nr_switches++;
3442 rq->curr = next;
3443 ++*switch_count;
3444
Ingo Molnardd41f592007-07-09 18:51:59 +02003445 context_switch(rq, prev, next); /* unlocks the rq */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003446 } else
3447 spin_unlock_irq(&rq->lock);
3448
Ingo Molnardd41f592007-07-09 18:51:59 +02003449 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3450 cpu = smp_processor_id();
3451 rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003452 goto need_resched_nonpreemptible;
Ingo Molnardd41f592007-07-09 18:51:59 +02003453 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003454 preempt_enable_no_resched();
3455 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3456 goto need_resched;
3457}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003458EXPORT_SYMBOL(schedule);
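/*
 * Illustrative sketch: the canonical way kernel code sleeps via
 * schedule(). 'my_event_pending' and the helper are hypothetical; the
 * waker is expected to set the flag and then call wake_up_process() on
 * this task, which makes the check-then-sleep sequence race-free.
 */
static inline void example_wait_for_event(int *my_event_pending)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (*my_event_pending || signal_pending(current))
			break;
		schedule();			/* actually give up the CPU */
	}
	__set_current_state(TASK_RUNNING);
}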
3459
3460#ifdef CONFIG_PREEMPT
3461/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003462 * this is the entry point to schedule() from in-kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003463 * off of preempt_enable. Preemption off the return-from-interrupt
3464 * path is handled separately by preempt_schedule_irq() below.
3465 */
3466asmlinkage void __sched preempt_schedule(void)
3467{
3468 struct thread_info *ti = current_thread_info();
3469#ifdef CONFIG_PREEMPT_BKL
3470 struct task_struct *task = current;
3471 int saved_lock_depth;
3472#endif
3473 /*
3474 * If there is a non-zero preempt_count or interrupts are disabled,
3475 * we do not want to preempt the current task. Just return..
3476 */
Nick Pigginbeed33a2006-10-11 01:21:52 -07003477 if (likely(ti->preempt_count || irqs_disabled()))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003478 return;
3479
3480need_resched:
3481 add_preempt_count(PREEMPT_ACTIVE);
3482 /*
3483 * We keep the big kernel semaphore locked, but we
3484	 * clear ->lock_depth so that schedule() doesn't
3485 * auto-release the semaphore:
3486 */
3487#ifdef CONFIG_PREEMPT_BKL
3488 saved_lock_depth = task->lock_depth;
3489 task->lock_depth = -1;
3490#endif
3491 schedule();
3492#ifdef CONFIG_PREEMPT_BKL
3493 task->lock_depth = saved_lock_depth;
3494#endif
3495 sub_preempt_count(PREEMPT_ACTIVE);
3496
3497 /* we could miss a preemption opportunity between schedule and now */
3498 barrier();
3499 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3500 goto need_resched;
3501}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003502EXPORT_SYMBOL(preempt_schedule);
3503
3504/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003505 * this is the entry point to schedule() from kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003506 * off of irq context.
3507 * Note that this is called and returns with irqs disabled. This will
3508 * protect us against recursive calling from irq.
3509 */
3510asmlinkage void __sched preempt_schedule_irq(void)
3511{
3512 struct thread_info *ti = current_thread_info();
3513#ifdef CONFIG_PREEMPT_BKL
3514 struct task_struct *task = current;
3515 int saved_lock_depth;
3516#endif
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003517 /* Catch callers which need to be fixed */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003518 BUG_ON(ti->preempt_count || !irqs_disabled());
3519
3520need_resched:
3521 add_preempt_count(PREEMPT_ACTIVE);
3522 /*
3523 * We keep the big kernel semaphore locked, but we
3524	 * clear ->lock_depth so that schedule() doesn't
3525 * auto-release the semaphore:
3526 */
3527#ifdef CONFIG_PREEMPT_BKL
3528 saved_lock_depth = task->lock_depth;
3529 task->lock_depth = -1;
3530#endif
3531 local_irq_enable();
3532 schedule();
3533 local_irq_disable();
3534#ifdef CONFIG_PREEMPT_BKL
3535 task->lock_depth = saved_lock_depth;
3536#endif
3537 sub_preempt_count(PREEMPT_ACTIVE);
3538
3539 /* we could miss a preemption opportunity between schedule and now */
3540 barrier();
3541 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3542 goto need_resched;
3543}
3544
3545#endif /* CONFIG_PREEMPT */
3546
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003547int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3548 void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003549{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003550 return try_to_wake_up(curr->private, mode, sync);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003551}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003552EXPORT_SYMBOL(default_wake_function);
3553
3554/*
3555 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3556 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3557 * number) then we wake all the non-exclusive tasks and one exclusive task.
3558 *
3559 * There are circumstances in which we can try to wake a task which has already
3560 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3561 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3562 */
3563static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3564 int nr_exclusive, int sync, void *key)
3565{
3566 struct list_head *tmp, *next;
3567
3568 list_for_each_safe(tmp, next, &q->task_list) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07003569 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3570 unsigned flags = curr->flags;
3571
Linus Torvalds1da177e2005-04-16 15:20:36 -07003572 if (curr->func(curr, mode, sync, key) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07003573 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003574 break;
3575 }
3576}
3577
3578/**
3579 * __wake_up - wake up threads blocked on a waitqueue.
3580 * @q: the waitqueue
3581 * @mode: which threads
3582 * @nr_exclusive: how many wake-one or wake-many threads to wake up
Martin Waitz67be2dd2005-05-01 08:59:26 -07003583 * @key: is directly passed to the wakeup function
Linus Torvalds1da177e2005-04-16 15:20:36 -07003584 */
3585void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003586 int nr_exclusive, void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003587{
3588 unsigned long flags;
3589
3590 spin_lock_irqsave(&q->lock, flags);
3591 __wake_up_common(q, mode, nr_exclusive, 0, key);
3592 spin_unlock_irqrestore(&q->lock, flags);
3593}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003594EXPORT_SYMBOL(__wake_up);
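/*
 * Illustrative sketch: a producer/consumer pair built on the wake_up()
 * wrapper around __wake_up(). 'example_wq', 'example_ready' and both
 * helpers are hypothetical; the waitqueue API calls are real.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_ready;

static inline void example_producer(void)
{
	example_ready = 1;
	wake_up(&example_wq);		/* ends up in __wake_up() above */
}

static inline void example_consumer(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&example_wq, &wait, TASK_INTERRUPTIBLE);
		if (example_ready || signal_pending(current))
			break;
		schedule();		/* woken by example_producer() */
	}
	finish_wait(&example_wq, &wait);
}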
3595
3596/*
3597 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3598 */
3599void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3600{
3601 __wake_up_common(q, mode, 1, 0, NULL);
3602}
3603
3604/**
Martin Waitz67be2dd2005-05-01 08:59:26 -07003605 * __wake_up_sync - wake up threads blocked on a waitqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003606 * @q: the waitqueue
3607 * @mode: which threads
3608 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3609 *
3610 * The sync wakeup differs in that the waker knows that it will schedule
3611 * away soon, so while the target thread will be woken up, it will not
3612 * be migrated to another CPU - ie. the two threads are 'synchronized'
3613 * with each other. This can prevent needless bouncing between CPUs.
3614 *
3615 * On UP it can prevent extra preemption.
3616 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003617void fastcall
3618__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003619{
3620 unsigned long flags;
3621 int sync = 1;
3622
3623 if (unlikely(!q))
3624 return;
3625
3626 if (unlikely(!nr_exclusive))
3627 sync = 0;
3628
3629 spin_lock_irqsave(&q->lock, flags);
3630 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3631 spin_unlock_irqrestore(&q->lock, flags);
3632}
3633EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3634
3635void fastcall complete(struct completion *x)
3636{
3637 unsigned long flags;
3638
3639 spin_lock_irqsave(&x->wait.lock, flags);
3640 x->done++;
3641 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3642 1, 0, NULL);
3643 spin_unlock_irqrestore(&x->wait.lock, flags);
3644}
3645EXPORT_SYMBOL(complete);
3646
3647void fastcall complete_all(struct completion *x)
3648{
3649 unsigned long flags;
3650
3651 spin_lock_irqsave(&x->wait.lock, flags);
3652 x->done += UINT_MAX/2;
3653 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3654 0, 0, NULL);
3655 spin_unlock_irqrestore(&x->wait.lock, flags);
3656}
3657EXPORT_SYMBOL(complete_all);
3658
3659void fastcall __sched wait_for_completion(struct completion *x)
3660{
3661 might_sleep();
Ingo Molnar48f24c42006-07-03 00:25:40 -07003662
Linus Torvalds1da177e2005-04-16 15:20:36 -07003663 spin_lock_irq(&x->wait.lock);
3664 if (!x->done) {
3665 DECLARE_WAITQUEUE(wait, current);
3666
3667 wait.flags |= WQ_FLAG_EXCLUSIVE;
3668 __add_wait_queue_tail(&x->wait, &wait);
3669 do {
3670 __set_current_state(TASK_UNINTERRUPTIBLE);
3671 spin_unlock_irq(&x->wait.lock);
3672 schedule();
3673 spin_lock_irq(&x->wait.lock);
3674 } while (!x->done);
3675 __remove_wait_queue(&x->wait, &wait);
3676 }
3677 x->done--;
3678 spin_unlock_irq(&x->wait.lock);
3679}
3680EXPORT_SYMBOL(wait_for_completion);
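/*
 * Illustrative sketch: the usual completion hand-off. The request
 * structure and both helpers are hypothetical; only init_completion(),
 * complete() and wait_for_completion() are the real API.
 */
struct example_request {
	struct completion done;
	int result;
};

static inline void example_worker_finish(struct example_request *req, int result)
{
	req->result = result;
	complete(&req->done);			/* wakes the waiter below */
}

static inline int example_submit_and_wait(struct example_request *req)
{
	init_completion(&req->done);
	/* ... hand 'req' to a worker that ends with example_worker_finish() ... */
	wait_for_completion(&req->done);	/* sleeps until complete() */
	return req->result;
}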
3681
3682unsigned long fastcall __sched
3683wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3684{
3685 might_sleep();
3686
3687 spin_lock_irq(&x->wait.lock);
3688 if (!x->done) {
3689 DECLARE_WAITQUEUE(wait, current);
3690
3691 wait.flags |= WQ_FLAG_EXCLUSIVE;
3692 __add_wait_queue_tail(&x->wait, &wait);
3693 do {
3694 __set_current_state(TASK_UNINTERRUPTIBLE);
3695 spin_unlock_irq(&x->wait.lock);
3696 timeout = schedule_timeout(timeout);
3697 spin_lock_irq(&x->wait.lock);
3698 if (!timeout) {
3699 __remove_wait_queue(&x->wait, &wait);
3700 goto out;
3701 }
3702 } while (!x->done);
3703 __remove_wait_queue(&x->wait, &wait);
3704 }
3705 x->done--;
3706out:
3707 spin_unlock_irq(&x->wait.lock);
3708 return timeout;
3709}
3710EXPORT_SYMBOL(wait_for_completion_timeout);
3711
3712int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3713{
3714 int ret = 0;
3715
3716 might_sleep();
3717
3718 spin_lock_irq(&x->wait.lock);
3719 if (!x->done) {
3720 DECLARE_WAITQUEUE(wait, current);
3721
3722 wait.flags |= WQ_FLAG_EXCLUSIVE;
3723 __add_wait_queue_tail(&x->wait, &wait);
3724 do {
3725 if (signal_pending(current)) {
3726 ret = -ERESTARTSYS;
3727 __remove_wait_queue(&x->wait, &wait);
3728 goto out;
3729 }
3730 __set_current_state(TASK_INTERRUPTIBLE);
3731 spin_unlock_irq(&x->wait.lock);
3732 schedule();
3733 spin_lock_irq(&x->wait.lock);
3734 } while (!x->done);
3735 __remove_wait_queue(&x->wait, &wait);
3736 }
3737 x->done--;
3738out:
3739 spin_unlock_irq(&x->wait.lock);
3740
3741 return ret;
3742}
3743EXPORT_SYMBOL(wait_for_completion_interruptible);
3744
3745unsigned long fastcall __sched
3746wait_for_completion_interruptible_timeout(struct completion *x,
3747 unsigned long timeout)
3748{
3749 might_sleep();
3750
3751 spin_lock_irq(&x->wait.lock);
3752 if (!x->done) {
3753 DECLARE_WAITQUEUE(wait, current);
3754
3755 wait.flags |= WQ_FLAG_EXCLUSIVE;
3756 __add_wait_queue_tail(&x->wait, &wait);
3757 do {
3758 if (signal_pending(current)) {
3759 timeout = -ERESTARTSYS;
3760 __remove_wait_queue(&x->wait, &wait);
3761 goto out;
3762 }
3763 __set_current_state(TASK_INTERRUPTIBLE);
3764 spin_unlock_irq(&x->wait.lock);
3765 timeout = schedule_timeout(timeout);
3766 spin_lock_irq(&x->wait.lock);
3767 if (!timeout) {
3768 __remove_wait_queue(&x->wait, &wait);
3769 goto out;
3770 }
3771 } while (!x->done);
3772 __remove_wait_queue(&x->wait, &wait);
3773 }
3774 x->done--;
3775out:
3776 spin_unlock_irq(&x->wait.lock);
3777 return timeout;
3778}
3779EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3780
3781
3782#define SLEEP_ON_VAR \
3783 unsigned long flags; \
3784 wait_queue_t wait; \
3785 init_waitqueue_entry(&wait, current);
3786
3787#define SLEEP_ON_HEAD \
3788 spin_lock_irqsave(&q->lock,flags); \
3789 __add_wait_queue(q, &wait); \
3790 spin_unlock(&q->lock);
3791
3792#define SLEEP_ON_TAIL \
3793 spin_lock_irq(&q->lock); \
3794 __remove_wait_queue(q, &wait); \
3795 spin_unlock_irqrestore(&q->lock, flags);
3796
3797void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3798{
3799 SLEEP_ON_VAR
3800
3801 current->state = TASK_INTERRUPTIBLE;
3802
3803 SLEEP_ON_HEAD
3804 schedule();
3805 SLEEP_ON_TAIL
3806}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003807EXPORT_SYMBOL(interruptible_sleep_on);
3808
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003809long fastcall __sched
3810interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003811{
3812 SLEEP_ON_VAR
3813
3814 current->state = TASK_INTERRUPTIBLE;
3815
3816 SLEEP_ON_HEAD
3817 timeout = schedule_timeout(timeout);
3818 SLEEP_ON_TAIL
3819
3820 return timeout;
3821}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003822EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3823
3824void fastcall __sched sleep_on(wait_queue_head_t *q)
3825{
3826 SLEEP_ON_VAR
3827
3828 current->state = TASK_UNINTERRUPTIBLE;
3829
3830 SLEEP_ON_HEAD
3831 schedule();
3832 SLEEP_ON_TAIL
3833}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003834EXPORT_SYMBOL(sleep_on);
3835
3836long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3837{
3838 SLEEP_ON_VAR
3839
3840 current->state = TASK_UNINTERRUPTIBLE;
3841
3842 SLEEP_ON_HEAD
3843 timeout = schedule_timeout(timeout);
3844 SLEEP_ON_TAIL
3845
3846 return timeout;
3847}
3848
3849EXPORT_SYMBOL(sleep_on_timeout);
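/*
 * Illustrative note: the sleep_on() family above does not check any
 * condition between queueing itself and calling schedule(), so a wakeup
 * that races with the caller can be lost. New code generally prefers the
 * condition-checking wait_event*() helpers; a hedged sketch with
 * hypothetical names:
 */
static DECLARE_WAIT_QUEUE_HEAD(example_sleep_wq);
static int example_condition;

static inline int example_wait_for_condition(void)
{
	/* re-evaluates example_condition under the waitqueue machinery */
	return wait_event_interruptible(example_sleep_wq, example_condition);
}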
3850
Ingo Molnarb29739f2006-06-27 02:54:51 -07003851#ifdef CONFIG_RT_MUTEXES
3852
3853/*
3854 * rt_mutex_setprio - set the current priority of a task
3855 * @p: task
3856 * @prio: prio value (kernel-internal form)
3857 *
3858 * This function changes the 'effective' priority of a task. It does
3859 * not touch ->normal_prio like __setscheduler().
3860 *
3861 * Used by the rt_mutex code to implement priority inheritance logic.
3862 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003863void rt_mutex_setprio(struct task_struct *p, int prio)
Ingo Molnarb29739f2006-06-27 02:54:51 -07003864{
3865 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02003866 int oldprio, on_rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003867 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003868 u64 now;
Ingo Molnarb29739f2006-06-27 02:54:51 -07003869
3870 BUG_ON(prio < 0 || prio > MAX_PRIO);
3871
3872 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003873 now = rq_clock(rq);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003874
Andrew Mortond5f9f942007-05-08 20:27:06 -07003875 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02003876 on_rq = p->se.on_rq;
3877 if (on_rq)
3878 dequeue_task(rq, p, 0, now);
3879
3880 if (rt_prio(prio))
3881 p->sched_class = &rt_sched_class;
3882 else
3883 p->sched_class = &fair_sched_class;
3884
Ingo Molnarb29739f2006-06-27 02:54:51 -07003885 p->prio = prio;
3886
Ingo Molnardd41f592007-07-09 18:51:59 +02003887 if (on_rq) {
3888 enqueue_task(rq, p, 0, now);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003889 /*
3890 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07003891 * our priority decreased, or if we are not currently running on
3892 * this runqueue and our priority is higher than the current's
Ingo Molnarb29739f2006-06-27 02:54:51 -07003893 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003894 if (task_running(rq, p)) {
3895 if (p->prio > oldprio)
3896 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02003897 } else {
3898 check_preempt_curr(rq, p);
3899 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07003900 }
3901 task_rq_unlock(rq, &flags);
3902}
3903
3904#endif
3905
Ingo Molnar36c8b582006-07-03 00:25:41 -07003906void set_user_nice(struct task_struct *p, long nice)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003907{
Ingo Molnardd41f592007-07-09 18:51:59 +02003908 int old_prio, delta, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003909 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003910 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003911 u64 now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003912
3913 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3914 return;
3915 /*
3916 * We have to be careful, if called from sys_setpriority(),
3917 * the task might be in the middle of scheduling on another CPU.
3918 */
3919 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003920 now = rq_clock(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003921 /*
3922 * The RT priorities are set via sched_setscheduler(), but we still
3923 * allow the 'normal' nice value to be set - but as expected
3924 * it wont have any effect on scheduling until the task is
Ingo Molnardd41f592007-07-09 18:51:59 +02003925 * SCHED_FIFO/SCHED_RR:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003926 */
Ingo Molnare05606d2007-07-09 18:51:59 +02003927 if (task_has_rt_policy(p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003928 p->static_prio = NICE_TO_PRIO(nice);
3929 goto out_unlock;
3930 }
Ingo Molnardd41f592007-07-09 18:51:59 +02003931 on_rq = p->se.on_rq;
3932 if (on_rq) {
3933 dequeue_task(rq, p, 0, now);
3934 dec_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -07003935 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003936
Linus Torvalds1da177e2005-04-16 15:20:36 -07003937 p->static_prio = NICE_TO_PRIO(nice);
Peter Williams2dd73a42006-06-27 02:54:34 -07003938 set_load_weight(p);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003939 old_prio = p->prio;
3940 p->prio = effective_prio(p);
3941 delta = p->prio - old_prio;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003942
Ingo Molnardd41f592007-07-09 18:51:59 +02003943 if (on_rq) {
3944 enqueue_task(rq, p, 0, now);
3945 inc_load(rq, p, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003946 /*
Andrew Mortond5f9f942007-05-08 20:27:06 -07003947 * If the task increased its priority or is running and
3948 * lowered its priority, then reschedule its CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003949 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003950 if (delta < 0 || (delta > 0 && task_running(rq, p)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003951 resched_task(rq->curr);
3952 }
3953out_unlock:
3954 task_rq_unlock(rq, &flags);
3955}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003956EXPORT_SYMBOL(set_user_nice);
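/*
 * Illustrative sketch: a background kernel thread politely lowering its
 * own priority with set_user_nice(). The thread function and its starter
 * are hypothetical; set_user_nice(), kthread_run() and
 * kthread_should_stop() are the real API.
 */
static int example_background_thread(void *unused)
{
	set_user_nice(current, 19);	/* run at the lowest nice level */

	while (!kthread_should_stop()) {
		/* ... do low-priority housekeeping ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

static inline struct task_struct *example_start_background_thread(void)
{
	return kthread_run(example_background_thread, NULL, "example-nice");
}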
3957
Matt Mackalle43379f2005-05-01 08:59:00 -07003958/*
3959 * can_nice - check if a task can reduce its nice value
3960 * @p: task
3961 * @nice: nice value
3962 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003963int can_nice(const struct task_struct *p, const int nice)
Matt Mackalle43379f2005-05-01 08:59:00 -07003964{
Matt Mackall024f4742005-08-18 11:24:19 -07003965 /* convert nice value [19,-20] to rlimit style value [1,40] */
3966 int nice_rlim = 20 - nice;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003967
Matt Mackalle43379f2005-05-01 08:59:00 -07003968 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3969 capable(CAP_SYS_NICE));
3970}
3971
Linus Torvalds1da177e2005-04-16 15:20:36 -07003972#ifdef __ARCH_WANT_SYS_NICE
3973
3974/*
3975 * sys_nice - change the priority of the current process.
3976 * @increment: priority increment
3977 *
3978 * sys_setpriority is a more generic, but much slower function that
3979 * does similar things.
3980 */
3981asmlinkage long sys_nice(int increment)
3982{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003983 long nice, retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003984
3985 /*
3986 * Setpriority might change our priority at the same moment.
3987 * We don't have to worry. Conceptually one call occurs first
3988 * and we have a single winner.
3989 */
Matt Mackalle43379f2005-05-01 08:59:00 -07003990 if (increment < -40)
3991 increment = -40;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003992 if (increment > 40)
3993 increment = 40;
3994
3995 nice = PRIO_TO_NICE(current->static_prio) + increment;
3996 if (nice < -20)
3997 nice = -20;
3998 if (nice > 19)
3999 nice = 19;
4000
Matt Mackalle43379f2005-05-01 08:59:00 -07004001 if (increment < 0 && !can_nice(current, nice))
4002 return -EPERM;
4003
Linus Torvalds1da177e2005-04-16 15:20:36 -07004004 retval = security_task_setnice(current, nice);
4005 if (retval)
4006 return retval;
4007
4008 set_user_nice(current, nice);
4009 return 0;
4010}
4011
4012#endif
4013
4014/**
4015 * task_prio - return the priority value of a given task.
4016 * @p: the task in question.
4017 *
4018 * This is the priority value as seen by users in /proc.
4019 * RT tasks are offset by -200. Normal tasks are centered
4020 * around 20, values range from 0 to 39 (nice value + 20).
4021 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004022int task_prio(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004023{
4024 return p->prio - MAX_RT_PRIO;
4025}
4026
4027/**
4028 * task_nice - return the nice value of a given task.
4029 * @p: the task in question.
4030 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004031int task_nice(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004032{
4033 return TASK_NICE(p);
4034}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004035EXPORT_SYMBOL_GPL(task_nice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004036
4037/**
4038 * idle_cpu - is a given cpu idle currently?
4039 * @cpu: the processor in question.
4040 */
4041int idle_cpu(int cpu)
4042{
4043 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4044}
4045
Linus Torvalds1da177e2005-04-16 15:20:36 -07004046/**
4047 * idle_task - return the idle task for a given cpu.
4048 * @cpu: the processor in question.
4049 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004050struct task_struct *idle_task(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004051{
4052 return cpu_rq(cpu)->idle;
4053}
4054
4055/**
4056 * find_process_by_pid - find a process with a matching PID value.
4057 * @pid: the pid in question.
4058 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004059static inline struct task_struct *find_process_by_pid(pid_t pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004060{
4061 return pid ? find_task_by_pid(pid) : current;
4062}
4063
4064/* Actually do priority change: must hold rq lock. */
Ingo Molnardd41f592007-07-09 18:51:59 +02004065static void
4066__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004067{
Ingo Molnardd41f592007-07-09 18:51:59 +02004068 BUG_ON(p->se.on_rq);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004069
Linus Torvalds1da177e2005-04-16 15:20:36 -07004070 p->policy = policy;
Ingo Molnardd41f592007-07-09 18:51:59 +02004071 switch (p->policy) {
4072 case SCHED_NORMAL:
4073 case SCHED_BATCH:
4074 case SCHED_IDLE:
4075 p->sched_class = &fair_sched_class;
4076 break;
4077 case SCHED_FIFO:
4078 case SCHED_RR:
4079 p->sched_class = &rt_sched_class;
4080 break;
4081 }
4082
Linus Torvalds1da177e2005-04-16 15:20:36 -07004083 p->rt_priority = prio;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004084 p->normal_prio = normal_prio(p);
4085 /* we are holding p->pi_lock already */
4086 p->prio = rt_mutex_getprio(p);
Peter Williams2dd73a42006-06-27 02:54:34 -07004087 set_load_weight(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004088}
4089
4090/**
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004091 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004092 * @p: the task in question.
4093 * @policy: new policy.
4094 * @param: structure containing the new RT priority.
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004095 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004096 * NOTE that the task may already be dead.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004097 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004098int sched_setscheduler(struct task_struct *p, int policy,
4099 struct sched_param *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004100{
Ingo Molnardd41f592007-07-09 18:51:59 +02004101 int retval, oldprio, oldpolicy = -1, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004102 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004103 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004104
Steven Rostedt66e53932006-06-27 02:54:44 -07004105 /* may grab non-irq protected spin_locks */
4106 BUG_ON(in_interrupt());
Linus Torvalds1da177e2005-04-16 15:20:36 -07004107recheck:
4108 /* double check policy once rq lock held */
4109 if (policy < 0)
4110 policy = oldpolicy = p->policy;
4111 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
Ingo Molnardd41f592007-07-09 18:51:59 +02004112 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4113 policy != SCHED_IDLE)
Ingo Molnarb0a94992006-01-14 13:20:41 -08004114 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004115 /*
4116 * Valid priorities for SCHED_FIFO and SCHED_RR are
Ingo Molnardd41f592007-07-09 18:51:59 +02004117 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4118 * SCHED_BATCH and SCHED_IDLE is 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004119 */
4120 if (param->sched_priority < 0 ||
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004121 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
Steven Rostedtd46523e2005-07-25 16:28:39 -04004122 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004123 return -EINVAL;
Ingo Molnare05606d2007-07-09 18:51:59 +02004124 if (rt_policy(policy) != (param->sched_priority != 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004125 return -EINVAL;
4126
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004127 /*
4128 * Allow unprivileged RT tasks to decrease priority:
4129 */
4130 if (!capable(CAP_SYS_NICE)) {
Ingo Molnare05606d2007-07-09 18:51:59 +02004131 if (rt_policy(policy)) {
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004132 unsigned long rlim_rtprio;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004133
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004134 if (!lock_task_sighand(p, &flags))
4135 return -ESRCH;
4136 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4137 unlock_task_sighand(p, &flags);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004138
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004139 /* can't set/change the rt policy */
4140 if (policy != p->policy && !rlim_rtprio)
4141 return -EPERM;
4142
4143 /* can't increase priority */
4144 if (param->sched_priority > p->rt_priority &&
4145 param->sched_priority > rlim_rtprio)
4146 return -EPERM;
4147 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004148 /*
4149		 * Like positive nice levels, don't allow tasks to
4150 * move out of SCHED_IDLE either:
4151 */
4152 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4153 return -EPERM;
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004154
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004155 /* can't change other user's priorities */
4156 if ((current->euid != p->euid) &&
4157 (current->euid != p->uid))
4158 return -EPERM;
4159 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004160
4161 retval = security_task_setscheduler(p, policy, param);
4162 if (retval)
4163 return retval;
4164 /*
Ingo Molnarb29739f2006-06-27 02:54:51 -07004165 * make sure no PI-waiters arrive (or leave) while we are
4166 * changing the priority of the task:
4167 */
4168 spin_lock_irqsave(&p->pi_lock, flags);
4169 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07004170 * To be able to change p->policy safely, the appropriate
4171 * runqueue lock must be held.
4172 */
Ingo Molnarb29739f2006-06-27 02:54:51 -07004173 rq = __task_rq_lock(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004174 /* recheck policy now with rq lock held */
4175 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4176 policy = oldpolicy = -1;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004177 __task_rq_unlock(rq);
4178 spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004179 goto recheck;
4180 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004181 on_rq = p->se.on_rq;
4182 if (on_rq)
4183 deactivate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004184 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02004185 __setscheduler(rq, p, policy, param->sched_priority);
4186 if (on_rq) {
4187 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004188 /*
4189 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07004190 * our priority decreased, or if we are not currently running on
4191 * this runqueue and our priority is higher than the current's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004192 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07004193 if (task_running(rq, p)) {
4194 if (p->prio > oldprio)
4195 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02004196 } else {
4197 check_preempt_curr(rq, p);
4198 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004199 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07004200 __task_rq_unlock(rq);
4201 spin_unlock_irqrestore(&p->pi_lock, flags);
4202
Thomas Gleixner95e02ca2006-06-27 02:55:02 -07004203 rt_mutex_adjust_pi(p);
4204
Linus Torvalds1da177e2005-04-16 15:20:36 -07004205 return 0;
4206}
4207EXPORT_SYMBOL_GPL(sched_setscheduler);
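/*
 * Illustrative in-kernel usage (a sketch only, not code from this file):
 * a driver that owns a kthread could promote it to SCHED_FIFO with the
 * exported helper above.  The "my_thread" task pointer and the chosen
 * priority are assumptions made up for the example.
 *
 *	struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 10 };
 *
 *	if (sched_setscheduler(my_thread, SCHED_FIFO, &sp) < 0)
 *		printk(KERN_WARNING "could not switch my_thread to FIFO\n");
 */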
4208
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004209static int
4210do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004211{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004212 struct sched_param lparam;
4213 struct task_struct *p;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004214 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004215
4216 if (!param || pid < 0)
4217 return -EINVAL;
4218 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4219 return -EFAULT;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004220
4221 rcu_read_lock();
4222 retval = -ESRCH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004223 p = find_process_by_pid(pid);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004224 if (p != NULL)
4225 retval = sched_setscheduler(p, policy, &lparam);
4226 rcu_read_unlock();
Ingo Molnar36c8b582006-07-03 00:25:41 -07004227
Linus Torvalds1da177e2005-04-16 15:20:36 -07004228 return retval;
4229}
4230
4231/**
4232 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4233 * @pid: the pid in question.
4234 * @policy: new policy.
4235 * @param: structure containing the new RT priority.
4236 */
4237asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4238 struct sched_param __user *param)
4239{
Jason Baronc21761f2006-01-18 17:43:03 -08004240 /* negative values for policy are not valid */
4241 if (policy < 0)
4242 return -EINVAL;
4243
Linus Torvalds1da177e2005-04-16 15:20:36 -07004244 return do_sched_setscheduler(pid, policy, param);
4245}
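/*
 * Userspace view (an illustrative sketch, not part of this file): the
 * syscall is normally reached through the C library wrapper of the same
 * name.  The priority value 10 and the choice of SCHED_FIFO are arbitrary
 * here; pid 0 means the calling process.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	struct sched_param sp = { .sched_priority = 10 };
 *
 *	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
 *		perror("sched_setscheduler");
 */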
4246
4247/**
4248 * sys_sched_setparam - set/change the RT priority of a thread
4249 * @pid: the pid in question.
4250 * @param: structure containing the new RT priority.
4251 */
4252asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4253{
4254 return do_sched_setscheduler(pid, -1, param);
4255}
4256
4257/**
4258 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4259 * @pid: the pid in question.
4260 */
4261asmlinkage long sys_sched_getscheduler(pid_t pid)
4262{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004263 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004264 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004265
4266 if (pid < 0)
4267 goto out_nounlock;
4268
4269 retval = -ESRCH;
4270 read_lock(&tasklist_lock);
4271 p = find_process_by_pid(pid);
4272 if (p) {
4273 retval = security_task_getscheduler(p);
4274 if (!retval)
4275 retval = p->policy;
4276 }
4277 read_unlock(&tasklist_lock);
4278
4279out_nounlock:
4280 return retval;
4281}
4282
4283/**
4284 * sys_sched_getparam - get the RT priority of a thread
4285 * @pid: the pid in question.
4286 * @param: structure containing the RT priority.
4287 */
4288asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4289{
4290 struct sched_param lp;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004291 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004292 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004293
4294 if (!param || pid < 0)
4295 goto out_nounlock;
4296
4297 read_lock(&tasklist_lock);
4298 p = find_process_by_pid(pid);
4299 retval = -ESRCH;
4300 if (!p)
4301 goto out_unlock;
4302
4303 retval = security_task_getscheduler(p);
4304 if (retval)
4305 goto out_unlock;
4306
4307 lp.sched_priority = p->rt_priority;
4308 read_unlock(&tasklist_lock);
4309
4310 /*
4311 * This one might sleep; we cannot do it with a spinlock held ...
4312 */
4313 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4314
4315out_nounlock:
4316 return retval;
4317
4318out_unlock:
4319 read_unlock(&tasklist_lock);
4320 return retval;
4321}
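/*
 * Illustrative userspace sketch for the two read-back calls above: query
 * the policy with sched_getscheduler() and the RT priority with
 * sched_getparam().  pid 0 again refers to the calling process.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	struct sched_param sp;
 *	int policy = sched_getscheduler(0);
 *
 *	if (policy != -1 && sched_getparam(0, &sp) == 0)
 *		printf("policy %d, rt priority %d\n", policy, sp.sched_priority);
 */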
4322
4323long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4324{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004325 cpumask_t cpus_allowed;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004326 struct task_struct *p;
4327 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004328
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004329 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004330 read_lock(&tasklist_lock);
4331
4332 p = find_process_by_pid(pid);
4333 if (!p) {
4334 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004335 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004336 return -ESRCH;
4337 }
4338
4339 /*
4340 * It is not safe to call set_cpus_allowed with the
4341 * tasklist_lock held. We will bump the task_struct's
4342 * usage count and then drop tasklist_lock.
4343 */
4344 get_task_struct(p);
4345 read_unlock(&tasklist_lock);
4346
4347 retval = -EPERM;
4348 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4349 !capable(CAP_SYS_NICE))
4350 goto out_unlock;
4351
David Quigleye7834f82006-06-23 02:03:59 -07004352 retval = security_task_setscheduler(p, 0, NULL);
4353 if (retval)
4354 goto out_unlock;
4355
Linus Torvalds1da177e2005-04-16 15:20:36 -07004356 cpus_allowed = cpuset_cpus_allowed(p);
4357 cpus_and(new_mask, new_mask, cpus_allowed);
4358 retval = set_cpus_allowed(p, new_mask);
4359
4360out_unlock:
4361 put_task_struct(p);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004362 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004363 return retval;
4364}
4365
4366static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4367 cpumask_t *new_mask)
4368{
4369 if (len < sizeof(cpumask_t)) {
4370 memset(new_mask, 0, sizeof(cpumask_t));
4371 } else if (len > sizeof(cpumask_t)) {
4372 len = sizeof(cpumask_t);
4373 }
4374 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4375}
4376
4377/**
4378 * sys_sched_setaffinity - set the cpu affinity of a process
4379 * @pid: pid of the process
4380 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4381 * @user_mask_ptr: user-space pointer to the new cpu mask
4382 */
4383asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4384 unsigned long __user *user_mask_ptr)
4385{
4386 cpumask_t new_mask;
4387 int retval;
4388
4389 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4390 if (retval)
4391 return retval;
4392
4393 return sched_setaffinity(pid, new_mask);
4394}
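/*
 * Illustrative userspace sketch (not taken from this file): pin the
 * calling process to CPU 0 via the glibc wrapper, which passes
 * sizeof(cpu_set_t) as the @len argument handled by get_user_cpu_mask()
 * above.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(0, &set);
 *	if (sched_setaffinity(0, sizeof(set), &set) == -1)
 *		perror("sched_setaffinity");
 */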
4395
4396/*
4397 * Represents all CPUs present in the system.
4398 * In systems capable of hotplug, this map could dynamically grow
4399 * as new CPUs are detected in the system via any platform-specific
4400 * method, such as ACPI.
4401 */
4402
Andi Kleen4cef0c62006-01-11 22:44:57 +01004403cpumask_t cpu_present_map __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004404EXPORT_SYMBOL(cpu_present_map);
4405
4406#ifndef CONFIG_SMP
Andi Kleen4cef0c62006-01-11 22:44:57 +01004407cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004408EXPORT_SYMBOL(cpu_online_map);
4409
Andi Kleen4cef0c62006-01-11 22:44:57 +01004410cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004411EXPORT_SYMBOL(cpu_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004412#endif
4413
4414long sched_getaffinity(pid_t pid, cpumask_t *mask)
4415{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004416 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004417 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004418
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004419 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004420 read_lock(&tasklist_lock);
4421
4422 retval = -ESRCH;
4423 p = find_process_by_pid(pid);
4424 if (!p)
4425 goto out_unlock;
4426
David Quigleye7834f82006-06-23 02:03:59 -07004427 retval = security_task_getscheduler(p);
4428 if (retval)
4429 goto out_unlock;
4430
Jack Steiner2f7016d2006-02-01 03:05:18 -08004431 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004432
4433out_unlock:
4434 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004435 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004436 if (retval)
4437 return retval;
4438
4439 return 0;
4440}
4441
4442/**
4443 * sys_sched_getaffinity - get the cpu affinity of a process
4444 * @pid: pid of the process
4445 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4446 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4447 */
4448asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4449 unsigned long __user *user_mask_ptr)
4450{
4451 int ret;
4452 cpumask_t mask;
4453
4454 if (len < sizeof(cpumask_t))
4455 return -EINVAL;
4456
4457 ret = sched_getaffinity(pid, &mask);
4458 if (ret < 0)
4459 return ret;
4460
4461 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4462 return -EFAULT;
4463
4464 return sizeof(cpumask_t);
4465}
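/*
 * Illustrative userspace sketch: read the current mask back and count the
 * CPUs the caller may run on.  CPU_SETSIZE and CPU_ISSET come from the
 * glibc <sched.h> API (with _GNU_SOURCE defined).
 *
 *	cpu_set_t set;
 *	int cpu, nr = 0;
 *
 *	if (sched_getaffinity(0, sizeof(set), &set) == 0)
 *		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
 *			if (CPU_ISSET(cpu, &set))
 *				nr++;
 */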
4466
4467/**
4468 * sys_sched_yield - yield the current processor to other threads.
4469 *
Ingo Molnardd41f592007-07-09 18:51:59 +02004470 * This function yields the current CPU to other tasks. If there are no
4471 * other threads running on this CPU then this function will return.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004472 */
4473asmlinkage long sys_sched_yield(void)
4474{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004475 struct rq *rq = this_rq_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004476
4477 schedstat_inc(rq, yld_cnt);
Ingo Molnardd41f592007-07-09 18:51:59 +02004478 if (unlikely(rq->nr_running == 1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004479 schedstat_inc(rq, yld_act_empty);
Ingo Molnardd41f592007-07-09 18:51:59 +02004480 else
4481 current->sched_class->yield_task(rq, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004482
4483 /*
4484 * Since we are going to call schedule() anyway, there's
4485 * no need to preempt or enable interrupts:
4486 */
4487 __release(rq->lock);
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004488 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004489 _raw_spin_unlock(&rq->lock);
4490 preempt_enable_no_resched();
4491
4492 schedule();
4493
4494 return 0;
4495}
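/*
 * Illustrative userspace sketch: a (crude) polling loop that gives other
 * runnable threads a chance between checks.  work_available() is a
 * hypothetical function invented for the example; proper blocking
 * primitives are normally preferable to this pattern.
 *
 *	#include <sched.h>
 *
 *	while (!work_available())
 *		sched_yield();
 */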
4496
Andrew Mortone7b38402006-06-30 01:56:00 -07004497static void __cond_resched(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004498{
Ingo Molnar8e0a43d2006-06-23 02:05:23 -07004499#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4500 __might_sleep(__FILE__, __LINE__);
4501#endif
Ingo Molnar5bbcfd92005-07-07 17:57:04 -07004502 /*
4503 * The BKS might be reacquired before we have dropped
4504 * PREEMPT_ACTIVE, which could trigger a second
4505 * cond_resched() call.
4506 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004507 do {
4508 add_preempt_count(PREEMPT_ACTIVE);
4509 schedule();
4510 sub_preempt_count(PREEMPT_ACTIVE);
4511 } while (need_resched());
4512}
4513
4514int __sched cond_resched(void)
4515{
Ingo Molnar94142322006-12-29 16:48:13 -08004516 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4517 system_state == SYSTEM_RUNNING) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004518 __cond_resched();
4519 return 1;
4520 }
4521 return 0;
4522}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004523EXPORT_SYMBOL(cond_resched);
4524
4525/*
4526 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4527 * call schedule, and on return reacquire the lock.
4528 *
4529 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4530 * operations here to prevent schedule() from being called twice (once via
4531 * spin_unlock(), once by hand).
4532 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004533int cond_resched_lock(spinlock_t *lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004534{
Jan Kara6df3cec2005-06-13 15:52:32 -07004535 int ret = 0;
4536
Linus Torvalds1da177e2005-04-16 15:20:36 -07004537 if (need_lockbreak(lock)) {
4538 spin_unlock(lock);
4539 cpu_relax();
Jan Kara6df3cec2005-06-13 15:52:32 -07004540 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004541 spin_lock(lock);
4542 }
Ingo Molnar94142322006-12-29 16:48:13 -08004543 if (need_resched() && system_state == SYSTEM_RUNNING) {
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004544 spin_release(&lock->dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004545 _raw_spin_unlock(lock);
4546 preempt_enable_no_resched();
4547 __cond_resched();
Jan Kara6df3cec2005-06-13 15:52:32 -07004548 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004549 spin_lock(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004550 }
Jan Kara6df3cec2005-06-13 15:52:32 -07004551 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004552}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004553EXPORT_SYMBOL(cond_resched_lock);
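/*
 * Typical use (a sketch; "my_lock", "nr_items" and process_item() are
 * made-up names): a long walk under a spinlock can periodically drop the
 * lock, reschedule and retake it without open-coding the dance described
 * above.  The loop body must tolerate the lock having been dropped
 * whenever cond_resched_lock() returns 1.
 *
 *	spin_lock(&my_lock);
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(i);
 *		cond_resched_lock(&my_lock);
 *	}
 *	spin_unlock(&my_lock);
 */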
4554
4555int __sched cond_resched_softirq(void)
4556{
4557 BUG_ON(!in_softirq());
4558
Ingo Molnar94142322006-12-29 16:48:13 -08004559 if (need_resched() && system_state == SYSTEM_RUNNING) {
Thomas Gleixner98d825672007-05-23 13:58:18 -07004560 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004561 __cond_resched();
4562 local_bh_disable();
4563 return 1;
4564 }
4565 return 0;
4566}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004567EXPORT_SYMBOL(cond_resched_softirq);
4568
Linus Torvalds1da177e2005-04-16 15:20:36 -07004569/**
4570 * yield - yield the current processor to other threads.
4571 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004572 * This is a shortcut for kernel-space yielding - it marks the
Linus Torvalds1da177e2005-04-16 15:20:36 -07004573 * thread runnable and calls sys_sched_yield().
4574 */
4575void __sched yield(void)
4576{
4577 set_current_state(TASK_RUNNING);
4578 sys_sched_yield();
4579}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004580EXPORT_SYMBOL(yield);
4581
4582/*
4583 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4584 * that process accounting knows that this is a task in IO wait state.
4585 *
4586 * But don't do that if it is a deliberate, throttling IO wait (this task
4587 * has set its backing_dev_info: the queue against which it should throttle)
4588 */
4589void __sched io_schedule(void)
4590{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004591 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004592
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004593 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004594 atomic_inc(&rq->nr_iowait);
4595 schedule();
4596 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004597 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004598}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004599EXPORT_SYMBOL(io_schedule);
4600
4601long __sched io_schedule_timeout(long timeout)
4602{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004603 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004604 long ret;
4605
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004606 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004607 atomic_inc(&rq->nr_iowait);
4608 ret = schedule_timeout(timeout);
4609 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004610 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004611 return ret;
4612}
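/*
 * Illustrative kernel-side sketch (my_waitqueue and the io_done flag are
 * invented for the example): a caller that must block for disk I/O uses
 * io_schedule() instead of schedule() so the sleep is accounted as iowait.
 *
 *	DEFINE_WAIT(wait);
 *
 *	for (;;) {
 *		prepare_to_wait(&my_waitqueue, &wait, TASK_UNINTERRUPTIBLE);
 *		if (io_done)
 *			break;
 *		io_schedule();
 *	}
 *	finish_wait(&my_waitqueue, &wait);
 */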
4613
4614/**
4615 * sys_sched_get_priority_max - return maximum RT priority.
4616 * @policy: scheduling class.
4617 *
4618 * this syscall returns the maximum rt_priority that can be used
4619 * by a given scheduling class.
4620 */
4621asmlinkage long sys_sched_get_priority_max(int policy)
4622{
4623 int ret = -EINVAL;
4624
4625 switch (policy) {
4626 case SCHED_FIFO:
4627 case SCHED_RR:
4628 ret = MAX_USER_RT_PRIO-1;
4629 break;
4630 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004631 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004632 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004633 ret = 0;
4634 break;
4635 }
4636 return ret;
4637}
4638
4639/**
4640 * sys_sched_get_priority_min - return minimum RT priority.
4641 * @policy: scheduling class.
4642 *
4643 * this syscall returns the minimum rt_priority that can be used
4644 * by a given scheduling class.
4645 */
4646asmlinkage long sys_sched_get_priority_min(int policy)
4647{
4648 int ret = -EINVAL;
4649
4650 switch (policy) {
4651 case SCHED_FIFO:
4652 case SCHED_RR:
4653 ret = 1;
4654 break;
4655 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004656 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004657 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004658 ret = 0;
4659 }
4660 return ret;
4661}
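/*
 * Illustrative userspace sketch: discover the valid RT priority range for
 * SCHED_FIFO before picking a value for sched_setscheduler().
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int lo = sched_get_priority_min(SCHED_FIFO);
 *	int hi = sched_get_priority_max(SCHED_FIFO);
 *
 *	if (lo != -1 && hi != -1)
 *		printf("SCHED_FIFO priorities: %d..%d\n", lo, hi);
 */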
4662
4663/**
4664 * sys_sched_rr_get_interval - return the default timeslice of a process.
4665 * @pid: pid of the process.
4666 * @interval: userspace pointer to the timeslice value.
4667 *
4668 * this syscall writes the default timeslice value of a given process
4669 * into the user-space timespec buffer. A value of '0' means infinity.
4670 */
4671asmlinkage
4672long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4673{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004674 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004675 int retval = -EINVAL;
4676 struct timespec t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004677
4678 if (pid < 0)
4679 goto out_nounlock;
4680
4681 retval = -ESRCH;
4682 read_lock(&tasklist_lock);
4683 p = find_process_by_pid(pid);
4684 if (!p)
4685 goto out_unlock;
4686
4687 retval = security_task_getscheduler(p);
4688 if (retval)
4689 goto out_unlock;
4690
Peter Williamsb78709c2006-06-26 16:58:00 +10004691 jiffies_to_timespec(p->policy == SCHED_FIFO ?
Ingo Molnardd41f592007-07-09 18:51:59 +02004692 0 : static_prio_timeslice(p->static_prio), &t);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004693 read_unlock(&tasklist_lock);
4694 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4695out_nounlock:
4696 return retval;
4697out_unlock:
4698 read_unlock(&tasklist_lock);
4699 return retval;
4700}
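/*
 * Illustrative userspace sketch: read the round-robin timeslice of the
 * calling process into a struct timespec.
 *
 *	#include <sched.h>
 *	#include <time.h>
 *	#include <stdio.h>
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
 */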
4701
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004702static const char stat_nam[] = "RSDTtZX";
Ingo Molnar36c8b582006-07-03 00:25:41 -07004703
4704static void show_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004705{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004706 unsigned long free = 0;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004707 unsigned state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004708
Linus Torvalds1da177e2005-04-16 15:20:36 -07004709 state = p->state ? __ffs(p->state) + 1 : 0;
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004710 printk("%-13.13s %c", p->comm,
4711 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
Linus Torvalds1da177e2005-04-16 15:20:36 -07004712#if (BITS_PER_LONG == 32)
4713 if (state == TASK_RUNNING)
4714 printk(" running ");
4715 else
4716 printk(" %08lX ", thread_saved_pc(p));
4717#else
4718 if (state == TASK_RUNNING)
4719 printk(" running task ");
4720 else
4721 printk(" %016lx ", thread_saved_pc(p));
4722#endif
4723#ifdef CONFIG_DEBUG_STACK_USAGE
4724 {
Al Viro10ebffd2005-11-13 16:06:56 -08004725 unsigned long *n = end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004726 while (!*n)
4727 n++;
Al Viro10ebffd2005-11-13 16:06:56 -08004728 free = (unsigned long)n - (unsigned long)end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004729 }
4730#endif
Ingo Molnar35f6f752007-04-06 21:18:06 +02004731 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004732 if (!p->mm)
4733 printk(" (L-TLB)\n");
4734 else
4735 printk(" (NOTLB)\n");
4736
4737 if (state != TASK_RUNNING)
4738 show_stack(p, NULL);
4739}
4740
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004741void show_state_filter(unsigned long state_filter)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004742{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004743 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004744
4745#if (BITS_PER_LONG == 32)
4746 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004747 " free sibling\n");
4748 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004749#else
4750 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004751 " free sibling\n");
4752 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004753#endif
4754 read_lock(&tasklist_lock);
4755 do_each_thread(g, p) {
4756 /*
4757 * reset the NMI-timeout, listing all tasks on a slow
4758 * console might take a lot of time:
4759 */
4760 touch_nmi_watchdog();
Ingo Molnar39bc89f2007-04-25 20:50:03 -07004761 if (!state_filter || (p->state & state_filter))
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004762 show_task(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004763 } while_each_thread(g, p);
4764
Jeremy Fitzhardinge04c91672007-05-08 00:28:05 -07004765 touch_all_softlockup_watchdogs();
4766
Ingo Molnardd41f592007-07-09 18:51:59 +02004767#ifdef CONFIG_SCHED_DEBUG
4768 sysrq_sched_debug_show();
4769#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004770 read_unlock(&tasklist_lock);
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004771 /*
4772 * Only show locks if all tasks are dumped:
4773 */
4774 if (state_filter == -1)
4775 debug_show_all_locks();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004776}
4777
Ingo Molnar1df21052007-07-09 18:51:58 +02004778void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4779{
Ingo Molnardd41f592007-07-09 18:51:59 +02004780 idle->sched_class = &idle_sched_class;
Ingo Molnar1df21052007-07-09 18:51:58 +02004781}
4782
Ingo Molnarf340c0d2005-06-28 16:40:42 +02004783/**
4784 * init_idle - set up an idle thread for a given CPU
4785 * @idle: task in question
4786 * @cpu: cpu the idle task belongs to
4787 *
4788 * NOTE: this function does not set the idle thread's NEED_RESCHED
4789 * flag, to make booting more robust.
4790 */
Nick Piggin5c1e1762006-10-03 01:14:04 -07004791void __cpuinit init_idle(struct task_struct *idle, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004792{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004793 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004794 unsigned long flags;
4795
Ingo Molnardd41f592007-07-09 18:51:59 +02004796 __sched_fork(idle);
4797 idle->se.exec_start = sched_clock();
4798
Ingo Molnarb29739f2006-06-27 02:54:51 -07004799 idle->prio = idle->normal_prio = MAX_PRIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004800 idle->cpus_allowed = cpumask_of_cpu(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004801 __set_task_cpu(idle, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004802
4803 spin_lock_irqsave(&rq->lock, flags);
4804 rq->curr = rq->idle = idle;
Nick Piggin4866cde2005-06-25 14:57:23 -07004805#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4806 idle->oncpu = 1;
4807#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004808 spin_unlock_irqrestore(&rq->lock, flags);
4809
4810 /* Set the preempt count _outside_ the spinlocks! */
4811#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
Al Viroa1261f52005-11-13 16:06:55 -08004812 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004813#else
Al Viroa1261f52005-11-13 16:06:55 -08004814 task_thread_info(idle)->preempt_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004815#endif
Ingo Molnardd41f592007-07-09 18:51:59 +02004816 /*
4817 * The idle tasks have their own, simple scheduling class:
4818 */
4819 idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004820}
4821
4822/*
4823 * In a system that switches off the HZ timer nohz_cpu_mask
4824 * indicates which cpus entered this state. This is used
4825 * in the rcu update to wait only for active cpus. For systems
4826 * which do not switch off the HZ timer, nohz_cpu_mask should
4827 * always be CPU_MASK_NONE.
4828 */
4829cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4830
Ingo Molnardd41f592007-07-09 18:51:59 +02004831/*
4832 * Increase the granularity value when there are more CPUs,
4833 * because with more CPUs the 'effective latency' as visible
4834 * to users decreases. But the relationship is not linear,
4835 * so pick a second-best guess by going with the log2 of the
4836 * number of CPUs.
4837 *
4838 * This idea comes from the SD scheduler of Con Kolivas:
4839 */
4840static inline void sched_init_granularity(void)
4841{
4842 unsigned int factor = 1 + ilog2(num_online_cpus());
4843 const unsigned long gran_limit = 10000000;
4844
4845 sysctl_sched_granularity *= factor;
4846 if (sysctl_sched_granularity > gran_limit)
4847 sysctl_sched_granularity = gran_limit;
4848
4849 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4850 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4851}
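/*
 * Worked example of the scaling above (a sketch, not extra code): with 4
 * online CPUs, factor = 1 + ilog2(4) = 3, so the granularity is tripled;
 * with 64 CPUs, factor = 1 + ilog2(64) = 7.  Whatever the factor, the
 * result is clamped to gran_limit, and the runtime limit and wakeup
 * granularity are then derived as 4x and 1/2 of the final value.
 */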
4852
Linus Torvalds1da177e2005-04-16 15:20:36 -07004853#ifdef CONFIG_SMP
4854/*
4855 * This is how migration works:
4856 *
Ingo Molnar70b97a72006-07-03 00:25:42 -07004857 * 1) we queue a struct migration_req structure in the source CPU's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004858 * runqueue and wake up that CPU's migration thread.
4859 * 2) we down() the locked semaphore => thread blocks.
4860 * 3) migration thread wakes up (implicitly it forces the migrated
4861 * thread off the CPU)
4862 * 4) it gets the migration request and checks whether the migrated
4863 * task is still in the wrong runqueue.
4864 * 5) if it's in the wrong runqueue then the migration thread removes
4865 * it and puts it into the right queue.
4866 * 6) migration thread up()s the semaphore.
4867 * 7) we wake up and the migration is done.
4868 */
4869
4870/*
4871 * Change a given task's CPU affinity. Migrate the thread to a
4872 * proper CPU and schedule it away if the CPU it's executing on
4873 * is removed from the allowed bitmask.
4874 *
4875 * NOTE: the caller must have a valid reference to the task, the
4876 * task must not exit() & deallocate itself prematurely. The
4877 * call is not atomic; no spinlocks may be held.
4878 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004879int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004880{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004881 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004882 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004883 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07004884 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004885
4886 rq = task_rq_lock(p, &flags);
4887 if (!cpus_intersects(new_mask, cpu_online_map)) {
4888 ret = -EINVAL;
4889 goto out;
4890 }
4891
4892 p->cpus_allowed = new_mask;
4893 /* Can the task run on the task's current CPU? If so, we're done */
4894 if (cpu_isset(task_cpu(p), new_mask))
4895 goto out;
4896
4897 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4898 /* Need help from migration thread: drop lock and wait. */
4899 task_rq_unlock(rq, &flags);
4900 wake_up_process(rq->migration_thread);
4901 wait_for_completion(&req.done);
4902 tlb_migrate_finish(p->mm);
4903 return 0;
4904 }
4905out:
4906 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004907
Linus Torvalds1da177e2005-04-16 15:20:36 -07004908 return ret;
4909}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004910EXPORT_SYMBOL_GPL(set_cpus_allowed);
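/*
 * Illustrative in-kernel sketch (the "my_task" pointer and CPU number are
 * made up): restrict a task the caller holds a reference to onto CPU 2,
 * letting the migration thread move it if it is currently running
 * elsewhere.
 *
 *	if (set_cpus_allowed(my_task, cpumask_of_cpu(2)) < 0)
 *		printk(KERN_WARNING "CPU 2 is not online\n");
 */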
4911
4912/*
4913 * Move (not current) task off this cpu, onto dest cpu. We're doing
4914 * this because either it can't run here any more (set_cpus_allowed()
4915 * away from this CPU, or CPU going down), or because we're
4916 * attempting to rebalance this task on exec (sched_exec).
4917 *
4918 * So we race with normal scheduler movements, but that's OK, as long
4919 * as the task is no longer on this CPU.
Kirill Korotaevefc30812006-06-27 02:54:32 -07004920 *
4921 * Returns non-zero if task was successfully migrated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004922 */
Kirill Korotaevefc30812006-06-27 02:54:32 -07004923static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004924{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004925 struct rq *rq_dest, *rq_src;
Ingo Molnardd41f592007-07-09 18:51:59 +02004926 int ret = 0, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004927
4928 if (unlikely(cpu_is_offline(dest_cpu)))
Kirill Korotaevefc30812006-06-27 02:54:32 -07004929 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004930
4931 rq_src = cpu_rq(src_cpu);
4932 rq_dest = cpu_rq(dest_cpu);
4933
4934 double_rq_lock(rq_src, rq_dest);
4935 /* Already moved. */
4936 if (task_cpu(p) != src_cpu)
4937 goto out;
4938 /* Affinity changed (again). */
4939 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4940 goto out;
4941
Ingo Molnardd41f592007-07-09 18:51:59 +02004942 on_rq = p->se.on_rq;
4943 if (on_rq)
4944 deactivate_task(rq_src, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004945 set_task_cpu(p, dest_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004946 if (on_rq) {
4947 activate_task(rq_dest, p, 0);
4948 check_preempt_curr(rq_dest, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004949 }
Kirill Korotaevefc30812006-06-27 02:54:32 -07004950 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004951out:
4952 double_rq_unlock(rq_src, rq_dest);
Kirill Korotaevefc30812006-06-27 02:54:32 -07004953 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004954}
4955
4956/*
4957 * migration_thread - this is a highprio system thread that performs
4958 * thread migration by bumping thread off CPU then 'pushing' onto
4959 * another runqueue.
4960 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004961static int migration_thread(void *data)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004962{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004963 int cpu = (long)data;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004964 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004965
4966 rq = cpu_rq(cpu);
4967 BUG_ON(rq->migration_thread != current);
4968
4969 set_current_state(TASK_INTERRUPTIBLE);
4970 while (!kthread_should_stop()) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07004971 struct migration_req *req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004972 struct list_head *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004973
Christoph Lameter3e1d1d22005-06-24 23:13:50 -07004974 try_to_freeze();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004975
4976 spin_lock_irq(&rq->lock);
4977
4978 if (cpu_is_offline(cpu)) {
4979 spin_unlock_irq(&rq->lock);
4980 goto wait_to_die;
4981 }
4982
4983 if (rq->active_balance) {
4984 active_load_balance(rq, cpu);
4985 rq->active_balance = 0;
4986 }
4987
4988 head = &rq->migration_queue;
4989
4990 if (list_empty(head)) {
4991 spin_unlock_irq(&rq->lock);
4992 schedule();
4993 set_current_state(TASK_INTERRUPTIBLE);
4994 continue;
4995 }
Ingo Molnar70b97a72006-07-03 00:25:42 -07004996 req = list_entry(head->next, struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004997 list_del_init(head->next);
4998
Nick Piggin674311d2005-06-25 14:57:27 -07004999 spin_unlock(&rq->lock);
5000 __migrate_task(req->task, cpu, req->dest_cpu);
5001 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005002
5003 complete(&req->done);
5004 }
5005 __set_current_state(TASK_RUNNING);
5006 return 0;
5007
5008wait_to_die:
5009 /* Wait for kthread_stop */
5010 set_current_state(TASK_INTERRUPTIBLE);
5011 while (!kthread_should_stop()) {
5012 schedule();
5013 set_current_state(TASK_INTERRUPTIBLE);
5014 }
5015 __set_current_state(TASK_RUNNING);
5016 return 0;
5017}
5018
5019#ifdef CONFIG_HOTPLUG_CPU
Kirill Korotaev054b9102006-12-10 02:20:11 -08005020/*
5021 * Figure out where task on dead CPU should go, use force if neccessary.
5022 * NOTE: interrupts should be disabled by the caller
5023 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005024static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005025{
Kirill Korotaevefc30812006-06-27 02:54:32 -07005026 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005027 cpumask_t mask;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005028 struct rq *rq;
5029 int dest_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005030
Kirill Korotaevefc30812006-06-27 02:54:32 -07005031restart:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005032 /* On same node? */
5033 mask = node_to_cpumask(cpu_to_node(dead_cpu));
Ingo Molnar48f24c42006-07-03 00:25:40 -07005034 cpus_and(mask, mask, p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005035 dest_cpu = any_online_cpu(mask);
5036
5037 /* On any allowed CPU? */
5038 if (dest_cpu == NR_CPUS)
Ingo Molnar48f24c42006-07-03 00:25:40 -07005039 dest_cpu = any_online_cpu(p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005040
5041 /* No more Mr. Nice Guy. */
5042 if (dest_cpu == NR_CPUS) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07005043 rq = task_rq_lock(p, &flags);
5044 cpus_setall(p->cpus_allowed);
5045 dest_cpu = any_online_cpu(p->cpus_allowed);
Kirill Korotaevefc30812006-06-27 02:54:32 -07005046 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005047
5048 /*
5049 * Don't tell them about moving exiting tasks or
5050 * kernel threads (both mm NULL), since they never
5051 * leave the kernel.
5052 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005053 if (p->mm && printk_ratelimit())
Linus Torvalds1da177e2005-04-16 15:20:36 -07005054 printk(KERN_INFO "process %d (%s) no "
5055 "longer affine to cpu%d\n",
Ingo Molnar48f24c42006-07-03 00:25:40 -07005056 p->pid, p->comm, dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005057 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07005058 if (!__migrate_task(p, dead_cpu, dest_cpu))
Kirill Korotaevefc30812006-06-27 02:54:32 -07005059 goto restart;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005060}
5061
5062/*
5063 * While a dead CPU has no uninterruptible tasks queued at this point,
5064 * it might still have a nonzero ->nr_uninterruptible counter, because
5065 * for performance reasons the counter is not strictly tracking tasks to
5066 * their home CPUs. So we just add the counter to another CPU's counter,
5067 * to keep the global sum constant after CPU-down:
5068 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07005069static void migrate_nr_uninterruptible(struct rq *rq_src)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005070{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005071 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005072 unsigned long flags;
5073
5074 local_irq_save(flags);
5075 double_rq_lock(rq_src, rq_dest);
5076 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5077 rq_src->nr_uninterruptible = 0;
5078 double_rq_unlock(rq_src, rq_dest);
5079 local_irq_restore(flags);
5080}
5081
5082/* Run through task list and migrate tasks from the dead cpu. */
5083static void migrate_live_tasks(int src_cpu)
5084{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005085 struct task_struct *p, *t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005086
5087 write_lock_irq(&tasklist_lock);
5088
Ingo Molnar48f24c42006-07-03 00:25:40 -07005089 do_each_thread(t, p) {
5090 if (p == current)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005091 continue;
5092
Ingo Molnar48f24c42006-07-03 00:25:40 -07005093 if (task_cpu(p) == src_cpu)
5094 move_task_off_dead_cpu(src_cpu, p);
5095 } while_each_thread(t, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005096
5097 write_unlock_irq(&tasklist_lock);
5098}
5099
Ingo Molnardd41f592007-07-09 18:51:59 +02005100/*
5101 * Schedules idle task to be the next runnable task on current CPU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005102 * It does so by boosting its priority to highest possible and adding it to
Ingo Molnar48f24c42006-07-03 00:25:40 -07005103 * the _front_ of the runqueue. Used by CPU offline code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005104 */
5105void sched_idle_next(void)
5106{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005107 int this_cpu = smp_processor_id();
Ingo Molnar70b97a72006-07-03 00:25:42 -07005108 struct rq *rq = cpu_rq(this_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005109 struct task_struct *p = rq->idle;
5110 unsigned long flags;
5111
5112 /* cpu has to be offline */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005113 BUG_ON(cpu_online(this_cpu));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005114
Ingo Molnar48f24c42006-07-03 00:25:40 -07005115 /*
5116 * Strictly not necessary since the rest of the CPUs are stopped by now
5117 * and interrupts disabled on the current cpu.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005118 */
5119 spin_lock_irqsave(&rq->lock, flags);
5120
Ingo Molnardd41f592007-07-09 18:51:59 +02005121 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005122
5123 /* Add idle task to the _front_ of its priority queue: */
Ingo Molnardd41f592007-07-09 18:51:59 +02005124 activate_idle_task(p, rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005125
5126 spin_unlock_irqrestore(&rq->lock, flags);
5127}
5128
Ingo Molnar48f24c42006-07-03 00:25:40 -07005129/*
5130 * Ensures that the idle task is using init_mm right before its cpu goes
Linus Torvalds1da177e2005-04-16 15:20:36 -07005131 * offline.
5132 */
5133void idle_task_exit(void)
5134{
5135 struct mm_struct *mm = current->active_mm;
5136
5137 BUG_ON(cpu_online(smp_processor_id()));
5138
5139 if (mm != &init_mm)
5140 switch_mm(mm, &init_mm, current);
5141 mmdrop(mm);
5142}
5143
Kirill Korotaev054b9102006-12-10 02:20:11 -08005144/* called under rq->lock with disabled interrupts */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005145static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005146{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005147 struct rq *rq = cpu_rq(dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005148
5149 /* Must be exiting, otherwise would be on tasklist. */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005150 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005151
5152 /* Cannot have done final schedule yet: would have vanished. */
Oleg Nesterovc394cc92006-09-29 02:01:11 -07005153 BUG_ON(p->state == TASK_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005154
Ingo Molnar48f24c42006-07-03 00:25:40 -07005155 get_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005156
5157 /*
5158 * Drop lock around migration; if someone else moves it,
5159 * that's OK. No task can be added to this CPU, so iteration is
5160 * fine.
Kirill Korotaev054b9102006-12-10 02:20:11 -08005161 * NOTE: interrupts should be left disabled --dev@
Linus Torvalds1da177e2005-04-16 15:20:36 -07005162 */
Kirill Korotaev054b9102006-12-10 02:20:11 -08005163 spin_unlock(&rq->lock);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005164 move_task_off_dead_cpu(dead_cpu, p);
Kirill Korotaev054b9102006-12-10 02:20:11 -08005165 spin_lock(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005166
Ingo Molnar48f24c42006-07-03 00:25:40 -07005167 put_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005168}
5169
5170/* release_task() removes task from tasklist, so we won't find dead tasks. */
5171static void migrate_dead_tasks(unsigned int dead_cpu)
5172{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005173 struct rq *rq = cpu_rq(dead_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02005174 struct task_struct *next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005175
Ingo Molnardd41f592007-07-09 18:51:59 +02005176 for ( ; ; ) {
5177 if (!rq->nr_running)
5178 break;
5179 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5180 if (!next)
5181 break;
5182 migrate_dead(dead_cpu, next);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005183 }
5184}
5185#endif /* CONFIG_HOTPLUG_CPU */
5186
5187/*
5188 * migration_call - callback that gets triggered when a CPU is added.
5189 * Here we can start up the necessary migration thread for the new CPU.
5190 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005191static int __cpuinit
5192migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005193{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005194 struct task_struct *p;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005195 int cpu = (long)hcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005196 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005197 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005198
5199 switch (action) {
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005200 case CPU_LOCK_ACQUIRE:
5201 mutex_lock(&sched_hotcpu_mutex);
5202 break;
5203
Linus Torvalds1da177e2005-04-16 15:20:36 -07005204 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005205 case CPU_UP_PREPARE_FROZEN:
Ingo Molnardd41f592007-07-09 18:51:59 +02005206 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005207 if (IS_ERR(p))
5208 return NOTIFY_BAD;
5209 p->flags |= PF_NOFREEZE;
5210 kthread_bind(p, cpu);
5211 /* Must be high prio: stop_machine expects to yield to it. */
5212 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005213 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005214 task_rq_unlock(rq, &flags);
5215 cpu_rq(cpu)->migration_thread = p;
5216 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005217
Linus Torvalds1da177e2005-04-16 15:20:36 -07005218 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005219 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005220 /* Strictly unnecessary, as the first user will wake it. */
5221 wake_up_process(cpu_rq(cpu)->migration_thread);
5222 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005223
Linus Torvalds1da177e2005-04-16 15:20:36 -07005224#ifdef CONFIG_HOTPLUG_CPU
5225 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005226 case CPU_UP_CANCELED_FROZEN:
Heiko Carstensfc75cdf2006-06-25 05:49:10 -07005227 if (!cpu_rq(cpu)->migration_thread)
5228 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005229 /* Unbind it from offline cpu so it can run. Fall thru. */
Heiko Carstensa4c4af72005-11-07 00:58:38 -08005230 kthread_bind(cpu_rq(cpu)->migration_thread,
5231 any_online_cpu(cpu_online_map));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005232 kthread_stop(cpu_rq(cpu)->migration_thread);
5233 cpu_rq(cpu)->migration_thread = NULL;
5234 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005235
Linus Torvalds1da177e2005-04-16 15:20:36 -07005236 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005237 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005238 migrate_live_tasks(cpu);
5239 rq = cpu_rq(cpu);
5240 kthread_stop(rq->migration_thread);
5241 rq->migration_thread = NULL;
5242 /* Idle task back to normal (off runqueue, low prio) */
5243 rq = task_rq_lock(rq->idle, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005244 deactivate_task(rq, rq->idle, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005245 rq->idle->static_prio = MAX_PRIO;
Ingo Molnardd41f592007-07-09 18:51:59 +02005246 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5247 rq->idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005248 migrate_dead_tasks(cpu);
5249 task_rq_unlock(rq, &flags);
5250 migrate_nr_uninterruptible(rq);
5251 BUG_ON(rq->nr_running != 0);
5252
5253 /* No need to migrate the tasks: it was best-effort if
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005254 * they didn't take sched_hotcpu_mutex. Just wake up
Linus Torvalds1da177e2005-04-16 15:20:36 -07005255 * the requestors. */
5256 spin_lock_irq(&rq->lock);
5257 while (!list_empty(&rq->migration_queue)) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07005258 struct migration_req *req;
5259
Linus Torvalds1da177e2005-04-16 15:20:36 -07005260 req = list_entry(rq->migration_queue.next,
Ingo Molnar70b97a72006-07-03 00:25:42 -07005261 struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005262 list_del_init(&req->list);
5263 complete(&req->done);
5264 }
5265 spin_unlock_irq(&rq->lock);
5266 break;
5267#endif
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005268 case CPU_LOCK_RELEASE:
5269 mutex_unlock(&sched_hotcpu_mutex);
5270 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005271 }
5272 return NOTIFY_OK;
5273}
5274
5275/* Register at highest priority so that task migration (migrate_all_tasks)
5276 * happens before everything else.
5277 */
Chandra Seetharaman26c21432006-06-27 02:54:10 -07005278static struct notifier_block __cpuinitdata migration_notifier = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005279 .notifier_call = migration_call,
5280 .priority = 10
5281};
5282
5283int __init migration_init(void)
5284{
5285 void *cpu = (void *)(long)smp_processor_id();
Akinobu Mita07dccf32006-09-29 02:00:22 -07005286 int err;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005287
5288 /* Start one for the boot CPU: */
Akinobu Mita07dccf32006-09-29 02:00:22 -07005289 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5290 BUG_ON(err == NOTIFY_BAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005291 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5292 register_cpu_notifier(&migration_notifier);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005293
Linus Torvalds1da177e2005-04-16 15:20:36 -07005294 return 0;
5295}
5296#endif
5297
5298#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07005299
5300/* Number of possible processor ids */
5301int nr_cpu_ids __read_mostly = NR_CPUS;
5302EXPORT_SYMBOL(nr_cpu_ids);
5303
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005304#undef SCHED_DOMAIN_DEBUG
Linus Torvalds1da177e2005-04-16 15:20:36 -07005305#ifdef SCHED_DOMAIN_DEBUG
5306static void sched_domain_debug(struct sched_domain *sd, int cpu)
5307{
5308 int level = 0;
5309
Nick Piggin41c7ce92005-06-25 14:57:24 -07005310 if (!sd) {
5311 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5312 return;
5313 }
5314
Linus Torvalds1da177e2005-04-16 15:20:36 -07005315 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5316
5317 do {
5318 int i;
5319 char str[NR_CPUS];
5320 struct sched_group *group = sd->groups;
5321 cpumask_t groupmask;
5322
5323 cpumask_scnprintf(str, NR_CPUS, sd->span);
5324 cpus_clear(groupmask);
5325
5326 printk(KERN_DEBUG);
5327 for (i = 0; i < level + 1; i++)
5328 printk(" ");
5329 printk("domain %d: ", level);
5330
5331 if (!(sd->flags & SD_LOAD_BALANCE)) {
5332 printk("does not load-balance\n");
5333 if (sd->parent)
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005334 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5335 " has parent");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005336 break;
5337 }
5338
5339 printk("span %s\n", str);
5340
5341 if (!cpu_isset(cpu, sd->span))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005342 printk(KERN_ERR "ERROR: domain->span does not contain "
5343 "CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005344 if (!cpu_isset(cpu, group->cpumask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005345 printk(KERN_ERR "ERROR: domain->groups does not contain"
5346 " CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005347
5348 printk(KERN_DEBUG);
5349 for (i = 0; i < level + 2; i++)
5350 printk(" ");
5351 printk("groups:");
5352 do {
5353 if (!group) {
5354 printk("\n");
5355 printk(KERN_ERR "ERROR: group is NULL\n");
5356 break;
5357 }
5358
Eric Dumazet5517d862007-05-08 00:32:57 -07005359 if (!group->__cpu_power) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005360 printk("\n");
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005361 printk(KERN_ERR "ERROR: domain->cpu_power not "
5362 "set\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005363 }
5364
5365 if (!cpus_weight(group->cpumask)) {
5366 printk("\n");
5367 printk(KERN_ERR "ERROR: empty group\n");
5368 }
5369
5370 if (cpus_intersects(groupmask, group->cpumask)) {
5371 printk("\n");
5372 printk(KERN_ERR "ERROR: repeated CPUs\n");
5373 }
5374
5375 cpus_or(groupmask, groupmask, group->cpumask);
5376
5377 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5378 printk(" %s", str);
5379
5380 group = group->next;
5381 } while (group != sd->groups);
5382 printk("\n");
5383
5384 if (!cpus_equal(sd->span, groupmask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005385 printk(KERN_ERR "ERROR: groups don't span "
5386 "domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005387
5388 level++;
5389 sd = sd->parent;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005390 if (!sd)
5391 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005392
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005393 if (!cpus_subset(groupmask, sd->span))
5394 printk(KERN_ERR "ERROR: parent span is not a superset "
5395 "of domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005396
5397 } while (sd);
5398}
5399#else
Ingo Molnar48f24c42006-07-03 00:25:40 -07005400# define sched_domain_debug(sd, cpu) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005401#endif
5402
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005403static int sd_degenerate(struct sched_domain *sd)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005404{
5405 if (cpus_weight(sd->span) == 1)
5406 return 1;
5407
5408 /* Following flags need at least 2 groups */
5409 if (sd->flags & (SD_LOAD_BALANCE |
5410 SD_BALANCE_NEWIDLE |
5411 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005412 SD_BALANCE_EXEC |
5413 SD_SHARE_CPUPOWER |
5414 SD_SHARE_PKG_RESOURCES)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005415 if (sd->groups != sd->groups->next)
5416 return 0;
5417 }
5418
5419 /* Following flags don't use groups */
5420 if (sd->flags & (SD_WAKE_IDLE |
5421 SD_WAKE_AFFINE |
5422 SD_WAKE_BALANCE))
5423 return 0;
5424
5425 return 1;
5426}
5427
Ingo Molnar48f24c42006-07-03 00:25:40 -07005428static int
5429sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005430{
5431 unsigned long cflags = sd->flags, pflags = parent->flags;
5432
5433 if (sd_degenerate(parent))
5434 return 1;
5435
5436 if (!cpus_equal(sd->span, parent->span))
5437 return 0;
5438
5439 /* Does parent contain flags not in child? */
5440 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5441 if (cflags & SD_WAKE_AFFINE)
5442 pflags &= ~SD_WAKE_BALANCE;
5443 /* Flags needing groups don't count if only 1 group in parent */
5444 if (parent->groups == parent->groups->next) {
5445 pflags &= ~(SD_LOAD_BALANCE |
5446 SD_BALANCE_NEWIDLE |
5447 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005448 SD_BALANCE_EXEC |
5449 SD_SHARE_CPUPOWER |
5450 SD_SHARE_PKG_RESOURCES);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005451 }
5452 if (~cflags & pflags)
5453 return 0;
5454
5455 return 1;
5456}
5457
Linus Torvalds1da177e2005-04-16 15:20:36 -07005458/*
5459 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5460 * hold the hotplug lock.
5461 */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005462static void cpu_attach_domain(struct sched_domain *sd, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005463{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005464 struct rq *rq = cpu_rq(cpu);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005465 struct sched_domain *tmp;
5466
5467 /* Remove the sched domains which do not contribute to scheduling. */
5468 for (tmp = sd; tmp; tmp = tmp->parent) {
5469 struct sched_domain *parent = tmp->parent;
5470 if (!parent)
5471 break;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005472 if (sd_parent_degenerate(tmp, parent)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005473 tmp->parent = parent->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005474 if (parent->parent)
5475 parent->parent->child = tmp;
5476 }
Suresh Siddha245af2c2005-06-25 14:57:25 -07005477 }
5478
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005479 if (sd && sd_degenerate(sd)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005480 sd = sd->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005481 if (sd)
5482 sd->child = NULL;
5483 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005484
5485 sched_domain_debug(sd, cpu);
5486
Nick Piggin674311d2005-06-25 14:57:27 -07005487 rcu_assign_pointer(rq->sd, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005488}
5489
5490/* cpus with isolated domains */
Tim Chen67af63a2006-12-22 01:07:50 -08005491static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005492
5493/* Setup the mask of cpus configured for isolated domains */
5494static int __init isolated_cpu_setup(char *str)
5495{
5496 int ints[NR_CPUS], i;
5497
5498 str = get_options(str, ARRAY_SIZE(ints), ints);
5499 cpus_clear(cpu_isolated_map);
5500 for (i = 1; i <= ints[0]; i++)
5501 if (ints[i] < NR_CPUS)
5502 cpu_set(ints[i], cpu_isolated_map);
5503 return 1;
5504}
5505
5506__setup ("isolcpus=", isolated_cpu_setup);
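/*
 * Illustrative usage of the boot parameter parsed above: booting with
 *
 *	isolcpus=2,3
 *
 * keeps CPUs 2 and 3 out of the scheduler domains, so only explicit
 * affinity settings (e.g. sched_setaffinity()) place tasks there.
 */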
5507
5508/*
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005509 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5510 * to a function which identifies what group (along with its sched group) a CPU
5511 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5512 * (due to the fact that we keep track of groups covered with a cpumask_t).
Linus Torvalds1da177e2005-04-16 15:20:36 -07005513 *
5514 * init_sched_build_groups will build a circular linked list of the groups
5515 * covered by the given span, and will set each group's ->cpumask correctly,
5516 * and ->cpu_power to 0.
5517 */
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005518static void
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005519init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5520 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5521 struct sched_group **sg))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005522{
5523 struct sched_group *first = NULL, *last = NULL;
5524 cpumask_t covered = CPU_MASK_NONE;
5525 int i;
5526
5527 for_each_cpu_mask(i, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005528 struct sched_group *sg;
5529 int group = group_fn(i, cpu_map, &sg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005530 int j;
5531
5532 if (cpu_isset(i, covered))
5533 continue;
5534
5535 sg->cpumask = CPU_MASK_NONE;
Eric Dumazet5517d862007-05-08 00:32:57 -07005536 sg->__cpu_power = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005537
5538 for_each_cpu_mask(j, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005539 if (group_fn(j, cpu_map, NULL) != group)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005540 continue;
5541
5542 cpu_set(j, covered);
5543 cpu_set(j, sg->cpumask);
5544 }
5545 if (!first)
5546 first = sg;
5547 if (last)
5548 last->next = sg;
5549 last = sg;
5550 }
5551 last->next = first;
5552}
5553
John Hawkes9c1cfda2005-09-06 15:18:14 -07005554#define SD_NODES_PER_DOMAIN 16
Linus Torvalds1da177e2005-04-16 15:20:36 -07005555
John Hawkes9c1cfda2005-09-06 15:18:14 -07005556#ifdef CONFIG_NUMA
akpm@osdl.org198e2f12006-01-12 01:05:30 -08005557
John Hawkes9c1cfda2005-09-06 15:18:14 -07005558/**
5559 * find_next_best_node - find the next node to include in a sched_domain
5560 * @node: node whose sched_domain we're building
5561 * @used_nodes: nodes already in the sched_domain
5562 *
5563 * Find the next node to include in a given scheduling domain. Simply
5564 * finds the closest node not already in the @used_nodes map.
5565 *
5566 * Should use nodemask_t.
5567 */
5568static int find_next_best_node(int node, unsigned long *used_nodes)
5569{
5570 int i, n, val, min_val, best_node = 0;
5571
5572 min_val = INT_MAX;
5573
5574 for (i = 0; i < MAX_NUMNODES; i++) {
5575 /* Start at @node */
5576 n = (node + i) % MAX_NUMNODES;
5577
5578 if (!nr_cpus_node(n))
5579 continue;
5580
5581 /* Skip already used nodes */
5582 if (test_bit(n, used_nodes))
5583 continue;
5584
5585 /* Simple min distance search */
5586 val = node_distance(node, n);
5587
5588 if (val < min_val) {
5589 min_val = val;
5590 best_node = n;
5591 }
5592 }
5593
5594 set_bit(best_node, used_nodes);
5595 return best_node;
5596}
5597
5598/**
5599 * sched_domain_node_span - get a cpumask for a node's sched_domain
5600 * @node: node whose cpumask we're constructing
5602 *
5603 * Given a node, construct a good cpumask for its sched_domain to span. It
5604 * should be one that prevents unnecessary balancing, but also spreads tasks
5605 * out optimally.
5606 */
5607static cpumask_t sched_domain_node_span(int node)
5608{
John Hawkes9c1cfda2005-09-06 15:18:14 -07005609 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005610 cpumask_t span, nodemask;
5611 int i;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005612
5613 cpus_clear(span);
5614 bitmap_zero(used_nodes, MAX_NUMNODES);
5615
5616 nodemask = node_to_cpumask(node);
5617 cpus_or(span, span, nodemask);
5618 set_bit(node, used_nodes);
5619
5620 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5621 int next_node = find_next_best_node(node, used_nodes);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005622
John Hawkes9c1cfda2005-09-06 15:18:14 -07005623 nodemask = node_to_cpumask(next_node);
5624 cpus_or(span, span, nodemask);
5625 }
5626
5627 return span;
5628}
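
/*
 * Example (illustrative): with SD_NODES_PER_DOMAIN set to 16 above, a
 * node's domain span is its own cpus plus the cpus of up to 15 other
 * nodes, picked one at a time by find_next_best_node() in increasing
 * node_distance() order.
 */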
5629#endif
5630
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005631int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005632
John Hawkes9c1cfda2005-09-06 15:18:14 -07005633/*
Ingo Molnar48f24c42006-07-03 00:25:40 -07005634 * SMT sched-domains:
John Hawkes9c1cfda2005-09-06 15:18:14 -07005635 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005636#ifdef CONFIG_SCHED_SMT
5637static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005638static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005639
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005640static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5641 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005642{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005643 if (sg)
5644 *sg = &per_cpu(sched_group_cpus, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005645 return cpu;
5646}
5647#endif
5648
Ingo Molnar48f24c42006-07-03 00:25:40 -07005649/*
5650 * multi-core sched-domains:
5651 */
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005652#ifdef CONFIG_SCHED_MC
5653static DEFINE_PER_CPU(struct sched_domain, core_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005654static DEFINE_PER_CPU(struct sched_group, sched_group_core);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005655#endif
5656
5657#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005658static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5659 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005660{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005661 int group;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005662 cpumask_t mask = cpu_sibling_map[cpu];
5663 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005664 group = first_cpu(mask);
5665 if (sg)
5666 *sg = &per_cpu(sched_group_core, group);
5667 return group;
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005668}
5669#elif defined(CONFIG_SCHED_MC)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005670static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5671 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005672{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005673 if (sg)
5674 *sg = &per_cpu(sched_group_core, cpu);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005675 return cpu;
5676}
5677#endif
5678
Linus Torvalds1da177e2005-04-16 15:20:36 -07005679static DEFINE_PER_CPU(struct sched_domain, phys_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005680static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005681
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005682static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5683 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005684{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005685 int group;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005686#ifdef CONFIG_SCHED_MC
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005687 cpumask_t mask = cpu_coregroup_map(cpu);
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005688 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005689 group = first_cpu(mask);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005690#elif defined(CONFIG_SCHED_SMT)
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005691 cpumask_t mask = cpu_sibling_map[cpu];
5692 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005693 group = first_cpu(mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005694#else
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005695 group = cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005696#endif
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005697 if (sg)
5698 *sg = &per_cpu(sched_group_phys, group);
5699 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005700}
5701
5702#ifdef CONFIG_NUMA
John Hawkes9c1cfda2005-09-06 15:18:14 -07005703/*
5704 * init_sched_build_groups() can't handle what we want to do with node
5705 * groups, so we roll our own here. Each node gets its own list of groups,
5706 * which is dynamically allocated.
5707 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005708static DEFINE_PER_CPU(struct sched_domain, node_domains);
John Hawkesd1b55132005-09-06 15:18:14 -07005709static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
John Hawkes9c1cfda2005-09-06 15:18:14 -07005710
5711static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005712static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005713
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005714static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5715 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005716{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005717 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5718 int group;
5719
5720 cpus_and(nodemask, nodemask, *cpu_map);
5721 group = first_cpu(nodemask);
5722
5723 if (sg)
5724 *sg = &per_cpu(sched_group_allnodes, group);
5725 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005726}
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005727
Siddha, Suresh B08069032006-03-27 01:15:23 -08005728static void init_numa_sched_groups_power(struct sched_group *group_head)
5729{
5730 struct sched_group *sg = group_head;
5731 int j;
5732
5733 if (!sg)
5734 return;
5735next_sg:
5736 for_each_cpu_mask(j, sg->cpumask) {
5737 struct sched_domain *sd;
5738
5739 sd = &per_cpu(phys_domains, j);
5740 if (j != first_cpu(sd->groups->cpumask)) {
5741 /*
5742 * Only add "power" once for each
5743 * physical package.
5744 */
5745 continue;
5746 }
5747
Eric Dumazet5517d862007-05-08 00:32:57 -07005748 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
Siddha, Suresh B08069032006-03-27 01:15:23 -08005749 }
5750 sg = sg->next;
5751 if (sg != group_head)
5752 goto next_sg;
5753}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005754#endif
5755
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005756#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005757/* Free memory allocated for various sched_group structures */
5758static void free_sched_groups(const cpumask_t *cpu_map)
5759{
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005760 int cpu, i;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005761
5762 for_each_cpu_mask(cpu, *cpu_map) {
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005763 struct sched_group **sched_group_nodes
5764 = sched_group_nodes_bycpu[cpu];
5765
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005766 if (!sched_group_nodes)
5767 continue;
5768
5769 for (i = 0; i < MAX_NUMNODES; i++) {
5770 cpumask_t nodemask = node_to_cpumask(i);
5771 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5772
5773 cpus_and(nodemask, nodemask, *cpu_map);
5774 if (cpus_empty(nodemask))
5775 continue;
5776
5777 if (sg == NULL)
5778 continue;
5779 sg = sg->next;
5780next_sg:
5781 oldsg = sg;
5782 sg = sg->next;
5783 kfree(oldsg);
5784 if (oldsg != sched_group_nodes[i])
5785 goto next_sg;
5786 }
5787 kfree(sched_group_nodes);
5788 sched_group_nodes_bycpu[cpu] = NULL;
5789 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005790}
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005791#else
5792static void free_sched_groups(const cpumask_t *cpu_map)
5793{
5794}
5795#endif
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005796
Linus Torvalds1da177e2005-04-16 15:20:36 -07005797/*
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005798 * Initialize sched groups cpu_power.
5799 *
5800 * cpu_power indicates the capacity of a sched group, which is used while
5801 * distributing the load between different sched groups in a sched domain.
5802 * Typically cpu_power for all the groups in a sched domain will be the same
5803 * unless there are asymmetries in the topology. If there are asymmetries, a
5804 * group with more cpu_power will pick up more load than a group with
5805 * less cpu_power.
5806 *
5807 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5808 * the maximum number of tasks a group can handle in the presence of other idle
5809 * or lightly loaded groups in the same sched domain.
5810 */
5811static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5812{
5813 struct sched_domain *child;
5814 struct sched_group *group;
5815
5816 WARN_ON(!sd || !sd->groups);
5817
5818 if (cpu != first_cpu(sd->groups->cpumask))
5819 return;
5820
5821 child = sd->child;
5822
Eric Dumazet5517d862007-05-08 00:32:57 -07005823 sd->groups->__cpu_power = 0;
5824
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005825 /*
5826 * For the performance policy, if the groups in the child domain share
5827 * resources (for example cores sharing some portions of the cache
5828 * hierarchy, or SMT), then set this domain's groups' cpu_power such that
5829 * each group can handle only one task when there are other idle groups
5830 * in the same sched domain.
5831 */
5832 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5833 (child->flags &
5834 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
Eric Dumazet5517d862007-05-08 00:32:57 -07005835 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005836 return;
5837 }
5838
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005839 /*
5840 * Add the cpu_power of each child group to this group's cpu_power.
5841 */
5842 group = child->groups;
5843 do {
Eric Dumazet5517d862007-05-08 00:32:57 -07005844 sg_inc_cpu_power(sd->groups, group->__cpu_power);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005845 group = group->next;
5846 } while (group != child->groups);
5847}
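
/*
 * Worked example (illustrative): for a physical-package domain whose
 * child is an SMT sibling domain (SD_SHARE_CPUPOWER set) and with
 * power-savings balancing disabled, each package group gets exactly
 * SCHED_LOAD_SCALE, i.e. capacity for one task; otherwise the group's
 * __cpu_power is the sum of its child groups' __cpu_power values.
 */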
5848
5849/*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005850 * Build sched domains for a given set of cpus and attach the sched domains
5851 * to the individual cpus.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005852 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005853static int build_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005854{
5855 int i;
John Hawkesd1b55132005-09-06 15:18:14 -07005856#ifdef CONFIG_NUMA
5857 struct sched_group **sched_group_nodes = NULL;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005858 int sd_allnodes = 0;
John Hawkesd1b55132005-09-06 15:18:14 -07005859
5860 /*
5861 * Allocate the per-node list of sched groups
5862 */
Ingo Molnardd41f592007-07-09 18:51:59 +02005863 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
Srivatsa Vaddagirid3a5aa92006-06-27 02:54:39 -07005864 GFP_KERNEL);
John Hawkesd1b55132005-09-06 15:18:14 -07005865 if (!sched_group_nodes) {
5866 printk(KERN_WARNING "Can not alloc sched group node list\n");
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005867 return -ENOMEM;
John Hawkesd1b55132005-09-06 15:18:14 -07005868 }
5869 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5870#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005871
5872 /*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005873 * Set up domains for cpus specified by the cpu_map.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005874 */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005875 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005876 struct sched_domain *sd = NULL, *p;
5877 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5878
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005879 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005880
5881#ifdef CONFIG_NUMA
Ingo Molnardd41f592007-07-09 18:51:59 +02005882 if (cpus_weight(*cpu_map) >
5883 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
John Hawkes9c1cfda2005-09-06 15:18:14 -07005884 sd = &per_cpu(allnodes_domains, i);
5885 *sd = SD_ALLNODES_INIT;
5886 sd->span = *cpu_map;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005887 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005888 p = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005889 sd_allnodes = 1;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005890 } else
5891 p = NULL;
5892
Linus Torvalds1da177e2005-04-16 15:20:36 -07005893 sd = &per_cpu(node_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005894 *sd = SD_NODE_INIT;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005895 sd->span = sched_domain_node_span(cpu_to_node(i));
5896 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005897 if (p)
5898 p->child = sd;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005899 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005900#endif
5901
5902 p = sd;
5903 sd = &per_cpu(phys_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005904 *sd = SD_CPU_INIT;
5905 sd->span = nodemask;
5906 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005907 if (p)
5908 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005909 cpu_to_phys_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005910
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005911#ifdef CONFIG_SCHED_MC
5912 p = sd;
5913 sd = &per_cpu(core_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005914 *sd = SD_MC_INIT;
5915 sd->span = cpu_coregroup_map(i);
5916 cpus_and(sd->span, sd->span, *cpu_map);
5917 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005918 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005919 cpu_to_core_group(i, cpu_map, &sd->groups);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005920#endif
5921
Linus Torvalds1da177e2005-04-16 15:20:36 -07005922#ifdef CONFIG_SCHED_SMT
5923 p = sd;
5924 sd = &per_cpu(cpu_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005925 *sd = SD_SIBLING_INIT;
5926 sd->span = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005927 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005928 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005929 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005930 cpu_to_cpu_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005931#endif
5932 }
5933
5934#ifdef CONFIG_SCHED_SMT
5935 /* Set up CPU (sibling) groups */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005936 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005937 cpumask_t this_sibling_map = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005938 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005939 if (i != first_cpu(this_sibling_map))
5940 continue;
5941
Ingo Molnardd41f592007-07-09 18:51:59 +02005942 init_sched_build_groups(this_sibling_map, cpu_map,
5943 &cpu_to_cpu_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005944 }
5945#endif
5946
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005947#ifdef CONFIG_SCHED_MC
5948 /* Set up multi-core groups */
5949 for_each_cpu_mask(i, *cpu_map) {
5950 cpumask_t this_core_map = cpu_coregroup_map(i);
5951 cpus_and(this_core_map, this_core_map, *cpu_map);
5952 if (i != first_cpu(this_core_map))
5953 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02005954 init_sched_build_groups(this_core_map, cpu_map,
5955 &cpu_to_core_group);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005956 }
5957#endif
5958
Linus Torvalds1da177e2005-04-16 15:20:36 -07005959 /* Set up physical groups */
5960 for (i = 0; i < MAX_NUMNODES; i++) {
5961 cpumask_t nodemask = node_to_cpumask(i);
5962
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005963 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005964 if (cpus_empty(nodemask))
5965 continue;
5966
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005967 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005968 }
5969
5970#ifdef CONFIG_NUMA
5971 /* Set up node groups */
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005972 if (sd_allnodes)
Ingo Molnardd41f592007-07-09 18:51:59 +02005973 init_sched_build_groups(*cpu_map, cpu_map,
5974 &cpu_to_allnodes_group);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005975
5976 for (i = 0; i < MAX_NUMNODES; i++) {
5977 /* Set up node groups */
5978 struct sched_group *sg, *prev;
5979 cpumask_t nodemask = node_to_cpumask(i);
5980 cpumask_t domainspan;
5981 cpumask_t covered = CPU_MASK_NONE;
5982 int j;
5983
5984 cpus_and(nodemask, nodemask, *cpu_map);
John Hawkesd1b55132005-09-06 15:18:14 -07005985 if (cpus_empty(nodemask)) {
5986 sched_group_nodes[i] = NULL;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005987 continue;
John Hawkesd1b55132005-09-06 15:18:14 -07005988 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005989
5990 domainspan = sched_domain_node_span(i);
5991 cpus_and(domainspan, domainspan, *cpu_map);
5992
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07005993 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005994 if (!sg) {
5995 printk(KERN_WARNING "Can not alloc domain group for "
5996 "node %d\n", i);
5997 goto error;
5998 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005999 sched_group_nodes[i] = sg;
6000 for_each_cpu_mask(j, nodemask) {
6001 struct sched_domain *sd;
6002 sd = &per_cpu(node_domains, j);
6003 sd->groups = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006004 }
Eric Dumazet5517d862007-05-08 00:32:57 -07006005 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006006 sg->cpumask = nodemask;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006007 sg->next = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006008 cpus_or(covered, covered, nodemask);
6009 prev = sg;
6010
6011 for (j = 0; j < MAX_NUMNODES; j++) {
6012 cpumask_t tmp, notcovered;
6013 int n = (i + j) % MAX_NUMNODES;
6014
6015 cpus_complement(notcovered, covered);
6016 cpus_and(tmp, notcovered, *cpu_map);
6017 cpus_and(tmp, tmp, domainspan);
6018 if (cpus_empty(tmp))
6019 break;
6020
6021 nodemask = node_to_cpumask(n);
6022 cpus_and(tmp, tmp, nodemask);
6023 if (cpus_empty(tmp))
6024 continue;
6025
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07006026 sg = kmalloc_node(sizeof(struct sched_group),
6027 GFP_KERNEL, i);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006028 if (!sg) {
6029 printk(KERN_WARNING
6030 "Can not alloc domain group for node %d\n", j);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006031 goto error;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006032 }
Eric Dumazet5517d862007-05-08 00:32:57 -07006033 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006034 sg->cpumask = tmp;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006035 sg->next = prev->next;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006036 cpus_or(covered, covered, tmp);
6037 prev->next = sg;
6038 prev = sg;
6039 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006040 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006041#endif
6042
6043 /* Calculate CPU power for physical packages and nodes */
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006044#ifdef CONFIG_SCHED_SMT
6045 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006046 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6047
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006048 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006049 }
6050#endif
6051#ifdef CONFIG_SCHED_MC
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006052 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006053 struct sched_domain *sd = &per_cpu(core_domains, i);
6054
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006055 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006056 }
6057#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006058
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006059 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006060 struct sched_domain *sd = &per_cpu(phys_domains, i);
6061
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006062 init_sched_groups_power(i, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006063 }
6064
John Hawkes9c1cfda2005-09-06 15:18:14 -07006065#ifdef CONFIG_NUMA
Siddha, Suresh B08069032006-03-27 01:15:23 -08006066 for (i = 0; i < MAX_NUMNODES; i++)
6067 init_numa_sched_groups_power(sched_group_nodes[i]);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006068
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006069 if (sd_allnodes) {
6070 struct sched_group *sg;
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07006071
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006072 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07006073 init_numa_sched_groups_power(sg);
6074 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006075#endif
6076
Linus Torvalds1da177e2005-04-16 15:20:36 -07006077 /* Attach the domains */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006078 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006079 struct sched_domain *sd;
6080#ifdef CONFIG_SCHED_SMT
6081 sd = &per_cpu(cpu_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08006082#elif defined(CONFIG_SCHED_MC)
6083 sd = &per_cpu(core_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006084#else
6085 sd = &per_cpu(phys_domains, i);
6086#endif
6087 cpu_attach_domain(sd, i);
6088 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006089
6090 return 0;
6091
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006092#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006093error:
6094 free_sched_groups(cpu_map);
6095 return -ENOMEM;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006096#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006097}
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006098/*
6099 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6100 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006101static int arch_init_sched_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006102{
6103 cpumask_t cpu_default_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006104 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006105
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006106 /*
6107 * Set up the mask of cpus without special-case scheduling requirements.
6108 * For now this just excludes isolated cpus, but could be used to
6109 * exclude other special cases in the future.
6110 */
6111 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6112
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006113 err = build_sched_domains(&cpu_default_map);
6114
6115 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006116}
6117
6118static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006119{
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006120 free_sched_groups(cpu_map);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006121}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006122
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006123/*
6124 * Detach sched domains from the group of cpus specified in cpu_map.
6125 * These cpus will now be attached to the NULL domain.
6126 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08006127static void detach_destroy_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006128{
6129 int i;
6130
6131 for_each_cpu_mask(i, *cpu_map)
6132 cpu_attach_domain(NULL, i);
6133 synchronize_sched();
6134 arch_destroy_sched_domains(cpu_map);
6135}
6136
6137/*
6138 * Partition sched domains as specified by the cpumasks below.
6139 * This attaches all cpus from the cpumasks to the NULL domain,
6140 * waits for an RCU quiescent period, recalculates the sched
6141 * domain information and then attaches them back to the
6142 * correct sched domains.
6143 * Call with the hotplug lock held.
6144 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006145int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006146{
6147 cpumask_t change_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006148 int err = 0;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006149
6150 cpus_and(*partition1, *partition1, cpu_online_map);
6151 cpus_and(*partition2, *partition2, cpu_online_map);
6152 cpus_or(change_map, *partition1, *partition2);
6153
6154 /* Detach sched domains from all of the affected cpus */
6155 detach_destroy_domains(&change_map);
6156 if (!cpus_empty(*partition1))
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006157 err = build_sched_domains(partition1);
6158 if (!err && !cpus_empty(*partition2))
6159 err = build_sched_domains(partition2);
6160
6161 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006162}
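
/*
 * Illustrative usage: the cpuset code is the expected caller here; it
 * hands in the cpu masks of two non-overlapping partitions so that each
 * one gets its own independent sched-domain hierarchy after the RCU
 * quiescent period above.
 */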
6163
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006164#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6165int arch_reinit_sched_domains(void)
6166{
6167 int err;
6168
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006169 mutex_lock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006170 detach_destroy_domains(&cpu_online_map);
6171 err = arch_init_sched_domains(&cpu_online_map);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006172 mutex_unlock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006173
6174 return err;
6175}
6176
6177static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6178{
6179 int ret;
6180
6181 if (buf[0] != '0' && buf[0] != '1')
6182 return -EINVAL;
6183
6184 if (smt)
6185 sched_smt_power_savings = (buf[0] == '1');
6186 else
6187 sched_mc_power_savings = (buf[0] == '1');
6188
6189 ret = arch_reinit_sched_domains();
6190
6191 return ret ? ret : count;
6192}
6193
6194int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6195{
6196 int err = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006197
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006198#ifdef CONFIG_SCHED_SMT
6199 if (smt_capable())
6200 err = sysfs_create_file(&cls->kset.kobj,
6201 &attr_sched_smt_power_savings.attr);
6202#endif
6203#ifdef CONFIG_SCHED_MC
6204 if (!err && mc_capable())
6205 err = sysfs_create_file(&cls->kset.kobj,
6206 &attr_sched_mc_power_savings.attr);
6207#endif
6208 return err;
6209}
6210#endif
6211
6212#ifdef CONFIG_SCHED_MC
6213static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6214{
6215 return sprintf(page, "%u\n", sched_mc_power_savings);
6216}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006217static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6218 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006219{
6220 return sched_power_savings_store(buf, count, 0);
6221}
6222SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6223 sched_mc_power_savings_store);
6224#endif
6225
6226#ifdef CONFIG_SCHED_SMT
6227static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6228{
6229 return sprintf(page, "%u\n", sched_smt_power_savings);
6230}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006231static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6232 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006233{
6234 return sched_power_savings_store(buf, count, 1);
6235}
6236SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6237 sched_smt_power_savings_store);
6238#endif
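
/*
 * Illustrative usage (assuming the attributes are created on the cpu
 * sysdev class, so they show up under /sys/devices/system/cpu/):
 *
 *	echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
 *
 * sets sched_mc_power_savings and rebuilds the domain hierarchy through
 * arch_reinit_sched_domains() above.
 */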
6239
Linus Torvalds1da177e2005-04-16 15:20:36 -07006240/*
6241 * Force a reinitialization of the sched domains hierarchy. The domains
6242 * and groups cannot be updated in place without racing with the balancing
Nick Piggin41c7ce92005-06-25 14:57:24 -07006243 * code, so we temporarily attach all running cpus to the NULL domain
Linus Torvalds1da177e2005-04-16 15:20:36 -07006244 * which will prevent rebalancing while the sched domains are recalculated.
6245 */
6246static int update_sched_domains(struct notifier_block *nfb,
6247 unsigned long action, void *hcpu)
6248{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006249 switch (action) {
6250 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006251 case CPU_UP_PREPARE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006252 case CPU_DOWN_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006253 case CPU_DOWN_PREPARE_FROZEN:
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006254 detach_destroy_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006255 return NOTIFY_OK;
6256
6257 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006258 case CPU_UP_CANCELED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006259 case CPU_DOWN_FAILED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006260 case CPU_DOWN_FAILED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006261 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006262 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006263 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006264 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006265 /*
6266 * Fall through and re-initialise the domains.
6267 */
6268 break;
6269 default:
6270 return NOTIFY_DONE;
6271 }
6272
6273 /* The hotplug lock is already held by cpu_up/cpu_down */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006274 arch_init_sched_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006275
6276 return NOTIFY_OK;
6277}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006278
6279void __init sched_init_smp(void)
6280{
Nick Piggin5c1e1762006-10-03 01:14:04 -07006281 cpumask_t non_isolated_cpus;
6282
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006283 mutex_lock(&sched_hotcpu_mutex);
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006284 arch_init_sched_domains(&cpu_online_map);
Nathan Lynche5e56732007-01-10 23:15:28 -08006285 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006286 if (cpus_empty(non_isolated_cpus))
6287 cpu_set(smp_processor_id(), non_isolated_cpus);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006288 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006289 /* XXX: Theoretical race here - CPU may be hotplugged now */
6290 hotcpu_notifier(update_sched_domains, 0);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006291
6292 /* Move init over to a non-isolated CPU */
6293 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6294 BUG();
Ingo Molnardd41f592007-07-09 18:51:59 +02006295 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006296}
6297#else
6298void __init sched_init_smp(void)
6299{
Ingo Molnardd41f592007-07-09 18:51:59 +02006300 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006301}
6302#endif /* CONFIG_SMP */
6303
6304int in_sched_functions(unsigned long addr)
6305{
6306 /* Linker adds these: start and end of __sched functions */
6307 extern char __sched_text_start[], __sched_text_end[];
Ingo Molnar48f24c42006-07-03 00:25:40 -07006308
Linus Torvalds1da177e2005-04-16 15:20:36 -07006309 return in_lock_functions(addr) ||
6310 (addr >= (unsigned long)__sched_text_start
6311 && addr < (unsigned long)__sched_text_end);
6312}
6313
Ingo Molnardd41f592007-07-09 18:51:59 +02006314static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6315{
6316 cfs_rq->tasks_timeline = RB_ROOT;
6317 cfs_rq->fair_clock = 1;
6318#ifdef CONFIG_FAIR_GROUP_SCHED
6319 cfs_rq->rq = rq;
6320#endif
6321}
6322
Linus Torvalds1da177e2005-04-16 15:20:36 -07006323void __init sched_init(void)
6324{
Ingo Molnardd41f592007-07-09 18:51:59 +02006325 u64 now = sched_clock();
Christoph Lameter476f3532007-05-06 14:48:58 -07006326 int highest_cpu = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006327 int i, j;
6328
6329 /*
6330 * Link up the scheduling class hierarchy:
6331 */
6332 rt_sched_class.next = &fair_sched_class;
6333 fair_sched_class.next = &idle_sched_class;
6334 idle_sched_class.next = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006335
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08006336 for_each_possible_cpu(i) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006337 struct rt_prio_array *array;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006338 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006339
6340 rq = cpu_rq(i);
6341 spin_lock_init(&rq->lock);
Ingo Molnarfcb99372006-07-03 00:25:10 -07006342 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
Nick Piggin78979862005-06-25 14:57:13 -07006343 rq->nr_running = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006344 rq->clock = 1;
6345 init_cfs_rq(&rq->cfs, rq);
6346#ifdef CONFIG_FAIR_GROUP_SCHED
6347 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6348 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6349#endif
6350 rq->ls.load_update_last = now;
6351 rq->ls.load_update_start = now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006352
Ingo Molnardd41f592007-07-09 18:51:59 +02006353 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6354 rq->cpu_load[j] = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006355#ifdef CONFIG_SMP
Nick Piggin41c7ce92005-06-25 14:57:24 -07006356 rq->sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006357 rq->active_balance = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006358 rq->next_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006359 rq->push_cpu = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07006360 rq->cpu = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006361 rq->migration_thread = NULL;
6362 INIT_LIST_HEAD(&rq->migration_queue);
6363#endif
6364 atomic_set(&rq->nr_iowait, 0);
6365
Ingo Molnardd41f592007-07-09 18:51:59 +02006366 array = &rq->rt.active;
6367 for (j = 0; j < MAX_RT_PRIO; j++) {
6368 INIT_LIST_HEAD(array->queue + j);
6369 __clear_bit(j, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006370 }
Christoph Lameter476f3532007-05-06 14:48:58 -07006371 highest_cpu = i;
Ingo Molnardd41f592007-07-09 18:51:59 +02006372 /* delimiter for bitsearch: */
6373 __set_bit(MAX_RT_PRIO, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006374 }
6375
Peter Williams2dd73a42006-06-27 02:54:34 -07006376 set_load_weight(&init_task);
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006377
Christoph Lameterc9819f42006-12-10 02:20:25 -08006378#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07006379 nr_cpu_ids = highest_cpu + 1;
Christoph Lameterc9819f42006-12-10 02:20:25 -08006380 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6381#endif
6382
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006383#ifdef CONFIG_RT_MUTEXES
6384 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6385#endif
6386
Linus Torvalds1da177e2005-04-16 15:20:36 -07006387 /*
6388 * The boot idle thread does lazy MMU switching as well:
6389 */
6390 atomic_inc(&init_mm.mm_count);
6391 enter_lazy_tlb(&init_mm, current);
6392
6393 /*
6394 * Make us the idle thread. Technically, schedule() should not be
6395 * called from this thread; however, somewhere below it might be,
6396 * but because we are the idle thread, we just pick up running again
6397 * when this runqueue becomes "idle".
6398 */
6399 init_idle(current, smp_processor_id());
Ingo Molnardd41f592007-07-09 18:51:59 +02006400 /*
6401 * During early bootup we pretend to be a normal task:
6402 */
6403 current->sched_class = &fair_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006404}
6405
6406#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6407void __might_sleep(char *file, int line)
6408{
Ingo Molnar48f24c42006-07-03 00:25:40 -07006409#ifdef in_atomic
Linus Torvalds1da177e2005-04-16 15:20:36 -07006410 static unsigned long prev_jiffy; /* ratelimiting */
6411
6412 if ((in_atomic() || irqs_disabled()) &&
6413 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6414 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6415 return;
6416 prev_jiffy = jiffies;
Ingo Molnar91368d72006-03-23 03:00:54 -08006417 printk(KERN_ERR "BUG: sleeping function called from invalid"
Linus Torvalds1da177e2005-04-16 15:20:36 -07006418 " context at %s:%d\n", file, line);
6419 printk("in_atomic():%d, irqs_disabled():%d\n",
6420 in_atomic(), irqs_disabled());
Peter Zijlstraa4c410f2006-12-06 20:37:21 -08006421 debug_show_held_locks(current);
Ingo Molnar3117df02006-12-13 00:34:43 -08006422 if (irqs_disabled())
6423 print_irqtrace_events(current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006424 dump_stack();
6425 }
6426#endif
6427}
6428EXPORT_SYMBOL(__might_sleep);
6429#endif
6430
6431#ifdef CONFIG_MAGIC_SYSRQ
6432void normalize_rt_tasks(void)
6433{
Ingo Molnara0f98a12007-06-17 18:37:45 +02006434 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006435 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006436 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02006437 int on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006438
6439 read_lock_irq(&tasklist_lock);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006440 do_each_thread(g, p) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006441 p->se.fair_key = 0;
6442 p->se.wait_runtime = 0;
6443 p->se.wait_start_fair = 0;
6444 p->se.wait_start = 0;
6445 p->se.exec_start = 0;
6446 p->se.sleep_start = 0;
6447 p->se.sleep_start_fair = 0;
6448 p->se.block_start = 0;
6449 task_rq(p)->cfs.fair_clock = 0;
6450 task_rq(p)->clock = 0;
6451
6452 if (!rt_task(p)) {
6453 /*
6454 * Renice negative nice level userspace
6455 * tasks back to 0:
6456 */
6457 if (TASK_NICE(p) < 0 && p->mm)
6458 set_user_nice(p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006459 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02006460 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006461
Ingo Molnarb29739f2006-06-27 02:54:51 -07006462 spin_lock_irqsave(&p->pi_lock, flags);
6463 rq = __task_rq_lock(p);
Ingo Molnardd41f592007-07-09 18:51:59 +02006464#ifdef CONFIG_SMP
6465 /*
6466 * Do not touch the migration thread:
6467 */
6468 if (p == rq->migration_thread)
6469 goto out_unlock;
6470#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006471
Ingo Molnardd41f592007-07-09 18:51:59 +02006472 on_rq = p->se.on_rq;
6473 if (on_rq)
6474 deactivate_task(task_rq(p), p, 0);
6475 __setscheduler(rq, p, SCHED_NORMAL, 0);
6476 if (on_rq) {
6477 activate_task(task_rq(p), p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006478 resched_task(rq->curr);
6479 }
Ingo Molnardd41f592007-07-09 18:51:59 +02006480#ifdef CONFIG_SMP
6481 out_unlock:
6482#endif
Ingo Molnarb29739f2006-06-27 02:54:51 -07006483 __task_rq_unlock(rq);
6484 spin_unlock_irqrestore(&p->pi_lock, flags);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006485 } while_each_thread(g, p);
6486
Linus Torvalds1da177e2005-04-16 15:20:36 -07006487 read_unlock_irq(&tasklist_lock);
6488}
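
/*
 * Illustrative note: this is normally reached via the magic SysRq 'n'
 * key (see drivers/char/sysrq.c), giving an operator a way to knock
 * runaway realtime tasks back to SCHED_NORMAL without a reboot.
 */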
6489
6490#endif /* CONFIG_MAGIC_SYSRQ */
Linus Torvalds1df5c102005-09-12 07:59:21 -07006491
6492#ifdef CONFIG_IA64
6493/*
6494 * These functions are only useful for the IA64 MCA handling.
6495 *
6496 * They can only be called when the whole system has been
6497 * stopped - every CPU needs to be quiescent, and no scheduling
6498 * activity can take place. Using them for anything else would
6499 * be a serious bug, and as a result, they aren't even visible
6500 * under any other configuration.
6501 */
6502
6503/**
6504 * curr_task - return the current task for a given cpu.
6505 * @cpu: the processor in question.
6506 *
6507 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6508 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006509struct task_struct *curr_task(int cpu)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006510{
6511 return cpu_curr(cpu);
6512}
6513
6514/**
6515 * set_curr_task - set the current task for a given cpu.
6516 * @cpu: the processor in question.
6517 * @p: the task pointer to set.
6518 *
6519 * Description: This function must only be used when non-maskable interrupts
6520 * are serviced on a separate stack. It allows the architecture to switch the
6521 * notion of the current task on a cpu in a non-blocking manner. This function
6522 * must be called with all CPUs synchronized and interrupts disabled; the
6523 * caller must save the original value of the current task (see
6524 * curr_task() above) and restore that value before reenabling interrupts and
6525 * re-starting the system.
6526 *
6527 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6528 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006529void set_curr_task(int cpu, struct task_struct *p)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006530{
6531 cpu_curr(cpu) = p;
6532}
6533
6534#endif