sched: add rt-overload tracking
[linux-2.6.git] / kernel / sched_rt.c
1 /*
2  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3  * policies)
4  */
5
6 #ifdef CONFIG_SMP
7 static cpumask_t rt_overload_mask;
8 static atomic_t rto_count;
9 static inline int rt_overloaded(void)
10 {
11         return atomic_read(&rto_count);
12 }
13 static inline cpumask_t *rt_overload(void)
14 {
15         return &rt_overload_mask;
16 }
17 static inline void rt_set_overload(struct rq *rq)
18 {
19         cpu_set(rq->cpu, rt_overload_mask);
20         /*
21          * Make sure the mask is visible before we set
22          * the overload count. That is checked to determine
23          * if we should look at the mask. It would be a shame
24          * if we looked at the mask, but the mask was not
25          * updated yet.
26          */
27         wmb();
28         atomic_inc(&rto_count);
29 }
30 static inline void rt_clear_overload(struct rq *rq)
31 {
32         /* the order here really doesn't matter */
33         atomic_dec(&rto_count);
34         cpu_clear(rq->cpu, rt_overload_mask);
35 }
36 #endif /* CONFIG_SMP */
37
38 /*
39  * Update the current task's runtime statistics. Skip current tasks that
40  * are not in our scheduling class.
41  */
42 static void update_curr_rt(struct rq *rq)
43 {
44         struct task_struct *curr = rq->curr;
45         u64 delta_exec;
46
47         if (!task_has_rt_policy(curr))
48                 return;
49
50         delta_exec = rq->clock - curr->se.exec_start;
51         if (unlikely((s64)delta_exec < 0))
52                 delta_exec = 0;
53
54         schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
55
56         curr->se.sum_exec_runtime += delta_exec;
57         curr->se.exec_start = rq->clock;
58         cpuacct_charge(curr, delta_exec);
59 }
60
61 static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
62 {
63         WARN_ON(!rt_task(p));
64         rq->rt.rt_nr_running++;
65 #ifdef CONFIG_SMP
66         if (p->prio < rq->rt.highest_prio)
67                 rq->rt.highest_prio = p->prio;
68         if (rq->rt.rt_nr_running > 1)
69                 rt_set_overload(rq);
70 #endif /* CONFIG_SMP */
71 }
72
73 static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
74 {
75         WARN_ON(!rt_task(p));
76         WARN_ON(!rq->rt.rt_nr_running);
77         rq->rt.rt_nr_running--;
78 #ifdef CONFIG_SMP
79         if (rq->rt.rt_nr_running) {
80                 struct rt_prio_array *array;
81
82                 WARN_ON(p->prio < rq->rt.highest_prio);
83                 if (p->prio == rq->rt.highest_prio) {
84                         /* recalculate */
85                         array = &rq->rt.active;
86                         rq->rt.highest_prio =
87                                 sched_find_first_bit(array->bitmap);
88                 } /* otherwise leave rq->highest prio alone */
89         } else
90                 rq->rt.highest_prio = MAX_RT_PRIO;
91         if (rq->rt.rt_nr_running < 2)
92                 rt_clear_overload(rq);
93 #endif /* CONFIG_SMP */
94 }
95
96 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
97 {
98         struct rt_prio_array *array = &rq->rt.active;
99
100         list_add_tail(&p->run_list, array->queue + p->prio);
101         __set_bit(p->prio, array->bitmap);
102         inc_cpu_load(rq, p->se.load.weight);
103
104         inc_rt_tasks(p, rq);
105 }
106
107 /*
108  * Adding/removing a task to/from a priority array:
109  */
110 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
111 {
112         struct rt_prio_array *array = &rq->rt.active;
113
114         update_curr_rt(rq);
115
116         list_del(&p->run_list);
117         if (list_empty(array->queue + p->prio))
118                 __clear_bit(p->prio, array->bitmap);
119         dec_cpu_load(rq, p->se.load.weight);
120
121         dec_rt_tasks(p, rq);
122 }
123
124 /*
125  * Put task to the end of the run list without the overhead of dequeue
126  * followed by enqueue.
127  */
128 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
129 {
130         struct rt_prio_array *array = &rq->rt.active;
131
132         list_move_tail(&p->run_list, array->queue + p->prio);
133 }
134
135 static void
136 yield_task_rt(struct rq *rq)
137 {
138         requeue_task_rt(rq, rq->curr);
139 }
140
141 /*
142  * Preempt the current task with a newly woken task if needed:
143  */
144 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
145 {
146         if (p->prio < rq->curr->prio)
147                 resched_task(rq->curr);
148 }
149
150 static struct task_struct *pick_next_task_rt(struct rq *rq)
151 {
152         struct rt_prio_array *array = &rq->rt.active;
153         struct task_struct *next;
154         struct list_head *queue;
155         int idx;
156
157         idx = sched_find_first_bit(array->bitmap);
158         if (idx >= MAX_RT_PRIO)
159                 return NULL;
160
161         queue = array->queue + idx;
162         next = list_entry(queue->next, struct task_struct, run_list);
163
164         next->se.exec_start = rq->clock;
165
166         return next;
167 }
168
169 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
170 {
171         update_curr_rt(rq);
172         p->se.exec_start = 0;
173 }
174
175 #ifdef CONFIG_SMP
176 /* Only try algorithms three times */
177 #define RT_MAX_TRIES 3
178
179 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
180 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
181
182 /* Return the second highest RT task, NULL otherwise */
183 static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
184 {
185         struct rt_prio_array *array = &rq->rt.active;
186         struct task_struct *next;
187         struct list_head *queue;
188         int idx;
189
190         assert_spin_locked(&rq->lock);
191
192         if (likely(rq->rt.rt_nr_running < 2))
193                 return NULL;
194
195         idx = sched_find_first_bit(array->bitmap);
196         if (unlikely(idx >= MAX_RT_PRIO)) {
197                 WARN_ON(1); /* rt_nr_running is bad */
198                 return NULL;
199         }
200
201         queue = array->queue + idx;
202         next = list_entry(queue->next, struct task_struct, run_list);
203         if (unlikely(next != rq->curr))
204                 return next;
205
206         if (queue->next->next != queue) {
207                 /* same prio task */
208                 next = list_entry(queue->next->next, struct task_struct, run_list);
209                 return next;
210         }
211
212         /* slower, but more flexible */
213         idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
214         if (unlikely(idx >= MAX_RT_PRIO)) {
215                 WARN_ON(1); /* rt_nr_running was 2 and above! */
216                 return NULL;
217         }
218
219         queue = array->queue + idx;
220         next = list_entry(queue->next, struct task_struct, run_list);
221
222         return next;
223 }
224
225 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
226
227 /* Will lock the rq it finds */
228 static struct rq *find_lock_lowest_rq(struct task_struct *task,
229                                       struct rq *this_rq)
230 {
231         struct rq *lowest_rq = NULL;
232         int cpu;
233         int tries;
234         cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
235
236         cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
237
238         for (tries = 0; tries < RT_MAX_TRIES; tries++) {
239                 /*
240                  * Scan each rq for the lowest prio.
241                  */
242                 for_each_cpu_mask(cpu, *cpu_mask) {
243                         struct rq *rq = &per_cpu(runqueues, cpu);
244
245                         if (cpu == this_rq->cpu)
246                                 continue;
247
248                         /* We look for lowest RT prio or non-rt CPU */
249                         if (rq->rt.highest_prio >= MAX_RT_PRIO) {
250                                 lowest_rq = rq;
251                                 break;
252                         }
253
254                         /* no locking for now */
255                         if (rq->rt.highest_prio > task->prio &&
256                             (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
257                                 lowest_rq = rq;
258                         }
259                 }
260
261                 if (!lowest_rq)
262                         break;
263
264                 /* if the prio of this runqueue changed, try again */
265                 if (double_lock_balance(this_rq, lowest_rq)) {
266                         /*
267                          * We had to unlock the run queue. In
268                          * the mean time, task could have
269                          * migrated already or had its affinity changed.
270                          * Also make sure that it wasn't scheduled on its rq.
271                          */
272                         if (unlikely(task_rq(task) != this_rq ||
273                                      !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
274                                      task_running(this_rq, task) ||
275                                      !task->se.on_rq)) {
276                                 spin_unlock(&lowest_rq->lock);
277                                 lowest_rq = NULL;
278                                 break;
279                         }
280                 }
281
282                 /* If this rq is still suitable use it. */
283                 if (lowest_rq->rt.highest_prio > task->prio)
284                         break;
285
286                 /* try again */
287                 spin_unlock(&lowest_rq->lock);
288                 lowest_rq = NULL;
289         }
290
291         return lowest_rq;
292 }
293
294 /*
295  * If the current CPU has more than one RT task, see if the non
296  * running task can migrate over to a CPU that is running a task
297  * of lesser priority.
298  */
299 static int push_rt_task(struct rq *this_rq)
300 {
301         struct task_struct *next_task;
302         struct rq *lowest_rq;
303         int ret = 0;
304         int paranoid = RT_MAX_TRIES;
305
306         assert_spin_locked(&this_rq->lock);
307
308         next_task = pick_next_highest_task_rt(this_rq);
309         if (!next_task)
310                 return 0;
311
312  retry:
313         if (unlikely(next_task == this_rq->curr))
314                 return 0;
315
316         /*
317          * It's possible that the next_task slipped in of
318          * higher priority than current. If that's the case
319          * just reschedule current.
320          */
321         if (unlikely(next_task->prio < this_rq->curr->prio)) {
322                 resched_task(this_rq->curr);
323                 return 0;
324         }
325
326         /* We might release this_rq lock */
327         get_task_struct(next_task);
328
329         /* find_lock_lowest_rq locks the rq if found */
330         lowest_rq = find_lock_lowest_rq(next_task, this_rq);
331         if (!lowest_rq) {
332                 struct task_struct *task;
333                 /*
334                  * find lock_lowest_rq releases this_rq->lock
335                  * so it is possible that next_task has changed.
336                  * If it has, then try again.
337                  */
338                 task = pick_next_highest_task_rt(this_rq);
339                 if (unlikely(task != next_task) && task && paranoid--) {
340                         put_task_struct(next_task);
341                         next_task = task;
342                         goto retry;
343                 }
344                 goto out;
345         }
346
347         assert_spin_locked(&lowest_rq->lock);
348
349         deactivate_task(this_rq, next_task, 0);
350         set_task_cpu(next_task, lowest_rq->cpu);
351         activate_task(lowest_rq, next_task, 0);
352
353         resched_task(lowest_rq->curr);
354
355         spin_unlock(&lowest_rq->lock);
356
357         ret = 1;
358 out:
359         put_task_struct(next_task);
360
361         return ret;
362 }
363
364 /*
365  * TODO: Currently we just use the second highest prio task on
366  *       the queue, and stop when it can't migrate (or there's
367  *       no more RT tasks).  There may be a case where a lower
368  *       priority RT task has a different affinity than the
369  *       higher RT task. In this case the lower RT task could
370  *       possibly be able to migrate where as the higher priority
371  *       RT task could not.  We currently ignore this issue.
372  *       Enhancements are welcome!
373  */
374 static void push_rt_tasks(struct rq *rq)
375 {
376         /* push_rt_task will return true if it moved an RT */
377         while (push_rt_task(rq))
378                 ;
379 }
380
381 static void schedule_tail_balance_rt(struct rq *rq)
382 {
383         /*
384          * If we have more than one rt_task queued, then
385          * see if we can push the other rt_tasks off to other CPUS.
386          * Note we may release the rq lock, and since
387          * the lock was owned by prev, we need to release it
388          * first via finish_lock_switch and then reaquire it here.
389          */
390         if (unlikely(rq->rt.rt_nr_running > 1)) {
391                 spin_lock_irq(&rq->lock);
392                 push_rt_tasks(rq);
393                 spin_unlock_irq(&rq->lock);
394         }
395 }
396
397 /*
398  * Load-balancing iterator. Note: while the runqueue stays locked
399  * during the whole iteration, the current task might be
400  * dequeued so the iterator has to be dequeue-safe. Here we
401  * achieve that by always pre-iterating before returning
402  * the current task:
403  */
404 static struct task_struct *load_balance_start_rt(void *arg)
405 {
406         struct rq *rq = arg;
407         struct rt_prio_array *array = &rq->rt.active;
408         struct list_head *head, *curr;
409         struct task_struct *p;
410         int idx;
411
412         idx = sched_find_first_bit(array->bitmap);
413         if (idx >= MAX_RT_PRIO)
414                 return NULL;
415
416         head = array->queue + idx;
417         curr = head->prev;
418
419         p = list_entry(curr, struct task_struct, run_list);
420
421         curr = curr->prev;
422
423         rq->rt.rt_load_balance_idx = idx;
424         rq->rt.rt_load_balance_head = head;
425         rq->rt.rt_load_balance_curr = curr;
426
427         return p;
428 }
429
430 static struct task_struct *load_balance_next_rt(void *arg)
431 {
432         struct rq *rq = arg;
433         struct rt_prio_array *array = &rq->rt.active;
434         struct list_head *head, *curr;
435         struct task_struct *p;
436         int idx;
437
438         idx = rq->rt.rt_load_balance_idx;
439         head = rq->rt.rt_load_balance_head;
440         curr = rq->rt.rt_load_balance_curr;
441
442         /*
443          * If we arrived back to the head again then
444          * iterate to the next queue (if any):
445          */
446         if (unlikely(head == curr)) {
447                 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
448
449                 if (next_idx >= MAX_RT_PRIO)
450                         return NULL;
451
452                 idx = next_idx;
453                 head = array->queue + idx;
454                 curr = head->prev;
455
456                 rq->rt.rt_load_balance_idx = idx;
457                 rq->rt.rt_load_balance_head = head;
458         }
459
460         p = list_entry(curr, struct task_struct, run_list);
461
462         curr = curr->prev;
463
464         rq->rt.rt_load_balance_curr = curr;
465
466         return p;
467 }
468
469 static unsigned long
470 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
471                 unsigned long max_load_move,
472                 struct sched_domain *sd, enum cpu_idle_type idle,
473                 int *all_pinned, int *this_best_prio)
474 {
475         struct rq_iterator rt_rq_iterator;
476
477         rt_rq_iterator.start = load_balance_start_rt;
478         rt_rq_iterator.next = load_balance_next_rt;
479         /* pass 'busiest' rq argument into
480          * load_balance_[start|next]_rt iterators
481          */
482         rt_rq_iterator.arg = busiest;
483
484         return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
485                              idle, all_pinned, this_best_prio, &rt_rq_iterator);
486 }
487
488 static int
489 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
490                  struct sched_domain *sd, enum cpu_idle_type idle)
491 {
492         struct rq_iterator rt_rq_iterator;
493
494         rt_rq_iterator.start = load_balance_start_rt;
495         rt_rq_iterator.next = load_balance_next_rt;
496         rt_rq_iterator.arg = busiest;
497
498         return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
499                                   &rt_rq_iterator);
500 }
501 #else /* CONFIG_SMP */
502 # define schedule_tail_balance_rt(rq)   do { } while (0)
503 #endif /* CONFIG_SMP */
504
505 static void task_tick_rt(struct rq *rq, struct task_struct *p)
506 {
507         update_curr_rt(rq);
508
509         /*
510          * RR tasks need a special form of timeslice management.
511          * FIFO tasks have no timeslices.
512          */
513         if (p->policy != SCHED_RR)
514                 return;
515
516         if (--p->time_slice)
517                 return;
518
519         p->time_slice = DEF_TIMESLICE;
520
521         /*
522          * Requeue to the end of queue if we are not the only element
523          * on the queue:
524          */
525         if (p->run_list.prev != p->run_list.next) {
526                 requeue_task_rt(rq, p);
527                 set_tsk_need_resched(p);
528         }
529 }
530
531 static void set_curr_task_rt(struct rq *rq)
532 {
533         struct task_struct *p = rq->curr;
534
535         p->se.exec_start = rq->clock;
536 }
537
538 const struct sched_class rt_sched_class = {
539         .next                   = &fair_sched_class,
540         .enqueue_task           = enqueue_task_rt,
541         .dequeue_task           = dequeue_task_rt,
542         .yield_task             = yield_task_rt,
543
544         .check_preempt_curr     = check_preempt_curr_rt,
545
546         .pick_next_task         = pick_next_task_rt,
547         .put_prev_task          = put_prev_task_rt,
548
549 #ifdef CONFIG_SMP
550         .load_balance           = load_balance_rt,
551         .move_one_task          = move_one_task_rt,
552 #endif
553
554         .set_curr_task          = set_curr_task_rt,
555         .task_tick              = task_tick_rt,
556 };