sched: add RT task pushing
[linux-2.6.git] / kernel / sched_rt.c
1 /*
2  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3  * policies)
4  */
5
6 /*
7  * Update the current task's runtime statistics. Skip current tasks that
8  * are not in our scheduling class.
9  */
10 static void update_curr_rt(struct rq *rq)
11 {
12         struct task_struct *curr = rq->curr;
13         u64 delta_exec;
14
15         if (!task_has_rt_policy(curr))
16                 return;
17
18         delta_exec = rq->clock - curr->se.exec_start;
19         if (unlikely((s64)delta_exec < 0))
20                 delta_exec = 0;
21
22         schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
23
24         curr->se.sum_exec_runtime += delta_exec;
25         curr->se.exec_start = rq->clock;
26         cpuacct_charge(curr, delta_exec);
27 }
28
29 static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
30 {
31         WARN_ON(!rt_task(p));
32         rq->rt.rt_nr_running++;
33 #ifdef CONFIG_SMP
34         if (p->prio < rq->rt.highest_prio)
35                 rq->rt.highest_prio = p->prio;
36 #endif /* CONFIG_SMP */
37 }
38
39 static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
40 {
41         WARN_ON(!rt_task(p));
42         WARN_ON(!rq->rt.rt_nr_running);
43         rq->rt.rt_nr_running--;
44 #ifdef CONFIG_SMP
45         if (rq->rt.rt_nr_running) {
46                 struct rt_prio_array *array;
47
48                 WARN_ON(p->prio < rq->rt.highest_prio);
49                 if (p->prio == rq->rt.highest_prio) {
50                         /* recalculate */
51                         array = &rq->rt.active;
52                         rq->rt.highest_prio =
53                                 sched_find_first_bit(array->bitmap);
54                 } /* otherwise leave rq->highest prio alone */
55         } else
56                 rq->rt.highest_prio = MAX_RT_PRIO;
57 #endif /* CONFIG_SMP */
58 }
59
60 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
61 {
62         struct rt_prio_array *array = &rq->rt.active;
63
64         list_add_tail(&p->run_list, array->queue + p->prio);
65         __set_bit(p->prio, array->bitmap);
66         inc_cpu_load(rq, p->se.load.weight);
67
68         inc_rt_tasks(p, rq);
69 }
70
71 /*
72  * Adding/removing a task to/from a priority array:
73  */
74 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
75 {
76         struct rt_prio_array *array = &rq->rt.active;
77
78         update_curr_rt(rq);
79
80         list_del(&p->run_list);
81         if (list_empty(array->queue + p->prio))
82                 __clear_bit(p->prio, array->bitmap);
83         dec_cpu_load(rq, p->se.load.weight);
84
85         dec_rt_tasks(p, rq);
86 }
87
88 /*
89  * Put task to the end of the run list without the overhead of dequeue
90  * followed by enqueue.
91  */
92 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
93 {
94         struct rt_prio_array *array = &rq->rt.active;
95
96         list_move_tail(&p->run_list, array->queue + p->prio);
97 }
98
99 static void
100 yield_task_rt(struct rq *rq)
101 {
102         requeue_task_rt(rq, rq->curr);
103 }
104
105 /*
106  * Preempt the current task with a newly woken task if needed:
107  */
108 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
109 {
110         if (p->prio < rq->curr->prio)
111                 resched_task(rq->curr);
112 }
113
114 static struct task_struct *pick_next_task_rt(struct rq *rq)
115 {
116         struct rt_prio_array *array = &rq->rt.active;
117         struct task_struct *next;
118         struct list_head *queue;
119         int idx;
120
121         idx = sched_find_first_bit(array->bitmap);
122         if (idx >= MAX_RT_PRIO)
123                 return NULL;
124
125         queue = array->queue + idx;
126         next = list_entry(queue->next, struct task_struct, run_list);
127
128         next->se.exec_start = rq->clock;
129
130         return next;
131 }
132
133 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
134 {
135         update_curr_rt(rq);
136         p->se.exec_start = 0;
137 }
138
139 #ifdef CONFIG_SMP
140 /* Only try algorithms three times */
141 #define RT_MAX_TRIES 3
142
143 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
144 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
145
146 /* Return the second highest RT task, NULL otherwise */
147 static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
148 {
149         struct rt_prio_array *array = &rq->rt.active;
150         struct task_struct *next;
151         struct list_head *queue;
152         int idx;
153
154         assert_spin_locked(&rq->lock);
155
156         if (likely(rq->rt.rt_nr_running < 2))
157                 return NULL;
158
159         idx = sched_find_first_bit(array->bitmap);
160         if (unlikely(idx >= MAX_RT_PRIO)) {
161                 WARN_ON(1); /* rt_nr_running is bad */
162                 return NULL;
163         }
164
165         queue = array->queue + idx;
166         next = list_entry(queue->next, struct task_struct, run_list);
167         if (unlikely(next != rq->curr))
168                 return next;
169
170         if (queue->next->next != queue) {
171                 /* same prio task */
172                 next = list_entry(queue->next->next, struct task_struct, run_list);
173                 return next;
174         }
175
176         /* slower, but more flexible */
177         idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
178         if (unlikely(idx >= MAX_RT_PRIO)) {
179                 WARN_ON(1); /* rt_nr_running was 2 and above! */
180                 return NULL;
181         }
182
183         queue = array->queue + idx;
184         next = list_entry(queue->next, struct task_struct, run_list);
185
186         return next;
187 }
188
189 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
190
191 /* Will lock the rq it finds */
192 static struct rq *find_lock_lowest_rq(struct task_struct *task,
193                                       struct rq *this_rq)
194 {
195         struct rq *lowest_rq = NULL;
196         int cpu;
197         int tries;
198         cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
199
200         cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
201
202         for (tries = 0; tries < RT_MAX_TRIES; tries++) {
203                 /*
204                  * Scan each rq for the lowest prio.
205                  */
206                 for_each_cpu_mask(cpu, *cpu_mask) {
207                         struct rq *rq = &per_cpu(runqueues, cpu);
208
209                         if (cpu == this_rq->cpu)
210                                 continue;
211
212                         /* We look for lowest RT prio or non-rt CPU */
213                         if (rq->rt.highest_prio >= MAX_RT_PRIO) {
214                                 lowest_rq = rq;
215                                 break;
216                         }
217
218                         /* no locking for now */
219                         if (rq->rt.highest_prio > task->prio &&
220                             (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
221                                 lowest_rq = rq;
222                         }
223                 }
224
225                 if (!lowest_rq)
226                         break;
227
228                 /* if the prio of this runqueue changed, try again */
229                 if (double_lock_balance(this_rq, lowest_rq)) {
230                         /*
231                          * We had to unlock the run queue. In
232                          * the mean time, task could have
233                          * migrated already or had its affinity changed.
234                          * Also make sure that it wasn't scheduled on its rq.
235                          */
236                         if (unlikely(task_rq(task) != this_rq ||
237                                      !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
238                                      task_running(this_rq, task) ||
239                                      !task->se.on_rq)) {
240                                 spin_unlock(&lowest_rq->lock);
241                                 lowest_rq = NULL;
242                                 break;
243                         }
244                 }
245
246                 /* If this rq is still suitable use it. */
247                 if (lowest_rq->rt.highest_prio > task->prio)
248                         break;
249
250                 /* try again */
251                 spin_unlock(&lowest_rq->lock);
252                 lowest_rq = NULL;
253         }
254
255         return lowest_rq;
256 }
257
258 /*
259  * If the current CPU has more than one RT task, see if the non
260  * running task can migrate over to a CPU that is running a task
261  * of lesser priority.
262  */
263 static int push_rt_task(struct rq *this_rq)
264 {
265         struct task_struct *next_task;
266         struct rq *lowest_rq;
267         int ret = 0;
268         int paranoid = RT_MAX_TRIES;
269
270         assert_spin_locked(&this_rq->lock);
271
272         next_task = pick_next_highest_task_rt(this_rq);
273         if (!next_task)
274                 return 0;
275
276  retry:
277         if (unlikely(next_task == this_rq->curr))
278                 return 0;
279
280         /*
281          * It's possible that the next_task slipped in of
282          * higher priority than current. If that's the case
283          * just reschedule current.
284          */
285         if (unlikely(next_task->prio < this_rq->curr->prio)) {
286                 resched_task(this_rq->curr);
287                 return 0;
288         }
289
290         /* We might release this_rq lock */
291         get_task_struct(next_task);
292
293         /* find_lock_lowest_rq locks the rq if found */
294         lowest_rq = find_lock_lowest_rq(next_task, this_rq);
295         if (!lowest_rq) {
296                 struct task_struct *task;
297                 /*
298                  * find lock_lowest_rq releases this_rq->lock
299                  * so it is possible that next_task has changed.
300                  * If it has, then try again.
301                  */
302                 task = pick_next_highest_task_rt(this_rq);
303                 if (unlikely(task != next_task) && task && paranoid--) {
304                         put_task_struct(next_task);
305                         next_task = task;
306                         goto retry;
307                 }
308                 goto out;
309         }
310
311         assert_spin_locked(&lowest_rq->lock);
312
313         deactivate_task(this_rq, next_task, 0);
314         set_task_cpu(next_task, lowest_rq->cpu);
315         activate_task(lowest_rq, next_task, 0);
316
317         resched_task(lowest_rq->curr);
318
319         spin_unlock(&lowest_rq->lock);
320
321         ret = 1;
322 out:
323         put_task_struct(next_task);
324
325         return ret;
326 }
327
328 /*
329  * TODO: Currently we just use the second highest prio task on
330  *       the queue, and stop when it can't migrate (or there's
331  *       no more RT tasks).  There may be a case where a lower
332  *       priority RT task has a different affinity than the
333  *       higher RT task. In this case the lower RT task could
334  *       possibly be able to migrate where as the higher priority
335  *       RT task could not.  We currently ignore this issue.
336  *       Enhancements are welcome!
337  */
338 static void push_rt_tasks(struct rq *rq)
339 {
340         /* push_rt_task will return true if it moved an RT */
341         while (push_rt_task(rq))
342                 ;
343 }
344
345 static void schedule_tail_balance_rt(struct rq *rq)
346 {
347         /*
348          * If we have more than one rt_task queued, then
349          * see if we can push the other rt_tasks off to other CPUS.
350          * Note we may release the rq lock, and since
351          * the lock was owned by prev, we need to release it
352          * first via finish_lock_switch and then reaquire it here.
353          */
354         if (unlikely(rq->rt.rt_nr_running > 1)) {
355                 spin_lock_irq(&rq->lock);
356                 push_rt_tasks(rq);
357                 spin_unlock_irq(&rq->lock);
358         }
359 }
360
361 /*
362  * Load-balancing iterator. Note: while the runqueue stays locked
363  * during the whole iteration, the current task might be
364  * dequeued so the iterator has to be dequeue-safe. Here we
365  * achieve that by always pre-iterating before returning
366  * the current task:
367  */
368 static struct task_struct *load_balance_start_rt(void *arg)
369 {
370         struct rq *rq = arg;
371         struct rt_prio_array *array = &rq->rt.active;
372         struct list_head *head, *curr;
373         struct task_struct *p;
374         int idx;
375
376         idx = sched_find_first_bit(array->bitmap);
377         if (idx >= MAX_RT_PRIO)
378                 return NULL;
379
380         head = array->queue + idx;
381         curr = head->prev;
382
383         p = list_entry(curr, struct task_struct, run_list);
384
385         curr = curr->prev;
386
387         rq->rt.rt_load_balance_idx = idx;
388         rq->rt.rt_load_balance_head = head;
389         rq->rt.rt_load_balance_curr = curr;
390
391         return p;
392 }
393
394 static struct task_struct *load_balance_next_rt(void *arg)
395 {
396         struct rq *rq = arg;
397         struct rt_prio_array *array = &rq->rt.active;
398         struct list_head *head, *curr;
399         struct task_struct *p;
400         int idx;
401
402         idx = rq->rt.rt_load_balance_idx;
403         head = rq->rt.rt_load_balance_head;
404         curr = rq->rt.rt_load_balance_curr;
405
406         /*
407          * If we arrived back to the head again then
408          * iterate to the next queue (if any):
409          */
410         if (unlikely(head == curr)) {
411                 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
412
413                 if (next_idx >= MAX_RT_PRIO)
414                         return NULL;
415
416                 idx = next_idx;
417                 head = array->queue + idx;
418                 curr = head->prev;
419
420                 rq->rt.rt_load_balance_idx = idx;
421                 rq->rt.rt_load_balance_head = head;
422         }
423
424         p = list_entry(curr, struct task_struct, run_list);
425
426         curr = curr->prev;
427
428         rq->rt.rt_load_balance_curr = curr;
429
430         return p;
431 }
432
433 static unsigned long
434 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
435                 unsigned long max_load_move,
436                 struct sched_domain *sd, enum cpu_idle_type idle,
437                 int *all_pinned, int *this_best_prio)
438 {
439         struct rq_iterator rt_rq_iterator;
440
441         rt_rq_iterator.start = load_balance_start_rt;
442         rt_rq_iterator.next = load_balance_next_rt;
443         /* pass 'busiest' rq argument into
444          * load_balance_[start|next]_rt iterators
445          */
446         rt_rq_iterator.arg = busiest;
447
448         return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
449                              idle, all_pinned, this_best_prio, &rt_rq_iterator);
450 }
451
452 static int
453 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
454                  struct sched_domain *sd, enum cpu_idle_type idle)
455 {
456         struct rq_iterator rt_rq_iterator;
457
458         rt_rq_iterator.start = load_balance_start_rt;
459         rt_rq_iterator.next = load_balance_next_rt;
460         rt_rq_iterator.arg = busiest;
461
462         return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
463                                   &rt_rq_iterator);
464 }
465 #else /* CONFIG_SMP */
466 # define schedule_tail_balance_rt(rq)   do { } while (0)
467 #endif /* CONFIG_SMP */
468
469 static void task_tick_rt(struct rq *rq, struct task_struct *p)
470 {
471         update_curr_rt(rq);
472
473         /*
474          * RR tasks need a special form of timeslice management.
475          * FIFO tasks have no timeslices.
476          */
477         if (p->policy != SCHED_RR)
478                 return;
479
480         if (--p->time_slice)
481                 return;
482
483         p->time_slice = DEF_TIMESLICE;
484
485         /*
486          * Requeue to the end of queue if we are not the only element
487          * on the queue:
488          */
489         if (p->run_list.prev != p->run_list.next) {
490                 requeue_task_rt(rq, p);
491                 set_tsk_need_resched(p);
492         }
493 }
494
495 static void set_curr_task_rt(struct rq *rq)
496 {
497         struct task_struct *p = rq->curr;
498
499         p->se.exec_start = rq->clock;
500 }
501
502 const struct sched_class rt_sched_class = {
503         .next                   = &fair_sched_class,
504         .enqueue_task           = enqueue_task_rt,
505         .dequeue_task           = dequeue_task_rt,
506         .yield_task             = yield_task_rt,
507
508         .check_preempt_curr     = check_preempt_curr_rt,
509
510         .pick_next_task         = pick_next_task_rt,
511         .put_prev_task          = put_prev_task_rt,
512
513 #ifdef CONFIG_SMP
514         .load_balance           = load_balance_rt,
515         .move_one_task          = move_one_task_rt,
516 #endif
517
518         .set_curr_task          = set_curr_task_rt,
519         .task_tick              = task_tick_rt,
520 };