sched: push RT tasks from overloaded CPUs
[linux-2.6.git] / kernel / sched_rt.c
1 /*
2  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3  * policies)
4  */
5
6 #ifdef CONFIG_SMP
7 static cpumask_t rt_overload_mask;
8 static atomic_t rto_count;
9 static inline int rt_overloaded(void)
10 {
11         return atomic_read(&rto_count);
12 }
13 static inline cpumask_t *rt_overload(void)
14 {
15         return &rt_overload_mask;
16 }
17 static inline void rt_set_overload(struct rq *rq)
18 {
19         cpu_set(rq->cpu, rt_overload_mask);
20         /*
21          * Make sure the mask is visible before we set
22          * the overload count. That is checked to determine
23          * if we should look at the mask. It would be a shame
24          * if we looked at the mask, but the mask was not
25          * updated yet.
26          */
27         wmb();
28         atomic_inc(&rto_count);
29 }
30 static inline void rt_clear_overload(struct rq *rq)
31 {
32         /* the order here really doesn't matter */
33         atomic_dec(&rto_count);
34         cpu_clear(rq->cpu, rt_overload_mask);
35 }
36 #endif /* CONFIG_SMP */
37
38 /*
39  * Update the current task's runtime statistics. Skip current tasks that
40  * are not in our scheduling class.
41  */
42 static void update_curr_rt(struct rq *rq)
43 {
44         struct task_struct *curr = rq->curr;
45         u64 delta_exec;
46
47         if (!task_has_rt_policy(curr))
48                 return;
49
50         delta_exec = rq->clock - curr->se.exec_start;
51         if (unlikely((s64)delta_exec < 0))
52                 delta_exec = 0;
53
54         schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
55
56         curr->se.sum_exec_runtime += delta_exec;
57         curr->se.exec_start = rq->clock;
58         cpuacct_charge(curr, delta_exec);
59 }
60
61 static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
62 {
63         WARN_ON(!rt_task(p));
64         rq->rt.rt_nr_running++;
65 #ifdef CONFIG_SMP
66         if (p->prio < rq->rt.highest_prio)
67                 rq->rt.highest_prio = p->prio;
68         if (rq->rt.rt_nr_running > 1)
69                 rt_set_overload(rq);
70 #endif /* CONFIG_SMP */
71 }
72
73 static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
74 {
75         WARN_ON(!rt_task(p));
76         WARN_ON(!rq->rt.rt_nr_running);
77         rq->rt.rt_nr_running--;
78 #ifdef CONFIG_SMP
79         if (rq->rt.rt_nr_running) {
80                 struct rt_prio_array *array;
81
82                 WARN_ON(p->prio < rq->rt.highest_prio);
83                 if (p->prio == rq->rt.highest_prio) {
84                         /* recalculate */
85                         array = &rq->rt.active;
86                         rq->rt.highest_prio =
87                                 sched_find_first_bit(array->bitmap);
88                 } /* otherwise leave rq->highest prio alone */
89         } else
90                 rq->rt.highest_prio = MAX_RT_PRIO;
91         if (rq->rt.rt_nr_running < 2)
92                 rt_clear_overload(rq);
93 #endif /* CONFIG_SMP */
94 }
95
96 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
97 {
98         struct rt_prio_array *array = &rq->rt.active;
99
100         list_add_tail(&p->run_list, array->queue + p->prio);
101         __set_bit(p->prio, array->bitmap);
102         inc_cpu_load(rq, p->se.load.weight);
103
104         inc_rt_tasks(p, rq);
105 }
106
107 /*
108  * Adding/removing a task to/from a priority array:
109  */
110 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
111 {
112         struct rt_prio_array *array = &rq->rt.active;
113
114         update_curr_rt(rq);
115
116         list_del(&p->run_list);
117         if (list_empty(array->queue + p->prio))
118                 __clear_bit(p->prio, array->bitmap);
119         dec_cpu_load(rq, p->se.load.weight);
120
121         dec_rt_tasks(p, rq);
122 }
123
124 /*
125  * Put task to the end of the run list without the overhead of dequeue
126  * followed by enqueue.
127  */
128 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
129 {
130         struct rt_prio_array *array = &rq->rt.active;
131
132         list_move_tail(&p->run_list, array->queue + p->prio);
133 }
134
135 static void
136 yield_task_rt(struct rq *rq)
137 {
138         requeue_task_rt(rq, rq->curr);
139 }
140
141 /*
142  * Preempt the current task with a newly woken task if needed:
143  */
144 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
145 {
146         if (p->prio < rq->curr->prio)
147                 resched_task(rq->curr);
148 }
149
150 static struct task_struct *pick_next_task_rt(struct rq *rq)
151 {
152         struct rt_prio_array *array = &rq->rt.active;
153         struct task_struct *next;
154         struct list_head *queue;
155         int idx;
156
157         idx = sched_find_first_bit(array->bitmap);
158         if (idx >= MAX_RT_PRIO)
159                 return NULL;
160
161         queue = array->queue + idx;
162         next = list_entry(queue->next, struct task_struct, run_list);
163
164         next->se.exec_start = rq->clock;
165
166         return next;
167 }
168
169 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
170 {
171         update_curr_rt(rq);
172         p->se.exec_start = 0;
173 }
174
175 #ifdef CONFIG_SMP
176 /* Only try algorithms three times */
177 #define RT_MAX_TRIES 3
178
179 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
180 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
181
182 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
183 {
184         if (!task_running(rq, p) &&
185             (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
186                 return 1;
187         return 0;
188 }
189
190 /* Return the second highest RT task, NULL otherwise */
191 static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
192                                                      int cpu)
193 {
194         struct rt_prio_array *array = &rq->rt.active;
195         struct task_struct *next;
196         struct list_head *queue;
197         int idx;
198
199         assert_spin_locked(&rq->lock);
200
201         if (likely(rq->rt.rt_nr_running < 2))
202                 return NULL;
203
204         idx = sched_find_first_bit(array->bitmap);
205         if (unlikely(idx >= MAX_RT_PRIO)) {
206                 WARN_ON(1); /* rt_nr_running is bad */
207                 return NULL;
208         }
209
210         queue = array->queue + idx;
211         BUG_ON(list_empty(queue));
212
213         next = list_entry(queue->next, struct task_struct, run_list);
214         if (unlikely(pick_rt_task(rq, next, cpu)))
215                 goto out;
216
217         if (queue->next->next != queue) {
218                 /* same prio task */
219                 next = list_entry(queue->next->next, struct task_struct, run_list);
220                 if (pick_rt_task(rq, next, cpu))
221                         goto out;
222         }
223
224  retry:
225         /* slower, but more flexible */
226         idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
227         if (unlikely(idx >= MAX_RT_PRIO))
228                 return NULL;
229
230         queue = array->queue + idx;
231         BUG_ON(list_empty(queue));
232
233         list_for_each_entry(next, queue, run_list) {
234                 if (pick_rt_task(rq, next, cpu))
235                         goto out;
236         }
237
238         goto retry;
239
240  out:
241         return next;
242 }
243
244 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
245
246 /* Will lock the rq it finds */
247 static struct rq *find_lock_lowest_rq(struct task_struct *task,
248                                       struct rq *this_rq)
249 {
250         struct rq *lowest_rq = NULL;
251         int cpu;
252         int tries;
253         cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
254
255         cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
256
257         for (tries = 0; tries < RT_MAX_TRIES; tries++) {
258                 /*
259                  * Scan each rq for the lowest prio.
260                  */
261                 for_each_cpu_mask(cpu, *cpu_mask) {
262                         struct rq *rq = &per_cpu(runqueues, cpu);
263
264                         if (cpu == this_rq->cpu)
265                                 continue;
266
267                         /* We look for lowest RT prio or non-rt CPU */
268                         if (rq->rt.highest_prio >= MAX_RT_PRIO) {
269                                 lowest_rq = rq;
270                                 break;
271                         }
272
273                         /* no locking for now */
274                         if (rq->rt.highest_prio > task->prio &&
275                             (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
276                                 lowest_rq = rq;
277                         }
278                 }
279
280                 if (!lowest_rq)
281                         break;
282
283                 /* if the prio of this runqueue changed, try again */
284                 if (double_lock_balance(this_rq, lowest_rq)) {
285                         /*
286                          * We had to unlock the run queue. In
287                          * the mean time, task could have
288                          * migrated already or had its affinity changed.
289                          * Also make sure that it wasn't scheduled on its rq.
290                          */
291                         if (unlikely(task_rq(task) != this_rq ||
292                                      !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
293                                      task_running(this_rq, task) ||
294                                      !task->se.on_rq)) {
295                                 spin_unlock(&lowest_rq->lock);
296                                 lowest_rq = NULL;
297                                 break;
298                         }
299                 }
300
301                 /* If this rq is still suitable use it. */
302                 if (lowest_rq->rt.highest_prio > task->prio)
303                         break;
304
305                 /* try again */
306                 spin_unlock(&lowest_rq->lock);
307                 lowest_rq = NULL;
308         }
309
310         return lowest_rq;
311 }
312
313 /*
314  * If the current CPU has more than one RT task, see if the non
315  * running task can migrate over to a CPU that is running a task
316  * of lesser priority.
317  */
318 static int push_rt_task(struct rq *this_rq)
319 {
320         struct task_struct *next_task;
321         struct rq *lowest_rq;
322         int ret = 0;
323         int paranoid = RT_MAX_TRIES;
324
325         assert_spin_locked(&this_rq->lock);
326
327         next_task = pick_next_highest_task_rt(this_rq, -1);
328         if (!next_task)
329                 return 0;
330
331  retry:
332         if (unlikely(next_task == this_rq->curr)) {
333                 WARN_ON(1);
334                 return 0;
335         }
336
337         /*
338          * It's possible that the next_task slipped in of
339          * higher priority than current. If that's the case
340          * just reschedule current.
341          */
342         if (unlikely(next_task->prio < this_rq->curr->prio)) {
343                 resched_task(this_rq->curr);
344                 return 0;
345         }
346
347         /* We might release this_rq lock */
348         get_task_struct(next_task);
349
350         /* find_lock_lowest_rq locks the rq if found */
351         lowest_rq = find_lock_lowest_rq(next_task, this_rq);
352         if (!lowest_rq) {
353                 struct task_struct *task;
354                 /*
355                  * find lock_lowest_rq releases this_rq->lock
356                  * so it is possible that next_task has changed.
357                  * If it has, then try again.
358                  */
359                 task = pick_next_highest_task_rt(this_rq, -1);
360                 if (unlikely(task != next_task) && task && paranoid--) {
361                         put_task_struct(next_task);
362                         next_task = task;
363                         goto retry;
364                 }
365                 goto out;
366         }
367
368         assert_spin_locked(&lowest_rq->lock);
369
370         deactivate_task(this_rq, next_task, 0);
371         set_task_cpu(next_task, lowest_rq->cpu);
372         activate_task(lowest_rq, next_task, 0);
373
374         resched_task(lowest_rq->curr);
375
376         spin_unlock(&lowest_rq->lock);
377
378         ret = 1;
379 out:
380         put_task_struct(next_task);
381
382         return ret;
383 }
384
385 /*
386  * TODO: Currently we just use the second highest prio task on
387  *       the queue, and stop when it can't migrate (or there's
388  *       no more RT tasks).  There may be a case where a lower
389  *       priority RT task has a different affinity than the
390  *       higher RT task. In this case the lower RT task could
391  *       possibly be able to migrate where as the higher priority
392  *       RT task could not.  We currently ignore this issue.
393  *       Enhancements are welcome!
394  */
395 static void push_rt_tasks(struct rq *rq)
396 {
397         /* push_rt_task will return true if it moved an RT */
398         while (push_rt_task(rq))
399                 ;
400 }
401
402 static int pull_rt_task(struct rq *this_rq)
403 {
404         struct task_struct *next;
405         struct task_struct *p;
406         struct rq *src_rq;
407         cpumask_t *rto_cpumask;
408         int this_cpu = this_rq->cpu;
409         int cpu;
410         int ret = 0;
411
412         assert_spin_locked(&this_rq->lock);
413
414         /*
415          * If cpusets are used, and we have overlapping
416          * run queue cpusets, then this algorithm may not catch all.
417          * This is just the price you pay on trying to keep
418          * dirtying caches down on large SMP machines.
419          */
420         if (likely(!rt_overloaded()))
421                 return 0;
422
423         next = pick_next_task_rt(this_rq);
424
425         rto_cpumask = rt_overload();
426
427         for_each_cpu_mask(cpu, *rto_cpumask) {
428                 if (this_cpu == cpu)
429                         continue;
430
431                 src_rq = cpu_rq(cpu);
432                 if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
433                         /*
434                          * It is possible that overlapping cpusets
435                          * will miss clearing a non overloaded runqueue.
436                          * Clear it now.
437                          */
438                         if (double_lock_balance(this_rq, src_rq)) {
439                                 /* unlocked our runqueue lock */
440                                 struct task_struct *old_next = next;
441                                 next = pick_next_task_rt(this_rq);
442                                 if (next != old_next)
443                                         ret = 1;
444                         }
445                         if (likely(src_rq->rt.rt_nr_running <= 1))
446                                 /*
447                                  * Small chance that this_rq->curr changed
448                                  * but it's really harmless here.
449                                  */
450                                 rt_clear_overload(this_rq);
451                         else
452                                 /*
453                                  * Heh, the src_rq is now overloaded, since
454                                  * we already have the src_rq lock, go straight
455                                  * to pulling tasks from it.
456                                  */
457                                 goto try_pulling;
458                         spin_unlock(&src_rq->lock);
459                         continue;
460                 }
461
462                 /*
463                  * We can potentially drop this_rq's lock in
464                  * double_lock_balance, and another CPU could
465                  * steal our next task - hence we must cause
466                  * the caller to recalculate the next task
467                  * in that case:
468                  */
469                 if (double_lock_balance(this_rq, src_rq)) {
470                         struct task_struct *old_next = next;
471                         next = pick_next_task_rt(this_rq);
472                         if (next != old_next)
473                                 ret = 1;
474                 }
475
476                 /*
477                  * Are there still pullable RT tasks?
478                  */
479                 if (src_rq->rt.rt_nr_running <= 1) {
480                         spin_unlock(&src_rq->lock);
481                         continue;
482                 }
483
484  try_pulling:
485                 p = pick_next_highest_task_rt(src_rq, this_cpu);
486
487                 /*
488                  * Do we have an RT task that preempts
489                  * the to-be-scheduled task?
490                  */
491                 if (p && (!next || (p->prio < next->prio))) {
492                         WARN_ON(p == src_rq->curr);
493                         WARN_ON(!p->se.on_rq);
494
495                         /*
496                          * There's a chance that p is higher in priority
497                          * than what's currently running on its cpu.
498                          * This is just that p is wakeing up and hasn't
499                          * had a chance to schedule. We only pull
500                          * p if it is lower in priority than the
501                          * current task on the run queue or
502                          * this_rq next task is lower in prio than
503                          * the current task on that rq.
504                          */
505                         if (p->prio < src_rq->curr->prio ||
506                             (next && next->prio < src_rq->curr->prio))
507                                 goto bail;
508
509                         ret = 1;
510
511                         deactivate_task(src_rq, p, 0);
512                         set_task_cpu(p, this_cpu);
513                         activate_task(this_rq, p, 0);
514                         /*
515                          * We continue with the search, just in
516                          * case there's an even higher prio task
517                          * in another runqueue. (low likelyhood
518                          * but possible)
519                          */
520
521                         /*
522                          * Update next so that we won't pick a task
523                          * on another cpu with a priority lower (or equal)
524                          * than the one we just picked.
525                          */
526                         next = p;
527
528                 }
529  bail:
530                 spin_unlock(&src_rq->lock);
531         }
532
533         return ret;
534 }
535
536 static void schedule_balance_rt(struct rq *rq,
537                                 struct task_struct *prev)
538 {
539         /* Try to pull RT tasks here if we lower this rq's prio */
540         if (unlikely(rt_task(prev)) &&
541             rq->rt.highest_prio > prev->prio)
542                 pull_rt_task(rq);
543 }
544
545 static void schedule_tail_balance_rt(struct rq *rq)
546 {
547         /*
548          * If we have more than one rt_task queued, then
549          * see if we can push the other rt_tasks off to other CPUS.
550          * Note we may release the rq lock, and since
551          * the lock was owned by prev, we need to release it
552          * first via finish_lock_switch and then reaquire it here.
553          */
554         if (unlikely(rq->rt.rt_nr_running > 1)) {
555                 spin_lock_irq(&rq->lock);
556                 push_rt_tasks(rq);
557                 spin_unlock_irq(&rq->lock);
558         }
559 }
560
561
562 static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
563 {
564         if (unlikely(rt_task(p)) &&
565             !task_running(rq, p) &&
566             (p->prio >= rq->curr->prio))
567                 push_rt_tasks(rq);
568 }
569
570 /*
571  * Load-balancing iterator. Note: while the runqueue stays locked
572  * during the whole iteration, the current task might be
573  * dequeued so the iterator has to be dequeue-safe. Here we
574  * achieve that by always pre-iterating before returning
575  * the current task:
576  */
577 static struct task_struct *load_balance_start_rt(void *arg)
578 {
579         struct rq *rq = arg;
580         struct rt_prio_array *array = &rq->rt.active;
581         struct list_head *head, *curr;
582         struct task_struct *p;
583         int idx;
584
585         idx = sched_find_first_bit(array->bitmap);
586         if (idx >= MAX_RT_PRIO)
587                 return NULL;
588
589         head = array->queue + idx;
590         curr = head->prev;
591
592         p = list_entry(curr, struct task_struct, run_list);
593
594         curr = curr->prev;
595
596         rq->rt.rt_load_balance_idx = idx;
597         rq->rt.rt_load_balance_head = head;
598         rq->rt.rt_load_balance_curr = curr;
599
600         return p;
601 }
602
603 static struct task_struct *load_balance_next_rt(void *arg)
604 {
605         struct rq *rq = arg;
606         struct rt_prio_array *array = &rq->rt.active;
607         struct list_head *head, *curr;
608         struct task_struct *p;
609         int idx;
610
611         idx = rq->rt.rt_load_balance_idx;
612         head = rq->rt.rt_load_balance_head;
613         curr = rq->rt.rt_load_balance_curr;
614
615         /*
616          * If we arrived back to the head again then
617          * iterate to the next queue (if any):
618          */
619         if (unlikely(head == curr)) {
620                 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
621
622                 if (next_idx >= MAX_RT_PRIO)
623                         return NULL;
624
625                 idx = next_idx;
626                 head = array->queue + idx;
627                 curr = head->prev;
628
629                 rq->rt.rt_load_balance_idx = idx;
630                 rq->rt.rt_load_balance_head = head;
631         }
632
633         p = list_entry(curr, struct task_struct, run_list);
634
635         curr = curr->prev;
636
637         rq->rt.rt_load_balance_curr = curr;
638
639         return p;
640 }
641
642 static unsigned long
643 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
644                 unsigned long max_load_move,
645                 struct sched_domain *sd, enum cpu_idle_type idle,
646                 int *all_pinned, int *this_best_prio)
647 {
648         struct rq_iterator rt_rq_iterator;
649
650         rt_rq_iterator.start = load_balance_start_rt;
651         rt_rq_iterator.next = load_balance_next_rt;
652         /* pass 'busiest' rq argument into
653          * load_balance_[start|next]_rt iterators
654          */
655         rt_rq_iterator.arg = busiest;
656
657         return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
658                              idle, all_pinned, this_best_prio, &rt_rq_iterator);
659 }
660
661 static int
662 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
663                  struct sched_domain *sd, enum cpu_idle_type idle)
664 {
665         struct rq_iterator rt_rq_iterator;
666
667         rt_rq_iterator.start = load_balance_start_rt;
668         rt_rq_iterator.next = load_balance_next_rt;
669         rt_rq_iterator.arg = busiest;
670
671         return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
672                                   &rt_rq_iterator);
673 }
674 #else /* CONFIG_SMP */
675 # define schedule_tail_balance_rt(rq)   do { } while (0)
676 # define schedule_balance_rt(rq, prev)  do { } while (0)
677 # define wakeup_balance_rt(rq, p)       do { } while (0)
678 #endif /* CONFIG_SMP */
679
680 static void task_tick_rt(struct rq *rq, struct task_struct *p)
681 {
682         update_curr_rt(rq);
683
684         /*
685          * RR tasks need a special form of timeslice management.
686          * FIFO tasks have no timeslices.
687          */
688         if (p->policy != SCHED_RR)
689                 return;
690
691         if (--p->time_slice)
692                 return;
693
694         p->time_slice = DEF_TIMESLICE;
695
696         /*
697          * Requeue to the end of queue if we are not the only element
698          * on the queue:
699          */
700         if (p->run_list.prev != p->run_list.next) {
701                 requeue_task_rt(rq, p);
702                 set_tsk_need_resched(p);
703         }
704 }
705
706 static void set_curr_task_rt(struct rq *rq)
707 {
708         struct task_struct *p = rq->curr;
709
710         p->se.exec_start = rq->clock;
711 }
712
713 const struct sched_class rt_sched_class = {
714         .next                   = &fair_sched_class,
715         .enqueue_task           = enqueue_task_rt,
716         .dequeue_task           = dequeue_task_rt,
717         .yield_task             = yield_task_rt,
718
719         .check_preempt_curr     = check_preempt_curr_rt,
720
721         .pick_next_task         = pick_next_task_rt,
722         .put_prev_task          = put_prev_task_rt,
723
724 #ifdef CONFIG_SMP
725         .load_balance           = load_balance_rt,
726         .move_one_task          = move_one_task_rt,
727 #endif
728
729         .set_curr_task          = set_curr_task_rt,
730         .task_tick              = task_tick_rt,
731 };