[PATCH] cfq-iosched: small cfq_choose_req() optimization
[linux-2.6.git] / block / cfq-iosched.c
1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished io
5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
8  */
9 #include <linux/config.h>
10 #include <linux/module.h>
11 #include <linux/blkdev.h>
12 #include <linux/elevator.h>
13 #include <linux/hash.h>
14 #include <linux/rbtree.h>
15 #include <linux/ioprio.h>
16
17 /*
18  * tunables
19  */
20 static const int cfq_quantum = 4;               /* max queue in one round of service */
21 static const int cfq_queued = 8;                /* minimum rq allocate limit per-queue*/
22 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
23 static const int cfq_back_max = 16 * 1024;      /* maximum backwards seek, in KiB */
24 static const int cfq_back_penalty = 2;          /* penalty of a backwards seek */
25
26 static const int cfq_slice_sync = HZ / 10;
27 static int cfq_slice_async = HZ / 25;
28 static const int cfq_slice_async_rq = 2;
29 static int cfq_slice_idle = HZ / 100;
30
31 #define CFQ_IDLE_GRACE          (HZ / 10)
32 #define CFQ_SLICE_SCALE         (5)
33
34 #define CFQ_KEY_ASYNC           (0)
35 #define CFQ_KEY_ANY             (0xffff)
36
37 /*
38  * disable queueing at the driver/hardware level
39  */
40 static const int cfq_max_depth = 2;
41
42 static DEFINE_RWLOCK(cfq_exit_lock);
43
44 /*
45  * for the hash of cfqq inside the cfqd
46  */
47 #define CFQ_QHASH_SHIFT         6
48 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
49 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
50
51 /*
52  * for the hash of crq inside the cfqq
53  */
54 #define CFQ_MHASH_SHIFT         6
55 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
56 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
57 #define CFQ_MHASH_FN(sec)       hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
58 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
59 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
60
61 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
62 #define list_entry_fifo(ptr)    list_entry((ptr), struct request, queuelist)
63
64 #define RQ_DATA(rq)             (rq)->elevator_private
65
66 /*
67  * rb-tree defines
68  */
69 #define RB_NONE                 (2)
70 #define RB_EMPTY(node)          ((node)->rb_node == NULL)
71 #define RB_CLEAR_COLOR(node)    (node)->rb_color = RB_NONE
72 #define RB_CLEAR(node)          do {    \
73         (node)->rb_parent = NULL;       \
74         RB_CLEAR_COLOR((node));         \
75         (node)->rb_right = NULL;        \
76         (node)->rb_left = NULL;         \
77 } while (0)
78 #define RB_CLEAR_ROOT(root)     ((root)->rb_node = NULL)
79 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
80 #define rq_rb_key(rq)           (rq)->sector
81
82 static kmem_cache_t *crq_pool;
83 static kmem_cache_t *cfq_pool;
84 static kmem_cache_t *cfq_ioc_pool;
85
86 static atomic_t ioc_count = ATOMIC_INIT(0);
87 static struct completion *ioc_gone;
88
89 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
90 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
91 #define cfq_class_be(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
92 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
93
94 #define ASYNC                   (0)
95 #define SYNC                    (1)
96
97 #define cfq_cfqq_dispatched(cfqq)       \
98         ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
99
100 #define cfq_cfqq_class_sync(cfqq)       ((cfqq)->key != CFQ_KEY_ASYNC)
101
102 #define cfq_cfqq_sync(cfqq)             \
103         (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
104
105 /*
106  * Per block device queue structure
107  */
108 struct cfq_data {
109         request_queue_t *queue;
110
111         /*
112          * rr list of queues with requests and the count of them
113          */
114         struct list_head rr_list[CFQ_PRIO_LISTS];
115         struct list_head busy_rr;
116         struct list_head cur_rr;
117         struct list_head idle_rr;
118         unsigned int busy_queues;
119
120         /*
121          * non-ordered list of empty cfqq's
122          */
123         struct list_head empty_list;
124
125         /*
126          * cfqq lookup hash
127          */
128         struct hlist_head *cfq_hash;
129
130         /*
131          * global crq hash for all queues
132          */
133         struct hlist_head *crq_hash;
134
135         unsigned int max_queued;
136
137         mempool_t *crq_pool;
138
139         int rq_in_driver;
140
141         /*
142          * schedule slice state info
143          */
144         /*
145          * idle window management
146          */
147         struct timer_list idle_slice_timer;
148         struct work_struct unplug_work;
149
150         struct cfq_queue *active_queue;
151         struct cfq_io_context *active_cic;
152         int cur_prio, cur_end_prio;
153         unsigned int dispatch_slice;
154
155         struct timer_list idle_class_timer;
156
157         sector_t last_sector;
158         unsigned long last_end_request;
159
160         unsigned int rq_starved;
161
162         /*
163          * tunables, see top of file
164          */
165         unsigned int cfq_quantum;
166         unsigned int cfq_queued;
167         unsigned int cfq_fifo_expire[2];
168         unsigned int cfq_back_penalty;
169         unsigned int cfq_back_max;
170         unsigned int cfq_slice[2];
171         unsigned int cfq_slice_async_rq;
172         unsigned int cfq_slice_idle;
173         unsigned int cfq_max_depth;
174
175         struct list_head cic_list;
176 };
177
178 /*
179  * Per process-grouping structure
180  */
181 struct cfq_queue {
182         /* reference count */
183         atomic_t ref;
184         /* parent cfq_data */
185         struct cfq_data *cfqd;
186         /* cfqq lookup hash */
187         struct hlist_node cfq_hash;
188         /* hash key */
189         unsigned int key;
190         /* on either rr or empty list of cfqd */
191         struct list_head cfq_list;
192         /* sorted list of pending requests */
193         struct rb_root sort_list;
194         /* if fifo isn't expired, next request to serve */
195         struct cfq_rq *next_crq;
196         /* requests queued in sort_list */
197         int queued[2];
198         /* currently allocated requests */
199         int allocated[2];
200         /* fifo list of requests in sort_list */
201         struct list_head fifo;
202
203         unsigned long slice_start;
204         unsigned long slice_end;
205         unsigned long slice_left;
206         unsigned long service_last;
207
208         /* number of requests that are on the dispatch list */
209         int on_dispatch[2];
210
211         /* io prio of this group */
212         unsigned short ioprio, org_ioprio;
213         unsigned short ioprio_class, org_ioprio_class;
214
215         /* various state flags, see below */
216         unsigned int flags;
217 };
218
219 struct cfq_rq {
220         struct rb_node rb_node;
221         sector_t rb_key;
222         struct request *request;
223         struct hlist_node hash;
224
225         struct cfq_queue *cfq_queue;
226         struct cfq_io_context *io_context;
227
228         unsigned int crq_flags;
229 };
230
231 enum cfqq_state_flags {
232         CFQ_CFQQ_FLAG_on_rr = 0,
233         CFQ_CFQQ_FLAG_wait_request,
234         CFQ_CFQQ_FLAG_must_alloc,
235         CFQ_CFQQ_FLAG_must_alloc_slice,
236         CFQ_CFQQ_FLAG_must_dispatch,
237         CFQ_CFQQ_FLAG_fifo_expire,
238         CFQ_CFQQ_FLAG_idle_window,
239         CFQ_CFQQ_FLAG_prio_changed,
240 };
241
242 #define CFQ_CFQQ_FNS(name)                                              \
243 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
244 {                                                                       \
245         cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);                     \
246 }                                                                       \
247 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
248 {                                                                       \
249         cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                    \
250 }                                                                       \
251 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
252 {                                                                       \
253         return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;        \
254 }
255
256 CFQ_CFQQ_FNS(on_rr);
257 CFQ_CFQQ_FNS(wait_request);
258 CFQ_CFQQ_FNS(must_alloc);
259 CFQ_CFQQ_FNS(must_alloc_slice);
260 CFQ_CFQQ_FNS(must_dispatch);
261 CFQ_CFQQ_FNS(fifo_expire);
262 CFQ_CFQQ_FNS(idle_window);
263 CFQ_CFQQ_FNS(prio_changed);
264 #undef CFQ_CFQQ_FNS
265
266 enum cfq_rq_state_flags {
267         CFQ_CRQ_FLAG_is_sync = 0,
268 };
269
270 #define CFQ_CRQ_FNS(name)                                               \
271 static inline void cfq_mark_crq_##name(struct cfq_rq *crq)              \
272 {                                                                       \
273         crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name);                   \
274 }                                                                       \
275 static inline void cfq_clear_crq_##name(struct cfq_rq *crq)             \
276 {                                                                       \
277         crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name);                  \
278 }                                                                       \
279 static inline int cfq_crq_##name(const struct cfq_rq *crq)              \
280 {                                                                       \
281         return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0;      \
282 }
283
284 CFQ_CRQ_FNS(is_sync);
285 #undef CFQ_CRQ_FNS
286
287 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
288 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
289 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
290
291 #define process_sync(tsk)       ((tsk)->flags & PF_SYNCWRITE)
292
293 /*
294  * lots of deadline iosched dupes, can be abstracted later...
295  */
296 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
297 {
298         hlist_del_init(&crq->hash);
299 }
300
301 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
302 {
303         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
304
305         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
306 }
307
308 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
309 {
310         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
311         struct hlist_node *entry, *next;
312
313         hlist_for_each_safe(entry, next, hash_list) {
314                 struct cfq_rq *crq = list_entry_hash(entry);
315                 struct request *__rq = crq->request;
316
317                 if (!rq_mergeable(__rq)) {
318                         cfq_del_crq_hash(crq);
319                         continue;
320                 }
321
322                 if (rq_hash_key(__rq) == offset)
323                         return __rq;
324         }
325
326         return NULL;
327 }
328
329 /*
330  * scheduler run of queue, if there are requests pending and no one in the
331  * driver that will restart queueing
332  */
333 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
334 {
335         if (cfqd->busy_queues)
336                 kblockd_schedule_work(&cfqd->unplug_work);
337 }
338
339 static int cfq_queue_empty(request_queue_t *q)
340 {
341         struct cfq_data *cfqd = q->elevator->elevator_data;
342
343         return !cfqd->busy_queues;
344 }
345
346 /*
347  * Lifted from AS - choose which of crq1 and crq2 that is best served now.
348  * We choose the request that is closest to the head right now. Distance
349  * behind the head is penalized and only allowed to a certain extent.
350  */
351 static struct cfq_rq *
352 cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
353 {
354         sector_t last, s1, s2, d1 = 0, d2 = 0;
355         unsigned long back_max;
356 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
357 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
358         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
359
360         if (crq1 == NULL || crq1 == crq2)
361                 return crq2;
362         if (crq2 == NULL)
363                 return crq1;
364
365         if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
366                 return crq1;
367         else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
368                 return crq2;
369
370         s1 = crq1->request->sector;
371         s2 = crq2->request->sector;
372
373         last = cfqd->last_sector;
374
375         /*
376          * by definition, 1KiB is 2 sectors
377          */
378         back_max = cfqd->cfq_back_max * 2;
379
380         /*
381          * Strict one way elevator _except_ in the case where we allow
382          * short backward seeks which are biased as twice the cost of a
383          * similar forward seek.
384          */
385         if (s1 >= last)
386                 d1 = s1 - last;
387         else if (s1 + back_max >= last)
388                 d1 = (last - s1) * cfqd->cfq_back_penalty;
389         else
390                 wrap |= CFQ_RQ1_WRAP;
391
392         if (s2 >= last)
393                 d2 = s2 - last;
394         else if (s2 + back_max >= last)
395                 d2 = (last - s2) * cfqd->cfq_back_penalty;
396         else
397                 wrap |= CFQ_RQ2_WRAP;
398
399         /* Found required data */
400
401         /*
402          * By doing switch() on the bit mask "wrap" we avoid having to
403          * check two variables for all permutations: --> faster!
404          */
405         switch (wrap) {
406         case 0: /* common case for CFQ: crq1 and crq2 not wrapped */
407                 if (d1 < d2)
408                         return crq1;
409                 else if (d2 < d1)
410                         return crq2;
411                 else {
412                         if (s1 >= s2)
413                                 return crq1;
414                         else
415                                 return crq2;
416                 }
417
418         case CFQ_RQ2_WRAP:
419                 return crq1;
420         case CFQ_RQ1_WRAP:
421                 return crq2;
422         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */
423         default:
424                 /*
425                  * Since both rqs are wrapped,
426                  * start with the one that's further behind head
427                  * (--> only *one* back seek required),
428                  * since back seek takes more time than forward.
429                  */
430                 if (s1 <= s2)
431                         return crq1;
432                 else
433                         return crq2;
434         }
435 }
436
437 /*
438  * would be nice to take fifo expire time into account as well
439  */
440 static struct cfq_rq *
441 cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
442                   struct cfq_rq *last)
443 {
444         struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
445         struct rb_node *rbnext, *rbprev;
446
447         if (!(rbnext = rb_next(&last->rb_node))) {
448                 rbnext = rb_first(&cfqq->sort_list);
449                 if (rbnext == &last->rb_node)
450                         rbnext = NULL;
451         }
452
453         rbprev = rb_prev(&last->rb_node);
454
455         if (rbprev)
456                 crq_prev = rb_entry_crq(rbprev);
457         if (rbnext)
458                 crq_next = rb_entry_crq(rbnext);
459
460         return cfq_choose_req(cfqd, crq_next, crq_prev);
461 }
462
463 static void cfq_update_next_crq(struct cfq_rq *crq)
464 {
465         struct cfq_queue *cfqq = crq->cfq_queue;
466
467         if (cfqq->next_crq == crq)
468                 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
469 }
470
471 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
472 {
473         struct cfq_data *cfqd = cfqq->cfqd;
474         struct list_head *list, *entry;
475
476         BUG_ON(!cfq_cfqq_on_rr(cfqq));
477
478         list_del(&cfqq->cfq_list);
479
480         if (cfq_class_rt(cfqq))
481                 list = &cfqd->cur_rr;
482         else if (cfq_class_idle(cfqq))
483                 list = &cfqd->idle_rr;
484         else {
485                 /*
486                  * if cfqq has requests in flight, don't allow it to be
487                  * found in cfq_set_active_queue before it has finished them.
488                  * this is done to increase fairness between a process that
489                  * has lots of io pending vs one that only generates one
490                  * sporadically or synchronously
491                  */
492                 if (cfq_cfqq_dispatched(cfqq))
493                         list = &cfqd->busy_rr;
494                 else
495                         list = &cfqd->rr_list[cfqq->ioprio];
496         }
497
498         /*
499          * if queue was preempted, just add to front to be fair. busy_rr
500          * isn't sorted.
501          */
502         if (preempted || list == &cfqd->busy_rr) {
503                 list_add(&cfqq->cfq_list, list);
504                 return;
505         }
506
507         /*
508          * sort by when queue was last serviced
509          */
510         entry = list;
511         while ((entry = entry->prev) != list) {
512                 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
513
514                 if (!__cfqq->service_last)
515                         break;
516                 if (time_before(__cfqq->service_last, cfqq->service_last))
517                         break;
518         }
519
520         list_add(&cfqq->cfq_list, entry);
521 }
522
523 /*
524  * add to busy list of queues for service, trying to be fair in ordering
525  * the pending list according to last request service
526  */
527 static inline void
528 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
529 {
530         BUG_ON(cfq_cfqq_on_rr(cfqq));
531         cfq_mark_cfqq_on_rr(cfqq);
532         cfqd->busy_queues++;
533
534         cfq_resort_rr_list(cfqq, 0);
535 }
536
537 static inline void
538 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
539 {
540         BUG_ON(!cfq_cfqq_on_rr(cfqq));
541         cfq_clear_cfqq_on_rr(cfqq);
542         list_move(&cfqq->cfq_list, &cfqd->empty_list);
543
544         BUG_ON(!cfqd->busy_queues);
545         cfqd->busy_queues--;
546 }
547
548 /*
549  * rb tree support functions
550  */
551 static inline void cfq_del_crq_rb(struct cfq_rq *crq)
552 {
553         struct cfq_queue *cfqq = crq->cfq_queue;
554         struct cfq_data *cfqd = cfqq->cfqd;
555         const int sync = cfq_crq_is_sync(crq);
556
557         BUG_ON(!cfqq->queued[sync]);
558         cfqq->queued[sync]--;
559
560         cfq_update_next_crq(crq);
561
562         rb_erase(&crq->rb_node, &cfqq->sort_list);
563         RB_CLEAR_COLOR(&crq->rb_node);
564
565         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list))
566                 cfq_del_cfqq_rr(cfqd, cfqq);
567 }
568
569 static struct cfq_rq *
570 __cfq_add_crq_rb(struct cfq_rq *crq)
571 {
572         struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
573         struct rb_node *parent = NULL;
574         struct cfq_rq *__crq;
575
576         while (*p) {
577                 parent = *p;
578                 __crq = rb_entry_crq(parent);
579
580                 if (crq->rb_key < __crq->rb_key)
581                         p = &(*p)->rb_left;
582                 else if (crq->rb_key > __crq->rb_key)
583                         p = &(*p)->rb_right;
584                 else
585                         return __crq;
586         }
587
588         rb_link_node(&crq->rb_node, parent, p);
589         return NULL;
590 }
591
592 static void cfq_add_crq_rb(struct cfq_rq *crq)
593 {
594         struct cfq_queue *cfqq = crq->cfq_queue;
595         struct cfq_data *cfqd = cfqq->cfqd;
596         struct request *rq = crq->request;
597         struct cfq_rq *__alias;
598
599         crq->rb_key = rq_rb_key(rq);
600         cfqq->queued[cfq_crq_is_sync(crq)]++;
601
602         /*
603          * looks a little odd, but the first insert might return an alias.
604          * if that happens, put the alias on the dispatch list
605          */
606         while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
607                 cfq_dispatch_insert(cfqd->queue, __alias);
608
609         rb_insert_color(&crq->rb_node, &cfqq->sort_list);
610
611         if (!cfq_cfqq_on_rr(cfqq))
612                 cfq_add_cfqq_rr(cfqd, cfqq);
613
614         /*
615          * check if this request is a better next-serve candidate
616          */
617         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
618 }
619
620 static inline void
621 cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
622 {
623         rb_erase(&crq->rb_node, &cfqq->sort_list);
624         cfqq->queued[cfq_crq_is_sync(crq)]--;
625
626         cfq_add_crq_rb(crq);
627 }
628
629 static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
630
631 {
632         struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
633         struct rb_node *n;
634
635         if (!cfqq)
636                 goto out;
637
638         n = cfqq->sort_list.rb_node;
639         while (n) {
640                 struct cfq_rq *crq = rb_entry_crq(n);
641
642                 if (sector < crq->rb_key)
643                         n = n->rb_left;
644                 else if (sector > crq->rb_key)
645                         n = n->rb_right;
646                 else
647                         return crq->request;
648         }
649
650 out:
651         return NULL;
652 }
653
654 static void cfq_activate_request(request_queue_t *q, struct request *rq)
655 {
656         struct cfq_data *cfqd = q->elevator->elevator_data;
657
658         cfqd->rq_in_driver++;
659 }
660
661 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
662 {
663         struct cfq_data *cfqd = q->elevator->elevator_data;
664
665         WARN_ON(!cfqd->rq_in_driver);
666         cfqd->rq_in_driver--;
667 }
668
669 static void cfq_remove_request(struct request *rq)
670 {
671         struct cfq_rq *crq = RQ_DATA(rq);
672
673         list_del_init(&rq->queuelist);
674         cfq_del_crq_rb(crq);
675         cfq_del_crq_hash(crq);
676 }
677
678 static int
679 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
680 {
681         struct cfq_data *cfqd = q->elevator->elevator_data;
682         struct request *__rq;
683         int ret;
684
685         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
686         if (__rq && elv_rq_merge_ok(__rq, bio)) {
687                 ret = ELEVATOR_BACK_MERGE;
688                 goto out;
689         }
690
691         __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
692         if (__rq && elv_rq_merge_ok(__rq, bio)) {
693                 ret = ELEVATOR_FRONT_MERGE;
694                 goto out;
695         }
696
697         return ELEVATOR_NO_MERGE;
698 out:
699         *req = __rq;
700         return ret;
701 }
702
703 static void cfq_merged_request(request_queue_t *q, struct request *req)
704 {
705         struct cfq_data *cfqd = q->elevator->elevator_data;
706         struct cfq_rq *crq = RQ_DATA(req);
707
708         cfq_del_crq_hash(crq);
709         cfq_add_crq_hash(cfqd, crq);
710
711         if (rq_rb_key(req) != crq->rb_key) {
712                 struct cfq_queue *cfqq = crq->cfq_queue;
713
714                 cfq_update_next_crq(crq);
715                 cfq_reposition_crq_rb(cfqq, crq);
716         }
717 }
718
719 static void
720 cfq_merged_requests(request_queue_t *q, struct request *rq,
721                     struct request *next)
722 {
723         cfq_merged_request(q, rq);
724
725         /*
726          * reposition in fifo if next is older than rq
727          */
728         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
729             time_before(next->start_time, rq->start_time))
730                 list_move(&rq->queuelist, &next->queuelist);
731
732         cfq_remove_request(next);
733 }
734
735 static inline void
736 __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
737 {
738         if (cfqq) {
739                 /*
740                  * stop potential idle class queues waiting service
741                  */
742                 del_timer(&cfqd->idle_class_timer);
743
744                 cfqq->slice_start = jiffies;
745                 cfqq->slice_end = 0;
746                 cfqq->slice_left = 0;
747                 cfq_clear_cfqq_must_alloc_slice(cfqq);
748                 cfq_clear_cfqq_fifo_expire(cfqq);
749         }
750
751         cfqd->active_queue = cfqq;
752 }
753
754 /*
755  * current cfqq expired its slice (or was too idle), select new one
756  */
757 static void
758 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
759                     int preempted)
760 {
761         unsigned long now = jiffies;
762
763         if (cfq_cfqq_wait_request(cfqq))
764                 del_timer(&cfqd->idle_slice_timer);
765
766         if (!preempted && !cfq_cfqq_dispatched(cfqq)) {
767                 cfqq->service_last = now;
768                 cfq_schedule_dispatch(cfqd);
769         }
770
771         cfq_clear_cfqq_must_dispatch(cfqq);
772         cfq_clear_cfqq_wait_request(cfqq);
773
774         /*
775          * store what was left of this slice, if the queue idled out
776          * or was preempted
777          */
778         if (time_after(cfqq->slice_end, now))
779                 cfqq->slice_left = cfqq->slice_end - now;
780         else
781                 cfqq->slice_left = 0;
782
783         if (cfq_cfqq_on_rr(cfqq))
784                 cfq_resort_rr_list(cfqq, preempted);
785
786         if (cfqq == cfqd->active_queue)
787                 cfqd->active_queue = NULL;
788
789         if (cfqd->active_cic) {
790                 put_io_context(cfqd->active_cic->ioc);
791                 cfqd->active_cic = NULL;
792         }
793
794         cfqd->dispatch_slice = 0;
795 }
796
797 static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
798 {
799         struct cfq_queue *cfqq = cfqd->active_queue;
800
801         if (cfqq)
802                 __cfq_slice_expired(cfqd, cfqq, preempted);
803 }
804
805 /*
806  * 0
807  * 0,1
808  * 0,1,2
809  * 0,1,2,3
810  * 0,1,2,3,4
811  * 0,1,2,3,4,5
812  * 0,1,2,3,4,5,6
813  * 0,1,2,3,4,5,6,7
814  */
815 static int cfq_get_next_prio_level(struct cfq_data *cfqd)
816 {
817         int prio, wrap;
818
819         prio = -1;
820         wrap = 0;
821         do {
822                 int p;
823
824                 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
825                         if (!list_empty(&cfqd->rr_list[p])) {
826                                 prio = p;
827                                 break;
828                         }
829                 }
830
831                 if (prio != -1)
832                         break;
833                 cfqd->cur_prio = 0;
834                 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
835                         cfqd->cur_end_prio = 0;
836                         if (wrap)
837                                 break;
838                         wrap = 1;
839                 }
840         } while (1);
841
842         if (unlikely(prio == -1))
843                 return -1;
844
845         BUG_ON(prio >= CFQ_PRIO_LISTS);
846
847         list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
848
849         cfqd->cur_prio = prio + 1;
850         if (cfqd->cur_prio > cfqd->cur_end_prio) {
851                 cfqd->cur_end_prio = cfqd->cur_prio;
852                 cfqd->cur_prio = 0;
853         }
854         if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
855                 cfqd->cur_prio = 0;
856                 cfqd->cur_end_prio = 0;
857         }
858
859         return prio;
860 }
861
862 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
863 {
864         struct cfq_queue *cfqq = NULL;
865
866         /*
867          * if current list is non-empty, grab first entry. if it is empty,
868          * get next prio level and grab first entry then if any are spliced
869          */
870         if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
871                 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
872
873         /*
874          * if we have idle queues and no rt or be queues had pending
875          * requests, either allow immediate service if the grace period
876          * has passed or arm the idle grace timer
877          */
878         if (!cfqq && !list_empty(&cfqd->idle_rr)) {
879                 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
880
881                 if (time_after_eq(jiffies, end))
882                         cfqq = list_entry_cfqq(cfqd->idle_rr.next);
883                 else
884                         mod_timer(&cfqd->idle_class_timer, end);
885         }
886
887         __cfq_set_active_queue(cfqd, cfqq);
888         return cfqq;
889 }
890
891 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
892
893 {
894         unsigned long sl;
895
896         WARN_ON(!RB_EMPTY(&cfqq->sort_list));
897         WARN_ON(cfqq != cfqd->active_queue);
898
899         /*
900          * idle is disabled, either manually or by past process history
901          */
902         if (!cfqd->cfq_slice_idle)
903                 return 0;
904         if (!cfq_cfqq_idle_window(cfqq))
905                 return 0;
906         /*
907          * task has exited, don't wait
908          */
909         if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
910                 return 0;
911
912         cfq_mark_cfqq_must_dispatch(cfqq);
913         cfq_mark_cfqq_wait_request(cfqq);
914
915         sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
916         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
917         return 1;
918 }
919
920 static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
921 {
922         struct cfq_data *cfqd = q->elevator->elevator_data;
923         struct cfq_queue *cfqq = crq->cfq_queue;
924
925         cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
926         cfq_remove_request(crq->request);
927         cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
928         elv_dispatch_sort(q, crq->request);
929 }
930
931 /*
932  * return expired entry, or NULL to just start from scratch in rbtree
933  */
934 static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
935 {
936         struct cfq_data *cfqd = cfqq->cfqd;
937         struct request *rq;
938         struct cfq_rq *crq;
939
940         if (cfq_cfqq_fifo_expire(cfqq))
941                 return NULL;
942
943         if (!list_empty(&cfqq->fifo)) {
944                 int fifo = cfq_cfqq_class_sync(cfqq);
945
946                 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
947                 rq = crq->request;
948                 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
949                         cfq_mark_cfqq_fifo_expire(cfqq);
950                         return crq;
951                 }
952         }
953
954         return NULL;
955 }
956
957 /*
958  * Scale schedule slice based on io priority. Use the sync time slice only
959  * if a queue is marked sync and has sync io queued. A sync queue with async
960  * io only, should not get full sync slice length.
961  */
962 static inline int
963 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
964 {
965         const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
966
967         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
968
969         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
970 }
971
972 static inline void
973 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
974 {
975         cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
976 }
977
978 static inline int
979 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
980 {
981         const int base_rq = cfqd->cfq_slice_async_rq;
982
983         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
984
985         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
986 }
987
988 /*
989  * get next queue for service
990  */
991 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
992 {
993         unsigned long now = jiffies;
994         struct cfq_queue *cfqq;
995
996         cfqq = cfqd->active_queue;
997         if (!cfqq)
998                 goto new_queue;
999
1000         /*
1001          * slice has expired
1002          */
1003         if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
1004                 goto expire;
1005
1006         /*
1007          * if queue has requests, dispatch one. if not, check if
1008          * enough slice is left to wait for one
1009          */
1010         if (!RB_EMPTY(&cfqq->sort_list))
1011                 goto keep_queue;
1012         else if (cfq_cfqq_class_sync(cfqq) &&
1013                  time_before(now, cfqq->slice_end)) {
1014                 if (cfq_arm_slice_timer(cfqd, cfqq))
1015                         return NULL;
1016         }
1017
1018 expire:
1019         cfq_slice_expired(cfqd, 0);
1020 new_queue:
1021         cfqq = cfq_set_active_queue(cfqd);
1022 keep_queue:
1023         return cfqq;
1024 }
1025
1026 static int
1027 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1028                         int max_dispatch)
1029 {
1030         int dispatched = 0;
1031
1032         BUG_ON(RB_EMPTY(&cfqq->sort_list));
1033
1034         do {
1035                 struct cfq_rq *crq;
1036
1037                 /*
1038                  * follow expired path, else get first next available
1039                  */
1040                 if ((crq = cfq_check_fifo(cfqq)) == NULL)
1041                         crq = cfqq->next_crq;
1042
1043                 /*
1044                  * finally, insert request into driver dispatch list
1045                  */
1046                 cfq_dispatch_insert(cfqd->queue, crq);
1047
1048                 cfqd->dispatch_slice++;
1049                 dispatched++;
1050
1051                 if (!cfqd->active_cic) {
1052                         atomic_inc(&crq->io_context->ioc->refcount);
1053                         cfqd->active_cic = crq->io_context;
1054                 }
1055
1056                 if (RB_EMPTY(&cfqq->sort_list))
1057                         break;
1058
1059         } while (dispatched < max_dispatch);
1060
1061         /*
1062          * if slice end isn't set yet, set it. if at least one request was
1063          * sync, use the sync time slice value
1064          */
1065         if (!cfqq->slice_end)
1066                 cfq_set_prio_slice(cfqd, cfqq);
1067
1068         /*
1069          * expire an async queue immediately if it has used up its slice. idle
1070          * queue always expire after 1 dispatch round.
1071          */
1072         if ((!cfq_cfqq_sync(cfqq) &&
1073             cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1074             cfq_class_idle(cfqq))
1075                 cfq_slice_expired(cfqd, 0);
1076
1077         return dispatched;
1078 }
1079
1080 static int
1081 cfq_forced_dispatch_cfqqs(struct list_head *list)
1082 {
1083         int dispatched = 0;
1084         struct cfq_queue *cfqq, *next;
1085         struct cfq_rq *crq;
1086
1087         list_for_each_entry_safe(cfqq, next, list, cfq_list) {
1088                 while ((crq = cfqq->next_crq)) {
1089                         cfq_dispatch_insert(cfqq->cfqd->queue, crq);
1090                         dispatched++;
1091                 }
1092                 BUG_ON(!list_empty(&cfqq->fifo));
1093         }
1094         return dispatched;
1095 }
1096
1097 static int
1098 cfq_forced_dispatch(struct cfq_data *cfqd)
1099 {
1100         int i, dispatched = 0;
1101
1102         for (i = 0; i < CFQ_PRIO_LISTS; i++)
1103                 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
1104
1105         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
1106         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
1107         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
1108
1109         cfq_slice_expired(cfqd, 0);
1110
1111         BUG_ON(cfqd->busy_queues);
1112
1113         return dispatched;
1114 }
1115
1116 static int
1117 cfq_dispatch_requests(request_queue_t *q, int force)
1118 {
1119         struct cfq_data *cfqd = q->elevator->elevator_data;
1120         struct cfq_queue *cfqq;
1121
1122         if (!cfqd->busy_queues)
1123                 return 0;
1124
1125         if (unlikely(force))
1126                 return cfq_forced_dispatch(cfqd);
1127
1128         cfqq = cfq_select_queue(cfqd);
1129         if (cfqq) {
1130                 int max_dispatch;
1131
1132                 /*
1133                  * if idle window is disabled, allow queue buildup
1134                  */
1135                 if (!cfq_cfqq_idle_window(cfqq) &&
1136                     cfqd->rq_in_driver >= cfqd->cfq_max_depth)
1137                         return 0;
1138
1139                 cfq_clear_cfqq_must_dispatch(cfqq);
1140                 cfq_clear_cfqq_wait_request(cfqq);
1141                 del_timer(&cfqd->idle_slice_timer);
1142
1143                 max_dispatch = cfqd->cfq_quantum;
1144                 if (cfq_class_idle(cfqq))
1145                         max_dispatch = 1;
1146
1147                 return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1148         }
1149
1150         return 0;
1151 }
1152
1153 /*
1154  * task holds one reference to the queue, dropped when task exits. each crq
1155  * in-flight on this queue also holds a reference, dropped when crq is freed.
1156  *
1157  * queue lock must be held here.
1158  */
1159 static void cfq_put_queue(struct cfq_queue *cfqq)
1160 {
1161         struct cfq_data *cfqd = cfqq->cfqd;
1162
1163         BUG_ON(atomic_read(&cfqq->ref) <= 0);
1164
1165         if (!atomic_dec_and_test(&cfqq->ref))
1166                 return;
1167
1168         BUG_ON(rb_first(&cfqq->sort_list));
1169         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1170         BUG_ON(cfq_cfqq_on_rr(cfqq));
1171
1172         if (unlikely(cfqd->active_queue == cfqq))
1173                 __cfq_slice_expired(cfqd, cfqq, 0);
1174
1175         /*
1176          * it's on the empty list and still hashed
1177          */
1178         list_del(&cfqq->cfq_list);
1179         hlist_del(&cfqq->cfq_hash);
1180         kmem_cache_free(cfq_pool, cfqq);
1181 }
1182
1183 static inline struct cfq_queue *
1184 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1185                     const int hashval)
1186 {
1187         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1188         struct hlist_node *entry, *next;
1189
1190         hlist_for_each_safe(entry, next, hash_list) {
1191                 struct cfq_queue *__cfqq = list_entry_qhash(entry);
1192                 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1193
1194                 if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
1195                         return __cfqq;
1196         }
1197
1198         return NULL;
1199 }
1200
1201 static struct cfq_queue *
1202 cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1203 {
1204         return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1205 }
1206
1207 static void cfq_free_io_context(struct io_context *ioc)
1208 {
1209         struct cfq_io_context *__cic;
1210         struct rb_node *n;
1211         int freed = 0;
1212
1213         while ((n = rb_first(&ioc->cic_root)) != NULL) {
1214                 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1215                 rb_erase(&__cic->rb_node, &ioc->cic_root);
1216                 kmem_cache_free(cfq_ioc_pool, __cic);
1217                 freed++;
1218         }
1219
1220         if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
1221                 complete(ioc_gone);
1222 }
1223
1224 static void cfq_trim(struct io_context *ioc)
1225 {
1226         ioc->set_ioprio = NULL;
1227         cfq_free_io_context(ioc);
1228 }
1229
1230 /*
1231  * Called with interrupts disabled
1232  */
1233 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1234 {
1235         struct cfq_data *cfqd = cic->key;
1236         request_queue_t *q;
1237
1238         if (!cfqd)
1239                 return;
1240
1241         q = cfqd->queue;
1242
1243         WARN_ON(!irqs_disabled());
1244
1245         spin_lock(q->queue_lock);
1246
1247         if (cic->cfqq[ASYNC]) {
1248                 if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
1249                         __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
1250                 cfq_put_queue(cic->cfqq[ASYNC]);
1251                 cic->cfqq[ASYNC] = NULL;
1252         }
1253
1254         if (cic->cfqq[SYNC]) {
1255                 if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
1256                         __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
1257                 cfq_put_queue(cic->cfqq[SYNC]);
1258                 cic->cfqq[SYNC] = NULL;
1259         }
1260
1261         cic->key = NULL;
1262         list_del_init(&cic->queue_list);
1263         spin_unlock(q->queue_lock);
1264 }
1265
1266 static void cfq_exit_io_context(struct io_context *ioc)
1267 {
1268         struct cfq_io_context *__cic;
1269         unsigned long flags;
1270         struct rb_node *n;
1271
1272         /*
1273          * put the reference this task is holding to the various queues
1274          */
1275         read_lock_irqsave(&cfq_exit_lock, flags);
1276
1277         n = rb_first(&ioc->cic_root);
1278         while (n != NULL) {
1279                 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1280
1281                 cfq_exit_single_io_context(__cic);
1282                 n = rb_next(n);
1283         }
1284
1285         read_unlock_irqrestore(&cfq_exit_lock, flags);
1286 }
1287
1288 static struct cfq_io_context *
1289 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1290 {
1291         struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1292
1293         if (cic) {
1294                 RB_CLEAR(&cic->rb_node);
1295                 cic->key = NULL;
1296                 cic->cfqq[ASYNC] = NULL;
1297                 cic->cfqq[SYNC] = NULL;
1298                 cic->last_end_request = jiffies;
1299                 cic->ttime_total = 0;
1300                 cic->ttime_samples = 0;
1301                 cic->ttime_mean = 0;
1302                 cic->dtor = cfq_free_io_context;
1303                 cic->exit = cfq_exit_io_context;
1304                 INIT_LIST_HEAD(&cic->queue_list);
1305                 atomic_inc(&ioc_count);
1306         }
1307
1308         return cic;
1309 }
1310
1311 static void cfq_init_prio_data(struct cfq_queue *cfqq)
1312 {
1313         struct task_struct *tsk = current;
1314         int ioprio_class;
1315
1316         if (!cfq_cfqq_prio_changed(cfqq))
1317                 return;
1318
1319         ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1320         switch (ioprio_class) {
1321                 default:
1322                         printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1323                 case IOPRIO_CLASS_NONE:
1324                         /*
1325                          * no prio set, place us in the middle of the BE classes
1326                          */
1327                         cfqq->ioprio = task_nice_ioprio(tsk);
1328                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1329                         break;
1330                 case IOPRIO_CLASS_RT:
1331                         cfqq->ioprio = task_ioprio(tsk);
1332                         cfqq->ioprio_class = IOPRIO_CLASS_RT;
1333                         break;
1334                 case IOPRIO_CLASS_BE:
1335                         cfqq->ioprio = task_ioprio(tsk);
1336                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1337                         break;
1338                 case IOPRIO_CLASS_IDLE:
1339                         cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1340                         cfqq->ioprio = 7;
1341                         cfq_clear_cfqq_idle_window(cfqq);
1342                         break;
1343         }
1344
1345         /*
1346          * keep track of original prio settings in case we have to temporarily
1347          * elevate the priority of this queue
1348          */
1349         cfqq->org_ioprio = cfqq->ioprio;
1350         cfqq->org_ioprio_class = cfqq->ioprio_class;
1351
1352         if (cfq_cfqq_on_rr(cfqq))
1353                 cfq_resort_rr_list(cfqq, 0);
1354
1355         cfq_clear_cfqq_prio_changed(cfqq);
1356 }
1357
1358 static inline void changed_ioprio(struct cfq_io_context *cic)
1359 {
1360         struct cfq_data *cfqd = cic->key;
1361         struct cfq_queue *cfqq;
1362         if (cfqd) {
1363                 spin_lock(cfqd->queue->queue_lock);
1364                 cfqq = cic->cfqq[ASYNC];
1365                 if (cfqq) {
1366                         struct cfq_queue *new_cfqq;
1367                         new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC,
1368                                                 cic->ioc->task, GFP_ATOMIC);
1369                         if (new_cfqq) {
1370                                 cic->cfqq[ASYNC] = new_cfqq;
1371                                 cfq_put_queue(cfqq);
1372                         }
1373                 }
1374                 cfqq = cic->cfqq[SYNC];
1375                 if (cfqq) {
1376                         cfq_mark_cfqq_prio_changed(cfqq);
1377                         cfq_init_prio_data(cfqq);
1378                 }
1379                 spin_unlock(cfqd->queue->queue_lock);
1380         }
1381 }
1382
1383 /*
1384  * callback from sys_ioprio_set, irqs are disabled
1385  */
1386 static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1387 {
1388         struct cfq_io_context *cic;
1389         struct rb_node *n;
1390
1391         write_lock(&cfq_exit_lock);
1392
1393         n = rb_first(&ioc->cic_root);
1394         while (n != NULL) {
1395                 cic = rb_entry(n, struct cfq_io_context, rb_node);
1396  
1397                 changed_ioprio(cic);
1398                 n = rb_next(n);
1399         }
1400
1401         write_unlock(&cfq_exit_lock);
1402
1403         return 0;
1404 }
1405
1406 static struct cfq_queue *
1407 cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
1408               gfp_t gfp_mask)
1409 {
1410         const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1411         struct cfq_queue *cfqq, *new_cfqq = NULL;
1412         unsigned short ioprio;
1413
1414 retry:
1415         ioprio = tsk->ioprio;
1416         cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1417
1418         if (!cfqq) {
1419                 if (new_cfqq) {
1420                         cfqq = new_cfqq;
1421                         new_cfqq = NULL;
1422                 } else if (gfp_mask & __GFP_WAIT) {
1423                         spin_unlock_irq(cfqd->queue->queue_lock);
1424                         new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1425                         spin_lock_irq(cfqd->queue->queue_lock);
1426                         goto retry;
1427                 } else {
1428                         cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1429                         if (!cfqq)
1430                                 goto out;
1431                 }
1432
1433                 memset(cfqq, 0, sizeof(*cfqq));
1434
1435                 INIT_HLIST_NODE(&cfqq->cfq_hash);
1436                 INIT_LIST_HEAD(&cfqq->cfq_list);
1437                 RB_CLEAR_ROOT(&cfqq->sort_list);
1438                 INIT_LIST_HEAD(&cfqq->fifo);
1439
1440                 cfqq->key = key;
1441                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1442                 atomic_set(&cfqq->ref, 0);
1443                 cfqq->cfqd = cfqd;
1444                 cfqq->service_last = 0;
1445                 /*
1446                  * set ->slice_left to allow preemption for a new process
1447                  */
1448                 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1449                 cfq_mark_cfqq_idle_window(cfqq);
1450                 cfq_mark_cfqq_prio_changed(cfqq);
1451                 cfq_init_prio_data(cfqq);
1452         }
1453
1454         if (new_cfqq)
1455                 kmem_cache_free(cfq_pool, new_cfqq);
1456
1457         atomic_inc(&cfqq->ref);
1458 out:
1459         WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1460         return cfqq;
1461 }
1462
1463 static struct cfq_io_context *
1464 cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1465 {
1466         struct rb_node *n = ioc->cic_root.rb_node;
1467         struct cfq_io_context *cic;
1468         void *key = cfqd;
1469
1470         while (n) {
1471                 cic = rb_entry(n, struct cfq_io_context, rb_node);
1472
1473                 if (key < cic->key)
1474                         n = n->rb_left;
1475                 else if (key > cic->key)
1476                         n = n->rb_right;
1477                 else
1478                         return cic;
1479         }
1480
1481         return NULL;
1482 }
1483
1484 static inline void
1485 cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1486              struct cfq_io_context *cic)
1487 {
1488         struct rb_node **p = &ioc->cic_root.rb_node;
1489         struct rb_node *parent = NULL;
1490         struct cfq_io_context *__cic;
1491
1492         read_lock(&cfq_exit_lock);
1493
1494         cic->ioc = ioc;
1495         cic->key = cfqd;
1496
1497         ioc->set_ioprio = cfq_ioc_set_ioprio;
1498
1499         while (*p) {
1500                 parent = *p;
1501                 __cic = rb_entry(parent, struct cfq_io_context, rb_node);
1502
1503                 if (cic->key < __cic->key)
1504                         p = &(*p)->rb_left;
1505                 else if (cic->key > __cic->key)
1506                         p = &(*p)->rb_right;
1507                 else
1508                         BUG();
1509         }
1510
1511         rb_link_node(&cic->rb_node, parent, p);
1512         rb_insert_color(&cic->rb_node, &ioc->cic_root);
1513         list_add(&cic->queue_list, &cfqd->cic_list);
1514         read_unlock(&cfq_exit_lock);
1515 }
1516
1517 /*
1518  * Setup general io context and cfq io context. There can be several cfq
1519  * io contexts per general io context, if this process is doing io to more
1520  * than one device managed by cfq.
1521  */
1522 static struct cfq_io_context *
1523 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1524 {
1525         struct io_context *ioc = NULL;
1526         struct cfq_io_context *cic;
1527
1528         might_sleep_if(gfp_mask & __GFP_WAIT);
1529
1530         ioc = get_io_context(gfp_mask);
1531         if (!ioc)
1532                 return NULL;
1533
1534         cic = cfq_cic_rb_lookup(cfqd, ioc);
1535         if (cic)
1536                 goto out;
1537
1538         cic = cfq_alloc_io_context(cfqd, gfp_mask);
1539         if (cic == NULL)
1540                 goto err;
1541
1542         cfq_cic_link(cfqd, ioc, cic);
1543 out:
1544         return cic;
1545 err:
1546         put_io_context(ioc);
1547         return NULL;
1548 }
1549
1550 static void
1551 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1552 {
1553         unsigned long elapsed, ttime;
1554
1555         /*
1556          * if this context already has stuff queued, thinktime is from
1557          * last queue not last end
1558          */
1559 #if 0
1560         if (time_after(cic->last_end_request, cic->last_queue))
1561                 elapsed = jiffies - cic->last_end_request;
1562         else
1563                 elapsed = jiffies - cic->last_queue;
1564 #else
1565                 elapsed = jiffies - cic->last_end_request;
1566 #endif
1567
1568         ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1569
1570         cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1571         cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1572         cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1573 }
1574
1575 #define sample_valid(samples)   ((samples) > 80)
1576
1577 /*
1578  * Disable idle window if the process thinks too long or seeks so much that
1579  * it doesn't matter
1580  */
1581 static void
1582 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1583                        struct cfq_io_context *cic)
1584 {
1585         int enable_idle = cfq_cfqq_idle_window(cfqq);
1586
1587         if (!cic->ioc->task || !cfqd->cfq_slice_idle)
1588                 enable_idle = 0;
1589         else if (sample_valid(cic->ttime_samples)) {
1590                 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1591                         enable_idle = 0;
1592                 else
1593                         enable_idle = 1;
1594         }
1595
1596         if (enable_idle)
1597                 cfq_mark_cfqq_idle_window(cfqq);
1598         else
1599                 cfq_clear_cfqq_idle_window(cfqq);
1600 }
1601
1602
1603 /*
1604  * Check if new_cfqq should preempt the currently active queue. Return 0 for
1605  * no or if we aren't sure, a 1 will cause a preempt.
1606  */
1607 static int
1608 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1609                    struct cfq_rq *crq)
1610 {
1611         struct cfq_queue *cfqq = cfqd->active_queue;
1612
1613         if (cfq_class_idle(new_cfqq))
1614                 return 0;
1615
1616         if (!cfqq)
1617                 return 1;
1618
1619         if (cfq_class_idle(cfqq))
1620                 return 1;
1621         if (!cfq_cfqq_wait_request(new_cfqq))
1622                 return 0;
1623         /*
1624          * if it doesn't have slice left, forget it
1625          */
1626         if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1627                 return 0;
1628         if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
1629                 return 1;
1630
1631         return 0;
1632 }
1633
1634 /*
1635  * cfqq preempts the active queue. if we allowed preempt with no slice left,
1636  * let it have half of its nominal slice.
1637  */
1638 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1639 {
1640         struct cfq_queue *__cfqq, *next;
1641
1642         list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1643                 cfq_resort_rr_list(__cfqq, 1);
1644
1645         if (!cfqq->slice_left)
1646                 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1647
1648         cfqq->slice_end = cfqq->slice_left + jiffies;
1649         __cfq_slice_expired(cfqd, cfqq, 1);
1650         __cfq_set_active_queue(cfqd, cfqq);
1651 }
1652
1653 /*
1654  * should really be a ll_rw_blk.c helper
1655  */
1656 static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1657 {
1658         request_queue_t *q = cfqd->queue;
1659
1660         if (!blk_queue_plugged(q))
1661                 q->request_fn(q);
1662         else
1663                 __generic_unplug_device(q);
1664 }
1665
1666 /*
1667  * Called when a new fs request (crq) is added (to cfqq). Check if there's
1668  * something we should do about it
1669  */
1670 static void
1671 cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1672                  struct cfq_rq *crq)
1673 {
1674         struct cfq_io_context *cic;
1675
1676         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
1677
1678         /*
1679          * we never wait for an async request and we don't allow preemption
1680          * of an async request. so just return early
1681          */
1682         if (!cfq_crq_is_sync(crq))
1683                 return;
1684
1685         cic = crq->io_context;
1686
1687         cfq_update_io_thinktime(cfqd, cic);
1688         cfq_update_idle_window(cfqd, cfqq, cic);
1689
1690         cic->last_queue = jiffies;
1691
1692         if (cfqq == cfqd->active_queue) {
1693                 /*
1694                  * if we are waiting for a request for this queue, let it rip
1695                  * immediately and flag that we must not expire this queue
1696                  * just now
1697                  */
1698                 if (cfq_cfqq_wait_request(cfqq)) {
1699                         cfq_mark_cfqq_must_dispatch(cfqq);
1700                         del_timer(&cfqd->idle_slice_timer);
1701                         cfq_start_queueing(cfqd, cfqq);
1702                 }
1703         } else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1704                 /*
1705                  * not the active queue - expire current slice if it is
1706                  * idle and has expired it's mean thinktime or this new queue
1707                  * has some old slice time left and is of higher priority
1708                  */
1709                 cfq_preempt_queue(cfqd, cfqq);
1710                 cfq_mark_cfqq_must_dispatch(cfqq);
1711                 cfq_start_queueing(cfqd, cfqq);
1712         }
1713 }
1714
1715 static void cfq_insert_request(request_queue_t *q, struct request *rq)
1716 {
1717         struct cfq_data *cfqd = q->elevator->elevator_data;
1718         struct cfq_rq *crq = RQ_DATA(rq);
1719         struct cfq_queue *cfqq = crq->cfq_queue;
1720
1721         cfq_init_prio_data(cfqq);
1722
1723         cfq_add_crq_rb(crq);
1724
1725         list_add_tail(&rq->queuelist, &cfqq->fifo);
1726
1727         if (rq_mergeable(rq))
1728                 cfq_add_crq_hash(cfqd, crq);
1729
1730         cfq_crq_enqueued(cfqd, cfqq, crq);
1731 }
1732
1733 static void cfq_completed_request(request_queue_t *q, struct request *rq)
1734 {
1735         struct cfq_rq *crq = RQ_DATA(rq);
1736         struct cfq_queue *cfqq = crq->cfq_queue;
1737         struct cfq_data *cfqd = cfqq->cfqd;
1738         const int sync = cfq_crq_is_sync(crq);
1739         unsigned long now;
1740
1741         now = jiffies;
1742
1743         WARN_ON(!cfqd->rq_in_driver);
1744         WARN_ON(!cfqq->on_dispatch[sync]);
1745         cfqd->rq_in_driver--;
1746         cfqq->on_dispatch[sync]--;
1747
1748         if (!cfq_class_idle(cfqq))
1749                 cfqd->last_end_request = now;
1750
1751         if (!cfq_cfqq_dispatched(cfqq)) {
1752                 if (cfq_cfqq_on_rr(cfqq)) {
1753                         cfqq->service_last = now;
1754                         cfq_resort_rr_list(cfqq, 0);
1755                 }
1756                 cfq_schedule_dispatch(cfqd);
1757         }
1758
1759         if (cfq_crq_is_sync(crq))
1760                 crq->io_context->last_end_request = now;
1761 }
1762
1763 static struct request *
1764 cfq_former_request(request_queue_t *q, struct request *rq)
1765 {
1766         struct cfq_rq *crq = RQ_DATA(rq);
1767         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1768
1769         if (rbprev)
1770                 return rb_entry_crq(rbprev)->request;
1771
1772         return NULL;
1773 }
1774
1775 static struct request *
1776 cfq_latter_request(request_queue_t *q, struct request *rq)
1777 {
1778         struct cfq_rq *crq = RQ_DATA(rq);
1779         struct rb_node *rbnext = rb_next(&crq->rb_node);
1780
1781         if (rbnext)
1782                 return rb_entry_crq(rbnext)->request;
1783
1784         return NULL;
1785 }
1786
1787 /*
1788  * we temporarily boost lower priority queues if they are holding fs exclusive
1789  * resources. they are boosted to normal prio (CLASS_BE/4)
1790  */
1791 static void cfq_prio_boost(struct cfq_queue *cfqq)
1792 {
1793         const int ioprio_class = cfqq->ioprio_class;
1794         const int ioprio = cfqq->ioprio;
1795
1796         if (has_fs_excl()) {
1797                 /*
1798                  * boost idle prio on transactions that would lock out other
1799                  * users of the filesystem
1800                  */
1801                 if (cfq_class_idle(cfqq))
1802                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1803                 if (cfqq->ioprio > IOPRIO_NORM)
1804                         cfqq->ioprio = IOPRIO_NORM;
1805         } else {
1806                 /*
1807                  * check if we need to unboost the queue
1808                  */
1809                 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1810                         cfqq->ioprio_class = cfqq->org_ioprio_class;
1811                 if (cfqq->ioprio != cfqq->org_ioprio)
1812                         cfqq->ioprio = cfqq->org_ioprio;
1813         }
1814
1815         /*
1816          * refile between round-robin lists if we moved the priority class
1817          */
1818         if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1819             cfq_cfqq_on_rr(cfqq))
1820                 cfq_resort_rr_list(cfqq, 0);
1821 }
1822
1823 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
1824 {
1825         if (rw == READ || process_sync(task))
1826                 return task->pid;
1827
1828         return CFQ_KEY_ASYNC;
1829 }
1830
1831 static inline int
1832 __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1833                 struct task_struct *task, int rw)
1834 {
1835 #if 1
1836         if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1837             !cfq_cfqq_must_alloc_slice(cfqq)) {
1838                 cfq_mark_cfqq_must_alloc_slice(cfqq);
1839                 return ELV_MQUEUE_MUST;
1840         }
1841
1842         return ELV_MQUEUE_MAY;
1843 #else
1844         if (!cfqq || task->flags & PF_MEMALLOC)
1845                 return ELV_MQUEUE_MAY;
1846         if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) {
1847                 if (cfq_cfqq_wait_request(cfqq))
1848                         return ELV_MQUEUE_MUST;
1849
1850                 /*
1851                  * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
1852                  * can quickly flood the queue with writes from a single task
1853                  */
1854                 if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) {
1855                         cfq_mark_cfqq_must_alloc_slice(cfqq);
1856                         return ELV_MQUEUE_MUST;
1857                 }
1858
1859                 return ELV_MQUEUE_MAY;
1860         }
1861         if (cfq_class_idle(cfqq))
1862                 return ELV_MQUEUE_NO;
1863         if (cfqq->allocated[rw] >= cfqd->max_queued) {
1864                 struct io_context *ioc = get_io_context(GFP_ATOMIC);
1865                 int ret = ELV_MQUEUE_NO;
1866
1867                 if (ioc && ioc->nr_batch_requests)
1868                         ret = ELV_MQUEUE_MAY;
1869
1870                 put_io_context(ioc);
1871                 return ret;
1872         }
1873
1874         return ELV_MQUEUE_MAY;
1875 #endif
1876 }
1877
1878 static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1879 {
1880         struct cfq_data *cfqd = q->elevator->elevator_data;
1881         struct task_struct *tsk = current;
1882         struct cfq_queue *cfqq;
1883
1884         /*
1885          * don't force setup of a queue from here, as a call to may_queue
1886          * does not necessarily imply that a request actually will be queued.
1887          * so just lookup a possibly existing queue, or return 'may queue'
1888          * if that fails
1889          */
1890         cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1891         if (cfqq) {
1892                 cfq_init_prio_data(cfqq);
1893                 cfq_prio_boost(cfqq);
1894
1895                 return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1896         }
1897
1898         return ELV_MQUEUE_MAY;
1899 }
1900
1901 static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1902 {
1903         struct cfq_data *cfqd = q->elevator->elevator_data;
1904         struct request_list *rl = &q->rq;
1905
1906         if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) {
1907                 smp_mb();
1908                 if (waitqueue_active(&rl->wait[READ]))
1909                         wake_up(&rl->wait[READ]);
1910         }
1911
1912         if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) {
1913                 smp_mb();
1914                 if (waitqueue_active(&rl->wait[WRITE]))
1915                         wake_up(&rl->wait[WRITE]);
1916         }
1917 }
1918
1919 /*
1920  * queue lock held here
1921  */
1922 static void cfq_put_request(request_queue_t *q, struct request *rq)
1923 {
1924         struct cfq_data *cfqd = q->elevator->elevator_data;
1925         struct cfq_rq *crq = RQ_DATA(rq);
1926
1927         if (crq) {
1928                 struct cfq_queue *cfqq = crq->cfq_queue;
1929                 const int rw = rq_data_dir(rq);
1930
1931                 BUG_ON(!cfqq->allocated[rw]);
1932                 cfqq->allocated[rw]--;
1933
1934                 put_io_context(crq->io_context->ioc);
1935
1936                 mempool_free(crq, cfqd->crq_pool);
1937                 rq->elevator_private = NULL;
1938
1939                 cfq_check_waiters(q, cfqq);
1940                 cfq_put_queue(cfqq);
1941         }
1942 }
1943
1944 /*
1945  * Allocate cfq data structures associated with this request.
1946  */
1947 static int
1948 cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
1949                 gfp_t gfp_mask)
1950 {
1951         struct cfq_data *cfqd = q->elevator->elevator_data;
1952         struct task_struct *tsk = current;
1953         struct cfq_io_context *cic;
1954         const int rw = rq_data_dir(rq);
1955         pid_t key = cfq_queue_pid(tsk, rw);
1956         struct cfq_queue *cfqq;
1957         struct cfq_rq *crq;
1958         unsigned long flags;
1959         int is_sync = key != CFQ_KEY_ASYNC;
1960
1961         might_sleep_if(gfp_mask & __GFP_WAIT);
1962
1963         cic = cfq_get_io_context(cfqd, gfp_mask);
1964
1965         spin_lock_irqsave(q->queue_lock, flags);
1966
1967         if (!cic)
1968                 goto queue_fail;
1969
1970         if (!cic->cfqq[is_sync]) {
1971                 cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
1972                 if (!cfqq)
1973                         goto queue_fail;
1974
1975                 cic->cfqq[is_sync] = cfqq;
1976         } else
1977                 cfqq = cic->cfqq[is_sync];
1978
1979         cfqq->allocated[rw]++;
1980         cfq_clear_cfqq_must_alloc(cfqq);
1981         cfqd->rq_starved = 0;
1982         atomic_inc(&cfqq->ref);
1983         spin_unlock_irqrestore(q->queue_lock, flags);
1984
1985         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1986         if (crq) {
1987                 RB_CLEAR(&crq->rb_node);
1988                 crq->rb_key = 0;
1989                 crq->request = rq;
1990                 INIT_HLIST_NODE(&crq->hash);
1991                 crq->cfq_queue = cfqq;
1992                 crq->io_context = cic;
1993
1994                 if (is_sync)
1995                         cfq_mark_crq_is_sync(crq);
1996                 else
1997                         cfq_clear_crq_is_sync(crq);
1998
1999                 rq->elevator_private = crq;
2000                 return 0;
2001         }
2002
2003         spin_lock_irqsave(q->queue_lock, flags);
2004         cfqq->allocated[rw]--;
2005         if (!(cfqq->allocated[0] + cfqq->allocated[1]))
2006                 cfq_mark_cfqq_must_alloc(cfqq);
2007         cfq_put_queue(cfqq);
2008 queue_fail:
2009         if (cic)
2010                 put_io_context(cic->ioc);
2011         /*
2012          * mark us rq allocation starved. we need to kickstart the process
2013          * ourselves if there are no pending requests that can do it for us.
2014          * that would be an extremely rare OOM situation
2015          */
2016         cfqd->rq_starved = 1;
2017         cfq_schedule_dispatch(cfqd);
2018         spin_unlock_irqrestore(q->queue_lock, flags);
2019         return 1;
2020 }
2021
2022 static void cfq_kick_queue(void *data)
2023 {
2024         request_queue_t *q = data;
2025         struct cfq_data *cfqd = q->elevator->elevator_data;
2026         unsigned long flags;
2027
2028         spin_lock_irqsave(q->queue_lock, flags);
2029
2030         if (cfqd->rq_starved) {
2031                 struct request_list *rl = &q->rq;
2032
2033                 /*
2034                  * we aren't guaranteed to get a request after this, but we
2035                  * have to be opportunistic
2036                  */
2037                 smp_mb();
2038                 if (waitqueue_active(&rl->wait[READ]))
2039                         wake_up(&rl->wait[READ]);
2040                 if (waitqueue_active(&rl->wait[WRITE]))
2041                         wake_up(&rl->wait[WRITE]);
2042         }
2043
2044         blk_remove_plug(q);
2045         q->request_fn(q);
2046         spin_unlock_irqrestore(q->queue_lock, flags);
2047 }
2048
2049 /*
2050  * Timer running if the active_queue is currently idling inside its time slice
2051  */
2052 static void cfq_idle_slice_timer(unsigned long data)
2053 {
2054         struct cfq_data *cfqd = (struct cfq_data *) data;
2055         struct cfq_queue *cfqq;
2056         unsigned long flags;
2057
2058         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2059
2060         if ((cfqq = cfqd->active_queue) != NULL) {
2061                 unsigned long now = jiffies;
2062
2063                 /*
2064                  * expired
2065                  */
2066                 if (time_after(now, cfqq->slice_end))
2067                         goto expire;
2068
2069                 /*
2070                  * only expire and reinvoke request handler, if there are
2071                  * other queues with pending requests
2072                  */
2073                 if (!cfqd->busy_queues) {
2074                         cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end);
2075                         add_timer(&cfqd->idle_slice_timer);
2076                         goto out_cont;
2077                 }
2078
2079                 /*
2080                  * not expired and it has a request pending, let it dispatch
2081                  */
2082                 if (!RB_EMPTY(&cfqq->sort_list)) {
2083                         cfq_mark_cfqq_must_dispatch(cfqq);
2084                         goto out_kick;
2085                 }
2086         }
2087 expire:
2088         cfq_slice_expired(cfqd, 0);
2089 out_kick:
2090         cfq_schedule_dispatch(cfqd);
2091 out_cont:
2092         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2093 }
2094
2095 /*
2096  * Timer running if an idle class queue is waiting for service
2097  */
2098 static void cfq_idle_class_timer(unsigned long data)
2099 {
2100         struct cfq_data *cfqd = (struct cfq_data *) data;
2101         unsigned long flags, end;
2102
2103         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2104
2105         /*
2106          * race with a non-idle queue, reset timer
2107          */
2108         end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2109         if (!time_after_eq(jiffies, end)) {
2110                 cfqd->idle_class_timer.expires = end;
2111                 add_timer(&cfqd->idle_class_timer);
2112         } else
2113                 cfq_schedule_dispatch(cfqd);
2114
2115         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2116 }
2117
2118 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2119 {
2120         del_timer_sync(&cfqd->idle_slice_timer);
2121         del_timer_sync(&cfqd->idle_class_timer);
2122         blk_sync_queue(cfqd->queue);
2123 }
2124
2125 static void cfq_exit_queue(elevator_t *e)
2126 {
2127         struct cfq_data *cfqd = e->elevator_data;
2128         request_queue_t *q = cfqd->queue;
2129
2130         cfq_shutdown_timer_wq(cfqd);
2131
2132         write_lock(&cfq_exit_lock);
2133         spin_lock_irq(q->queue_lock);
2134
2135         if (cfqd->active_queue)
2136                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2137
2138         while (!list_empty(&cfqd->cic_list)) {
2139                 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2140                                                         struct cfq_io_context,
2141                                                         queue_list);
2142                 if (cic->cfqq[ASYNC]) {
2143                         cfq_put_queue(cic->cfqq[ASYNC]);
2144                         cic->cfqq[ASYNC] = NULL;
2145                 }
2146                 if (cic->cfqq[SYNC]) {
2147                         cfq_put_queue(cic->cfqq[SYNC]);
2148                         cic->cfqq[SYNC] = NULL;
2149                 }
2150                 cic->key = NULL;
2151                 list_del_init(&cic->queue_list);
2152         }
2153
2154         spin_unlock_irq(q->queue_lock);
2155         write_unlock(&cfq_exit_lock);
2156
2157         cfq_shutdown_timer_wq(cfqd);
2158
2159         mempool_destroy(cfqd->crq_pool);
2160         kfree(cfqd->crq_hash);
2161         kfree(cfqd->cfq_hash);
2162         kfree(cfqd);
2163 }
2164
2165 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2166 {
2167         struct cfq_data *cfqd;
2168         int i;
2169
2170         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
2171         if (!cfqd)
2172                 return -ENOMEM;
2173
2174         memset(cfqd, 0, sizeof(*cfqd));
2175
2176         for (i = 0; i < CFQ_PRIO_LISTS; i++)
2177                 INIT_LIST_HEAD(&cfqd->rr_list[i]);
2178
2179         INIT_LIST_HEAD(&cfqd->busy_rr);
2180         INIT_LIST_HEAD(&cfqd->cur_rr);
2181         INIT_LIST_HEAD(&cfqd->idle_rr);
2182         INIT_LIST_HEAD(&cfqd->empty_list);
2183         INIT_LIST_HEAD(&cfqd->cic_list);
2184
2185         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
2186         if (!cfqd->crq_hash)
2187                 goto out_crqhash;
2188
2189         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
2190         if (!cfqd->cfq_hash)
2191                 goto out_cfqhash;
2192
2193         cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool);
2194         if (!cfqd->crq_pool)
2195                 goto out_crqpool;
2196
2197         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
2198                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
2199         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2200                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2201
2202         e->elevator_data = cfqd;
2203
2204         cfqd->queue = q;
2205
2206         cfqd->max_queued = q->nr_requests / 4;
2207         q->nr_batching = cfq_queued;
2208
2209         init_timer(&cfqd->idle_slice_timer);
2210         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2211         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2212
2213         init_timer(&cfqd->idle_class_timer);
2214         cfqd->idle_class_timer.function = cfq_idle_class_timer;
2215         cfqd->idle_class_timer.data = (unsigned long) cfqd;
2216
2217         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2218
2219         cfqd->cfq_queued = cfq_queued;
2220         cfqd->cfq_quantum = cfq_quantum;
2221         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2222         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2223         cfqd->cfq_back_max = cfq_back_max;
2224         cfqd->cfq_back_penalty = cfq_back_penalty;
2225         cfqd->cfq_slice[0] = cfq_slice_async;
2226         cfqd->cfq_slice[1] = cfq_slice_sync;
2227         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2228         cfqd->cfq_slice_idle = cfq_slice_idle;
2229         cfqd->cfq_max_depth = cfq_max_depth;
2230
2231         return 0;
2232 out_crqpool:
2233         kfree(cfqd->cfq_hash);
2234 out_cfqhash:
2235         kfree(cfqd->crq_hash);
2236 out_crqhash:
2237         kfree(cfqd);
2238         return -ENOMEM;
2239 }
2240
2241 static void cfq_slab_kill(void)
2242 {
2243         if (crq_pool)
2244                 kmem_cache_destroy(crq_pool);
2245         if (cfq_pool)
2246                 kmem_cache_destroy(cfq_pool);
2247         if (cfq_ioc_pool)
2248                 kmem_cache_destroy(cfq_ioc_pool);
2249 }
2250
2251 static int __init cfq_slab_setup(void)
2252 {
2253         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
2254                                         NULL, NULL);
2255         if (!crq_pool)
2256                 goto fail;
2257
2258         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2259                                         NULL, NULL);
2260         if (!cfq_pool)
2261                 goto fail;
2262
2263         cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2264                         sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2265         if (!cfq_ioc_pool)
2266                 goto fail;
2267
2268         return 0;
2269 fail:
2270         cfq_slab_kill();
2271         return -ENOMEM;
2272 }
2273
2274 /*
2275  * sysfs parts below -->
2276  */
2277
2278 static ssize_t
2279 cfq_var_show(unsigned int var, char *page)
2280 {
2281         return sprintf(page, "%d\n", var);
2282 }
2283
2284 static ssize_t
2285 cfq_var_store(unsigned int *var, const char *page, size_t count)
2286 {
2287         char *p = (char *) page;
2288
2289         *var = simple_strtoul(p, &p, 10);
2290         return count;
2291 }
2292
2293 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
2294 static ssize_t __FUNC(elevator_t *e, char *page)                        \
2295 {                                                                       \
2296         struct cfq_data *cfqd = e->elevator_data;                       \
2297         unsigned int __data = __VAR;                                    \
2298         if (__CONV)                                                     \
2299                 __data = jiffies_to_msecs(__data);                      \
2300         return cfq_var_show(__data, (page));                            \
2301 }
2302 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2303 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
2304 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2305 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2306 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2307 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2308 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2309 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2310 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2311 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2312 SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
2313 #undef SHOW_FUNCTION
2314
2315 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
2316 static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)    \
2317 {                                                                       \
2318         struct cfq_data *cfqd = e->elevator_data;                       \
2319         unsigned int __data;                                            \
2320         int ret = cfq_var_store(&__data, (page), count);                \
2321         if (__data < (MIN))                                             \
2322                 __data = (MIN);                                         \
2323         else if (__data > (MAX))                                        \
2324                 __data = (MAX);                                         \
2325         if (__CONV)                                                     \
2326                 *(__PTR) = msecs_to_jiffies(__data);                    \
2327         else                                                            \
2328                 *(__PTR) = __data;                                      \
2329         return ret;                                                     \
2330 }
2331 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2332 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
2333 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2334 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2335 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2336 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2337 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2338 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2339 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2340 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2341 STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
2342 #undef STORE_FUNCTION
2343
2344 #define CFQ_ATTR(name) \
2345         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2346
2347 static struct elv_fs_entry cfq_attrs[] = {
2348         CFQ_ATTR(quantum),
2349         CFQ_ATTR(queued),
2350         CFQ_ATTR(fifo_expire_sync),
2351         CFQ_ATTR(fifo_expire_async),
2352         CFQ_ATTR(back_seek_max),
2353         CFQ_ATTR(back_seek_penalty),
2354         CFQ_ATTR(slice_sync),
2355         CFQ_ATTR(slice_async),
2356         CFQ_ATTR(slice_async_rq),
2357         CFQ_ATTR(slice_idle),
2358         CFQ_ATTR(max_depth),
2359         __ATTR_NULL
2360 };
2361
2362 static struct elevator_type iosched_cfq = {
2363         .ops = {
2364                 .elevator_merge_fn =            cfq_merge,
2365                 .elevator_merged_fn =           cfq_merged_request,
2366                 .elevator_merge_req_fn =        cfq_merged_requests,
2367                 .elevator_dispatch_fn =         cfq_dispatch_requests,
2368                 .elevator_add_req_fn =          cfq_insert_request,
2369                 .elevator_activate_req_fn =     cfq_activate_request,
2370                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
2371                 .elevator_queue_empty_fn =      cfq_queue_empty,
2372                 .elevator_completed_req_fn =    cfq_completed_request,
2373                 .elevator_former_req_fn =       cfq_former_request,
2374                 .elevator_latter_req_fn =       cfq_latter_request,
2375                 .elevator_set_req_fn =          cfq_set_request,
2376                 .elevator_put_req_fn =          cfq_put_request,
2377                 .elevator_may_queue_fn =        cfq_may_queue,
2378                 .elevator_init_fn =             cfq_init_queue,
2379                 .elevator_exit_fn =             cfq_exit_queue,
2380                 .trim =                         cfq_trim,
2381         },
2382         .elevator_attrs =       cfq_attrs,
2383         .elevator_name =        "cfq",
2384         .elevator_owner =       THIS_MODULE,
2385 };
2386
2387 static int __init cfq_init(void)
2388 {
2389         int ret;
2390
2391         /*
2392          * could be 0 on HZ < 1000 setups
2393          */
2394         if (!cfq_slice_async)
2395                 cfq_slice_async = 1;
2396         if (!cfq_slice_idle)
2397                 cfq_slice_idle = 1;
2398
2399         if (cfq_slab_setup())
2400                 return -ENOMEM;
2401
2402         ret = elv_register(&iosched_cfq);
2403         if (ret)
2404                 cfq_slab_kill();
2405
2406         return ret;
2407 }
2408
2409 static void __exit cfq_exit(void)
2410 {
2411         DECLARE_COMPLETION(all_gone);
2412         elv_unregister(&iosched_cfq);
2413         ioc_gone = &all_gone;
2414         barrier();
2415         if (atomic_read(&ioc_count))
2416                 complete(ioc_gone);
2417         synchronize_rcu();
2418         cfq_slab_kill();
2419 }
2420
2421 module_init(cfq_init);
2422 module_exit(cfq_exit);
2423
2424 MODULE_AUTHOR("Jens Axboe");
2425 MODULE_LICENSE("GPL");
2426 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");