[PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23
[linux-2.6.git] / mm / highmem.c
1 /*
2  * High memory handling common code and variables.
3  *
4  * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
5  *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
6  *
7  *
8  * Redesigned the x86 32-bit VM architecture to deal with
9  * 64-bit physical space. With current x86 CPUs this
10  * means up to 64 Gigabytes physical RAM.
11  *
12  * Rewrote high memory support to move the page cache into
13  * high memory. Implemented permanent (schedulable) kmaps
14  * based on Linus' idea.
15  *
16  * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
17  */
18
19 #include <linux/mm.h>
20 #include <linux/module.h>
21 #include <linux/swap.h>
22 #include <linux/bio.h>
23 #include <linux/pagemap.h>
24 #include <linux/mempool.h>
25 #include <linux/blkdev.h>
26 #include <linux/init.h>
27 #include <linux/hash.h>
28 #include <linux/highmem.h>
29 #include <linux/blktrace_api.h>
30 #include <asm/tlbflush.h>
31
/*
 * Emergency page pools that back the bounce buffers:
 * page_pool is created by init_emergency_pool(),
 * isa_page_pool by init_emergency_isa_pool().
 */
static mempool_t *page_pool, *isa_page_pool;
33
34 static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data)
35 {
36         return alloc_page(gfp_mask | GFP_DMA);
37 }
38
/*
 * mempool free callback shared by both bounce pools: gives the page back
 * to the page allocator. @data is unused.
 */
static void page_pool_free(void *page, void *data)
{
        struct page *pg = page;

        __free_page(pg);
}
43
/*
 * Virtual_count (the per-slot value kept in pkmap_count[]) is not a
 * pure "count".
 *  0 means that it is not mapped, and has not been mapped
 *    since a TLB flush - it is usable.
 *  1 means that there are no users, but it has been mapped
 *    since the last TLB flush - so we can't use it.
 *  n means that there are (n-1) current users of it.
 */
52 #ifdef CONFIG_HIGHMEM
53
54 static void *page_pool_alloc(gfp_t gfp_mask, void *data)
55 {
56         return alloc_page(gfp_mask);
57 }
58
/*
 * Per-slot state of the pkmap area: 0 = usable, 1 = no users but not yet
 * flushed, n > 1 = (n-1) current users.
 */
static int pkmap_count[LAST_PKMAP];
/* Slot most recently examined by map_new_virtual(); the scan resumes after it. */
static unsigned int last_pkmap_nr;
/* Taken by kmap_high() and kunmap_high() around all pkmap bookkeeping. */
static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);

/* PTEs of the pkmap slots, indexed by slot number. */
pte_t * pkmap_page_table;

/* Tasks sleeping in map_new_virtual() until kunmap_high() wakes them. */
static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
66
67 static void flush_all_zero_pkmaps(void)
68 {
69         int i;
70
71         flush_cache_kmaps();
72
73         for (i = 0; i < LAST_PKMAP; i++) {
74                 struct page *page;
75
76                 /*
77                  * zero means we don't have anything to do,
78                  * >1 means that it is still in use. Only
79                  * a count of 1 means that it is free but
80                  * needs to be unmapped
81                  */
82                 if (pkmap_count[i] != 1)
83                         continue;
84                 pkmap_count[i] = 0;
85
86                 /* sanity check */
87                 if (pte_none(pkmap_page_table[i]))
88                         BUG();
89
90                 /*
91                  * Don't need an atomic fetch-and-clear op here;
92                  * no-one has the page mapped, and cannot get at
93                  * its virtual address (and hence PTE) without first
94                  * getting the kmap_lock (which is held here).
95                  * So no dangers, even with speculative execution.
96                  */
97                 page = pte_page(pkmap_page_table[i]);
98                 pte_clear(&init_mm, (unsigned long)page_address(page),
99                           &pkmap_page_table[i]);
100
101                 set_page_address(page, NULL);
102         }
103         flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
104 }
105
/*
 * Find a free pkmap slot, install a mapping for @page in it and return
 * the resulting kernel virtual address.
 *
 * Called by kmap_high() with kmap_lock held. The lock is dropped while
 * sleeping for a free slot and re-taken before returning or retrying.
 * A freshly mapped slot is left with a count of 1; the caller adds its
 * own reference on top of that.
 */
static inline unsigned long map_new_virtual(struct page *page)
{
        unsigned long vaddr;
        int count;

start:
        /* Number of slots still to be examined before giving up and sleeping */
        count = LAST_PKMAP;
        /* Find an empty entry */
        for (;;) {
                last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
                /*
                 * Wrapped around to slot 0: recycle all unused-but-mapped
                 * slots and restart the scan budget.
                 */
                if (!last_pkmap_nr) {
                        flush_all_zero_pkmaps();
                        count = LAST_PKMAP;
                }
                if (!pkmap_count[last_pkmap_nr])
                        break;  /* Found a usable entry */
                if (--count)
                        continue;

                /*
                 * Sleep for somebody else to unmap their entries
                 */
                {
                        DECLARE_WAITQUEUE(wait, current);

                        /*
                         * Queue ourselves before dropping kmap_lock so a
                         * wake-up issued by kunmap_high() in between is
                         * not missed.
                         */
                        __set_current_state(TASK_UNINTERRUPTIBLE);
                        add_wait_queue(&pkmap_map_wait, &wait);
                        spin_unlock(&kmap_lock);
                        schedule();
                        remove_wait_queue(&pkmap_map_wait, &wait);
                        spin_lock(&kmap_lock);

                        /* Somebody else might have mapped it while we slept */
                        if (page_address(page))
                                return (unsigned long)page_address(page);

                        /* Re-start */
                        goto start;
                }
        }
        /* Install the mapping in the slot that was found */
        vaddr = PKMAP_ADDR(last_pkmap_nr);
        set_pte_at(&init_mm, vaddr,
                   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));

        /* Mapped, no users yet; kmap_high() bumps this to 2 */
        pkmap_count[last_pkmap_nr] = 1;
        set_page_address(page, (void *)vaddr);

        return vaddr;
}
155
156 void fastcall *kmap_high(struct page *page)
157 {
158         unsigned long vaddr;
159
160         /*
161          * For highmem pages, we can't trust "virtual" until
162          * after we have the lock.
163          *
164          * We cannot call this from interrupts, as it may block
165          */
166         spin_lock(&kmap_lock);
167         vaddr = (unsigned long)page_address(page);
168         if (!vaddr)
169                 vaddr = map_new_virtual(page);
170         pkmap_count[PKMAP_NR(vaddr)]++;
171         if (pkmap_count[PKMAP_NR(vaddr)] < 2)
172                 BUG();
173         spin_unlock(&kmap_lock);
174         return (void*) vaddr;
175 }
176
177 EXPORT_SYMBOL(kmap_high);
178
179 void fastcall kunmap_high(struct page *page)
180 {
181         unsigned long vaddr;
182         unsigned long nr;
183         int need_wakeup;
184
185         spin_lock(&kmap_lock);
186         vaddr = (unsigned long)page_address(page);
187         if (!vaddr)
188                 BUG();
189         nr = PKMAP_NR(vaddr);
190
191         /*
192          * A count must never go down to zero
193          * without a TLB flush!
194          */
195         need_wakeup = 0;
196         switch (--pkmap_count[nr]) {
197         case 0:
198                 BUG();
199         case 1:
200                 /*
201                  * Avoid an unnecessary wake_up() function call.
202                  * The common case is pkmap_count[] == 1, but
203                  * no waiters.
204                  * The tasks queued in the wait-queue are guarded
205                  * by both the lock in the wait-queue-head and by
206                  * the kmap_lock.  As the kmap_lock is held here,
207                  * no need for the wait-queue-head's lock.  Simply
208                  * test if the queue is empty.
209                  */
210                 need_wakeup = waitqueue_active(&pkmap_map_wait);
211         }
212         spin_unlock(&kmap_lock);
213
214         /* do wake-up, if needed, race-free outside of the spin lock */
215         if (need_wakeup)
216                 wake_up(&pkmap_map_wait);
217 }
218
219 EXPORT_SYMBOL(kunmap_high);
220
221 #define POOL_SIZE       64
222
223 static __init int init_emergency_pool(void)
224 {
225         struct sysinfo i;
226         si_meminfo(&i);
227         si_swapinfo(&i);
228         
229         if (!i.totalhigh)
230                 return 0;
231
232         page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
233         if (!page_pool)
234                 BUG();
235         printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
236
237         return 0;
238 }
239
240 __initcall(init_emergency_pool);
241
242 /*
243  * highmem version, map in to vec
244  */
245 static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
246 {
247         unsigned long flags;
248         unsigned char *vto;
249
250         local_irq_save(flags);
251         vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
252         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
253         kunmap_atomic(vto, KM_BOUNCE_READ);
254         local_irq_restore(flags);
255 }
256
257 #else /* CONFIG_HIGHMEM */
258
/*
 * non-highmem version: no temporary mapping is taken, the copy goes
 * straight to page_address() of the destination page plus bv_offset.
 */
#define bounce_copy_vec(to, vfrom)      \
        memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
261
262 #endif
263
264 #define ISA_POOL_SIZE   16
265
266 /*
267  * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
268  * as the max address, so check if the pool has already been created.
269  */
270 int init_emergency_isa_pool(void)
271 {
272         if (isa_page_pool)
273                 return 0;
274
275         isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL);
276         if (!isa_page_pool)
277                 BUG();
278
279         printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
280         return 0;
281 }
282
283 /*
284  * Simple bounce buffer support for highmem pages. Depending on the
285  * queue gfp mask set, *to may or may not be a highmem page. kmap it
286  * always, it will do the Right Thing
287  */
288 static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
289 {
290         unsigned char *vfrom;
291         struct bio_vec *tovec, *fromvec;
292         int i;
293
294         __bio_for_each_segment(tovec, to, i, 0) {
295                 fromvec = from->bi_io_vec + i;
296
297                 /*
298                  * not bounced
299                  */
300                 if (tovec->bv_page == fromvec->bv_page)
301                         continue;
302
303                 /*
304                  * fromvec->bv_offset and fromvec->bv_len might have been
305                  * modified by the block layer, so use the original copy,
306                  * bounce_copy_vec already uses tovec->bv_len
307                  */
308                 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
309
310                 flush_dcache_page(tovec->bv_page);
311                 bounce_copy_vec(tovec, vfrom);
312         }
313 }
314
315 static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
316 {
317         struct bio *bio_orig = bio->bi_private;
318         struct bio_vec *bvec, *org_vec;
319         int i;
320
321         if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
322                 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
323
324         /*
325          * free up bounce indirect pages used
326          */
327         __bio_for_each_segment(bvec, bio, i, 0) {
328                 org_vec = bio_orig->bi_io_vec + i;
329                 if (bvec->bv_page == org_vec->bv_page)
330                         continue;
331
332                 mempool_free(bvec->bv_page, pool);      
333                 dec_page_state(nr_bounce);
334         }
335
336         bio_endio(bio_orig, bio_orig->bi_size, err);
337         bio_put(bio);
338 }
339
340 static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err)
341 {
342         if (bio->bi_size)
343                 return 1;
344
345         bounce_end_io(bio, page_pool, err);
346         return 0;
347 }
348
349 static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
350 {
351         if (bio->bi_size)
352                 return 1;
353
354         bounce_end_io(bio, isa_page_pool, err);
355         return 0;
356 }
357
358 static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
359 {
360         struct bio *bio_orig = bio->bi_private;
361
362         if (test_bit(BIO_UPTODATE, &bio->bi_flags))
363                 copy_to_high_bio_irq(bio_orig, bio);
364
365         bounce_end_io(bio, pool, err);
366 }
367
368 static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
369 {
370         if (bio->bi_size)
371                 return 1;
372
373         __bounce_end_io_read(bio, page_pool, err);
374         return 0;
375 }
376
377 static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
378 {
379         if (bio->bi_size)
380                 return 1;
381
382         __bounce_end_io_read(bio, isa_page_pool, err);
383         return 0;
384 }
385
/*
 * Build a bounce bio for *bio_orig if any of its segments sits in a page
 * whose pfn is not below q->bounce_pfn.
 *
 * Each such segment gets a replacement page from @pool; for writes the
 * data is copied into that page right away. Segments that need no
 * bouncing end up pointing at the original page. If nothing had to be
 * bounced, *bio_orig is left untouched. Otherwise the new bio takes over
 * device, sector, rw flags, vcnt, idx and size from the original, gets
 * the completion handler matching @pool and the data direction, stores
 * the original in bi_private, and *bio_orig is redirected to the new bio.
 */
static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
                        mempool_t *pool)
{
        struct page *page;
        struct bio *bio = NULL;
        int i, rw = bio_data_dir(*bio_orig);
        struct bio_vec *to, *from;

        bio_for_each_segment(from, *bio_orig, i) {
                page = from->bv_page;

                /*
                 * is destination page below bounce pfn?
                 */
                if (page_to_pfn(page) < q->bounce_pfn)
                        continue;

                /*
                 * irk, bounce it
                 */
                /* the bounce bio is allocated lazily, on the first hit */
                if (!bio)
                        bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);

                /* same segment index in the bounce bio as in the original */
                to = bio->bi_io_vec + i;

                to->bv_page = mempool_alloc(pool, q->bounce_gfp);
                to->bv_len = from->bv_len;
                to->bv_offset = from->bv_offset;
                inc_page_state(nr_bounce);

                /* writes need the payload in the bounce page before submission */
                if (rw == WRITE) {
                        char *vto, *vfrom;

                        flush_dcache_page(from->bv_page);
                        vto = page_address(to->bv_page) + to->bv_offset;
                        vfrom = kmap(from->bv_page) + from->bv_offset;
                        memcpy(vto, vfrom, to->bv_len);
                        kunmap(from->bv_page);
                }
        }

        /*
         * no pages bounced
         */
        if (!bio)
                return;

        /*
         * at least one page was bounced, fill in possible non-highmem
         * pages
         */
        __bio_for_each_segment(from, *bio_orig, i, 0) {
                to = bio_iovec_idx(bio, i);
                /*
                 * NOTE(review): relies on bv_page being NULL for segments
                 * not filled in above, presumably because bio_alloc()
                 * hands out a zeroed vector -- confirm.
                 */
                if (!to->bv_page) {
                        to->bv_page = from->bv_page;
                        to->bv_len = from->bv_len;
                        to->bv_offset = from->bv_offset;
                }
        }

        bio->bi_bdev = (*bio_orig)->bi_bdev;
        bio->bi_flags |= (1 << BIO_BOUNCED);
        bio->bi_sector = (*bio_orig)->bi_sector;
        bio->bi_rw = (*bio_orig)->bi_rw;

        bio->bi_vcnt = (*bio_orig)->bi_vcnt;
        bio->bi_idx = (*bio_orig)->bi_idx;
        bio->bi_size = (*bio_orig)->bi_size;

        /* pick the completion handler for this pool and data direction */
        if (pool == page_pool) {
                bio->bi_end_io = bounce_end_io_write;
                if (rw == READ)
                        bio->bi_end_io = bounce_end_io_read;
        } else {
                bio->bi_end_io = bounce_end_io_write_isa;
                if (rw == READ)
                        bio->bi_end_io = bounce_end_io_read_isa;
        }

        /* the completion handlers find the original bio through bi_private */
        bio->bi_private = *bio_orig;
        *bio_orig = bio;
}
468
469 void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
470 {
471         mempool_t *pool;
472
473         /*
474          * for non-isa bounce case, just check if the bounce pfn is equal
475          * to or bigger than the highest pfn in the system -- in that case,
476          * don't waste time iterating over bio segments
477          */
478         if (!(q->bounce_gfp & GFP_DMA)) {
479                 if (q->bounce_pfn >= blk_max_pfn)
480                         return;
481                 pool = page_pool;
482         } else {
483                 BUG_ON(!isa_page_pool);
484                 pool = isa_page_pool;
485         }
486
487         blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
488
489         /*
490          * slow path
491          */
492         __blk_queue_bounce(q, bio_orig, pool);
493 }
494
495 EXPORT_SYMBOL(blk_queue_bounce);
496
497 #if defined(HASHED_PAGE_VIRTUAL)
498
/* log2 of the number of buckets in page_address_htable */
#define PA_HASH_ORDER   7
500
/*
 * Describes one page->virtual association
 */
struct page_address_map {
        struct page *page;              /* the page being described */
        void *virtual;                  /* its current virtual address */
        struct list_head list;          /* links into a hash bucket or the freelist */
};
509
/*
 * page_address_map freelist, allocated from page_address_maps.
 * Filled by page_address_init(); set_page_address() takes entries from
 * it when adding an association and returns them when removing one.
 */
static struct list_head page_address_pool;      /* freelist */
static spinlock_t pool_lock;                    /* protects page_address_pool */
515
/*
 * Hash table bucket. The table has 1<<PA_HASH_ORDER buckets and is
 * indexed by hash_ptr() of the page pointer, see page_slot().
 */
static struct page_address_slot {
        struct list_head lh;                    /* List of page_address_maps */
        spinlock_t lock;                        /* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
523
524 static struct page_address_slot *page_slot(struct page *page)
525 {
526         return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
527 }
528
529 void *page_address(struct page *page)
530 {
531         unsigned long flags;
532         void *ret;
533         struct page_address_slot *pas;
534
535         if (!PageHighMem(page))
536                 return lowmem_page_address(page);
537
538         pas = page_slot(page);
539         ret = NULL;
540         spin_lock_irqsave(&pas->lock, flags);
541         if (!list_empty(&pas->lh)) {
542                 struct page_address_map *pam;
543
544                 list_for_each_entry(pam, &pas->lh, list) {
545                         if (pam->page == page) {
546                                 ret = pam->virtual;
547                                 goto done;
548                         }
549                 }
550         }
551 done:
552         spin_unlock_irqrestore(&pas->lock, flags);
553         return ret;
554 }
555
556 EXPORT_SYMBOL(page_address);
557
558 void set_page_address(struct page *page, void *virtual)
559 {
560         unsigned long flags;
561         struct page_address_slot *pas;
562         struct page_address_map *pam;
563
564         BUG_ON(!PageHighMem(page));
565
566         pas = page_slot(page);
567         if (virtual) {          /* Add */
568                 BUG_ON(list_empty(&page_address_pool));
569
570                 spin_lock_irqsave(&pool_lock, flags);
571                 pam = list_entry(page_address_pool.next,
572                                 struct page_address_map, list);
573                 list_del(&pam->list);
574                 spin_unlock_irqrestore(&pool_lock, flags);
575
576                 pam->page = page;
577                 pam->virtual = virtual;
578
579                 spin_lock_irqsave(&pas->lock, flags);
580                 list_add_tail(&pam->list, &pas->lh);
581                 spin_unlock_irqrestore(&pas->lock, flags);
582         } else {                /* Remove */
583                 spin_lock_irqsave(&pas->lock, flags);
584                 list_for_each_entry(pam, &pas->lh, list) {
585                         if (pam->page == page) {
586                                 list_del(&pam->list);
587                                 spin_unlock_irqrestore(&pas->lock, flags);
588                                 spin_lock_irqsave(&pool_lock, flags);
589                                 list_add_tail(&pam->list, &page_address_pool);
590                                 spin_unlock_irqrestore(&pool_lock, flags);
591                                 goto done;
592                         }
593                 }
594                 spin_unlock_irqrestore(&pas->lock, flags);
595         }
596 done:
597         return;
598 }
599
600 static struct page_address_map page_address_maps[LAST_PKMAP];
601
602 void __init page_address_init(void)
603 {
604         int i;
605
606         INIT_LIST_HEAD(&page_address_pool);
607         for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
608                 list_add(&page_address_maps[i].list, &page_address_pool);
609         for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
610                 INIT_LIST_HEAD(&page_address_htable[i].lh);
611                 spin_lock_init(&page_address_htable[i].lock);
612         }
613         spin_lock_init(&pool_lock);
614 }
615
616 #endif  /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */