readahead: clean up and simplify the code for filemap page fault readahead
Linus Torvalds [Tue, 16 Jun 2009 22:31:25 +0000 (15:31 -0700)]
This shouldn't really change behavior all that much, but the single rather
complex function with read-ahead inside a loop etc is broken up into more
manageable pieces.

The behaviour is also less subtle, with the read-ahead being done up-front
rather than inside some subtle loop and thus avoiding the now unnecessary
extra state variables (ie "did_readaround" is gone).

Fengguang: the code split in fact fixed a bug reported by Pavel Levshin:
the PGMAJFAULT accounting used to be bypassed when MADV_RANDOM was set,
because in that case the original code jumped directly to the
no_cached_page read path.

Cc: Pavel Levshin <lpk@581.spb.su>
Cc: <wli@movementarian.org>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

mm/filemap.c

index dcef9fd..8275364 100644 (file)
@@ -1456,6 +1456,68 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 
 #define MMAP_LOTSAMISS  (100)
 
+/*
+ * Synchronous readahead happens when we don't even find
+ * a page in the page cache at all.
+ *
+ * filemap_fault() calls this after find_get_page() found nothing at
+ * @offset and before it looks the page up again.  Depending on the read
+ * hints of @vma this does one of:
+ *  - random hint: nothing at all;
+ *  - sequential hint: page_cache_sync_readahead() at @offset;
+ *  - otherwise: count a miss in @ra->mmap_miss and, unless that counter
+ *    has passed MMAP_LOTSAMISS, request a read-around window of
+ *    max_sane_readahead(@ra->ra_pages) pages that starts half a window
+ *    before @offset (or at page 0 if @offset is too small for that).
+ *
+ * @vma:    vma the fault was taken in; only its read hints are examined
+ * @ra:     readahead state to use and update (the caller passes
+ *          &file->f_ra)
+ * @file:   file being faulted; its f_mapping is the mapping read into
+ * @offset: page offset of the fault (vmf->pgoff in the caller)
+ *
+ * Nothing is returned and the results of the readahead calls are not
+ * checked here.
+ */
+static void do_sync_mmap_readahead(struct vm_area_struct *vma,
+                                  struct file_ra_state *ra,
+                                  struct file *file,
+                                  pgoff_t offset)
+{
+       unsigned long ra_pages;
+       struct address_space *mapping = file->f_mapping;
+
+       /* If we don't want any read-ahead, don't bother */
+       if (VM_RandomReadHint(vma))
+               return;
+
+       if (VM_SequentialReadHint(vma)) {       /* mmap_miss left untouched */
+               page_cache_sync_readahead(mapping, ra, file, offset, 1);
+               return;
+       }
+
+       if (ra->mmap_miss < INT_MAX)            /* stop counting at INT_MAX */
+               ra->mmap_miss++;
+
+       /*
+        * Do we miss much more than hit in this file? If so,
+        * stop bothering with read-ahead. It will only hurt.
+        * (Hits are credited in do_async_mmap_readahead(), which
+        * decrements the same counter.)
+        */
+       if (ra->mmap_miss > MMAP_LOTSAMISS)
+               return;
+
+       ra_pages = max_sane_readahead(ra->ra_pages);
+       if (ra_pages) {                 /* a zero-sized window reads nothing */
+               pgoff_t start = 0;      /* used when offset <= ra_pages / 2 */
+
+               if (offset > ra_pages / 2)
+                       start = offset - ra_pages / 2;  /* half the window precedes offset */
+               do_page_cache_readahead(mapping, file, start, ra_pages);
+       }
+}
+
+/*
+ * Asynchronous readahead happens when we find the page and PG_readahead,
+ * so we want to possibly extend the readahead further.
+ *
+ * filemap_fault() calls this right after find_get_page() returned @page
+ * and before it takes the page lock.  Unless @vma carries the random read
+ * hint, the call is counted as a hit (one is taken off @ra->mmap_miss,
+ * never going below zero) and, if PageReadahead(@page) is true,
+ * page_cache_async_readahead() is called for @offset.
+ *
+ * @vma:    vma the fault was taken in; only its random read hint is checked
+ * @ra:     readahead state to use and update (the caller passes
+ *          &file->f_ra)
+ * @file:   file being faulted; its f_mapping is the mapping read into
+ * @page:   the page found in the page cache at @offset
+ * @offset: page offset of the fault (vmf->pgoff in the caller)
+ *
+ * Nothing is returned.
+ */
+static void do_async_mmap_readahead(struct vm_area_struct *vma,
+                                   struct file_ra_state *ra,
+                                   struct file *file,
+                                   struct page *page,
+                                   pgoff_t offset)
+{
+       struct address_space *mapping = file->f_mapping;
+
+       /* If we don't want any read-ahead, don't bother */
+       if (VM_RandomReadHint(vma))
+               return;
+       if (ra->mmap_miss > 0)          /* credit a hit, but never go below 0 */
+               ra->mmap_miss--;
+       if (PageReadahead(page))        /* page carries the readahead mark */
+               page_cache_async_readahead(mapping, ra, file, page, offset, 1);
+}
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:       vma in which the fault was taken
@@ -1475,78 +1537,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct address_space *mapping = file->f_mapping;
        struct file_ra_state *ra = &file->f_ra;
        struct inode *inode = mapping->host;
+       pgoff_t offset = vmf->pgoff;
        struct page *page;
        pgoff_t size;
-       int did_readaround = 0;
        int ret = 0;
 
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       if (vmf->pgoff >= size)
+       if (offset >= size)
                return VM_FAULT_SIGBUS;
 
-       /* If we don't want any read-ahead, don't bother */
-       if (VM_RandomReadHint(vma))
-               goto no_cached_page;
-
        /*
         * Do we have something in the page cache already?
         */
-retry_find:
-       page = find_lock_page(mapping, vmf->pgoff);
-       /*
-        * For sequential accesses, we use the generic readahead logic.
-        */
-       if (VM_SequentialReadHint(vma)) {
-               if (!page) {
-                       page_cache_sync_readahead(mapping, ra, file,
-                                                          vmf->pgoff, 1);
-                       page = find_lock_page(mapping, vmf->pgoff);
-                       if (!page)
-                               goto no_cached_page;
-               }
-               if (PageReadahead(page)) {
-                       page_cache_async_readahead(mapping, ra, file, page,
-                                                          vmf->pgoff, 1);
-               }
-       }
-
-       if (!page) {
-               unsigned long ra_pages;
-
-               ra->mmap_miss++;
-
+       page = find_get_page(mapping, offset);
+       if (likely(page)) {
                /*
-                * Do we miss much more than hit in this file? If so,
-                * stop bothering with read-ahead. It will only hurt.
+                * We found the page, so try async readahead before
+                * waiting for the lock.
                 */
-               if (ra->mmap_miss > MMAP_LOTSAMISS)
-                       goto no_cached_page;
+               do_async_mmap_readahead(vma, ra, file, page, offset);
+               lock_page(page);
 
-               /*
-                * To keep the pgmajfault counter straight, we need to
-                * check did_readaround, as this is an inner loop.
-                */
-               if (!did_readaround) {
-                       ret = VM_FAULT_MAJOR;
-                       count_vm_event(PGMAJFAULT);
-               }
-               did_readaround = 1;
-               ra_pages = max_sane_readahead(file->f_ra.ra_pages);
-               if (ra_pages) {
-                       pgoff_t start = 0;
-
-                       if (vmf->pgoff > ra_pages / 2)
-                               start = vmf->pgoff - ra_pages / 2;
-                       do_page_cache_readahead(mapping, file, start, ra_pages);
+               /* Did it get truncated? */
+               if (unlikely(page->mapping != mapping)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto no_cached_page;
                }
-               page = find_lock_page(mapping, vmf->pgoff);
+       } else {
+               /* No page in the page cache at all */
+               do_sync_mmap_readahead(vma, ra, file, offset);
+               count_vm_event(PGMAJFAULT);
+               ret = VM_FAULT_MAJOR;
+retry_find:
+               page = find_lock_page(mapping, offset);
                if (!page)
                        goto no_cached_page;
        }
 
-       if (!did_readaround)
-               ra->mmap_miss--;
-
        /*
         * We have a locked page in the page cache, now we need to check
         * that it's up-to-date. If not, it is going to be due to an error.
@@ -1554,18 +1582,18 @@ retry_find:
        if (unlikely(!PageUptodate(page)))
                goto page_not_uptodate;
 
-       /* Must recheck i_size under page lock */
+       /*
+        * Found the page and have a reference on it.
+        * We must recheck i_size under page lock.
+        */
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       if (unlikely(vmf->pgoff >= size)) {
+       if (unlikely(offset >= size)) {
                unlock_page(page);
                page_cache_release(page);
                return VM_FAULT_SIGBUS;
        }
 
-       /*
-        * Found the page and have a reference on it.
-        */
-       ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
+       ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
        vmf->page = page;
        return ret | VM_FAULT_LOCKED;
 
@@ -1574,7 +1602,7 @@ no_cached_page:
         * We're only likely to ever get here if MADV_RANDOM is in
         * effect.
         */
-       error = page_cache_read(file, vmf->pgoff);
+       error = page_cache_read(file, offset);
 
        /*
         * The page we want has now been added to the page cache.
@@ -1594,12 +1622,6 @@ no_cached_page:
        return VM_FAULT_SIGBUS;
 
 page_not_uptodate:
-       /* IO error path */
-       if (!did_readaround) {
-               ret = VM_FAULT_MAJOR;
-               count_vm_event(PGMAJFAULT);
-       }
-
        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,