nv-tegra.nvidia Code Review - linux-2.6.git/blobdiff - fs/ext4/inode.c
ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks
[linux-2.6.git] / fs / ext4 / inode.c
index 5fb72a98ccbe9b3d2cf3b3dc9b44f467a4c77d13..20e2d704dc2ef7b45ae1296b3dde4121da47d1e9 100644 (file)
@@ -1144,6 +1144,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
        return 0;
 }
 
+/*
+ * Return the length of the run of consecutive dirty pages of the inode
+ * that starts at page frame idx, counting no more than max_pages.  Only
+ * pages whose buffers are all delayed or unwritten extend the run.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+                                   unsigned int max_pages)
+{
+       struct address_space *mapping = inode->i_mapping;
+       struct pagevec pvec;
+       pgoff_t index, num = 0;
+       int i, nr_pages, done = 0;
+
+       pagevec_init(&pvec, 0);
+       /* Stop once max_pages are counted (immediately if it is 0). */
+       while (!done && num < max_pages) {
+               index = idx;
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                             PAGECACHE_TAG_DIRTY,
+                                             (pgoff_t)PAGEVEC_SIZE);
+               if (nr_pages == 0)
+                       break;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+                       struct buffer_head *bh, *head;
+
+                       lock_page(page);
+                       if (unlikely(page->mapping != mapping) ||
+                           !PageDirty(page) ||
+                           PageWriteback(page) ||
+                           page->index != idx) {
+                               done = 1;
+                               unlock_page(page);
+                               break;
+                       }
+                       /* page_buffers() is only valid if buffers exist */
+                       if (page_has_buffers(page)) {
+                               bh = head = page_buffers(page);
+                               do {
+                                       if (!buffer_delay(bh) &&
+                                           !buffer_unwritten(bh))
+                                               done = 1;
+                                       bh = bh->b_this_page;
+                               } while (!done && bh != head);
+                       }
+                       unlock_page(page);
+                       if (done)
+                               break;
+                       idx++;
+                       num++;
+                       if (num >= max_pages)
+                               break;
+               }
+               pagevec_release(&pvec);
+       }
+       return num;
+}
+
 /*
  * The ext4_get_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
@@ -2743,8 +2801,10 @@ static int ext4_da_writepages(struct address_space *mapping,
        int no_nrwrite_index_update;
        int pages_written = 0;
        long pages_skipped;
+       unsigned int max_pages;
        int range_cyclic, cycled = 1, io_done = 0;
-       int needed_blocks, ret = 0, nr_to_writebump = 0;
+       int needed_blocks, ret = 0;
+       long desired_nr_to_write, nr_to_writebump = 0;
        loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
@@ -2771,16 +2831,6 @@ static int ext4_da_writepages(struct address_space *mapping,
        if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
                return -EROFS;
 
-       /*
-        * Make sure nr_to_write is >= sbi->s_mb_stream_request
-        * This make sure small files blocks are allocated in
-        * single attempt. This ensure that small files
-        * get less fragmented.
-        */
-       if (wbc->nr_to_write < sbi->s_mb_stream_request) {
-               nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
-               wbc->nr_to_write = sbi->s_mb_stream_request;
-       }
        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                range_whole = 1;
 
@@ -2795,6 +2845,36 @@ static int ext4_da_writepages(struct address_space *mapping,
        } else
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
+       /*
+        * This works around two forms of stupidity.  The first is in
+        * the writeback code, which caps the maximum number of pages
+        * written to be 1024 pages.  This is wrong on multiple
+        * levels; different architectures have a different page size,
+        * which changes the maximum amount of data which gets
+        * written.  Secondly, 4 megabytes is way too small.  XFS
+        * forces this value to be 16 megabytes by multiplying
+        * nr_to_write parameter by four, and then relies on its
+        * allocator to allocate larger extents to make them
+        * contiguous.  Unfortunately this brings us to the second
+        * stupidity, which is that ext4's mballoc code only allocates
+        * at most 2048 blocks.  So we force contiguous writes up to
+        * the number of dirty blocks in the inode, or
+        * sbi->s_max_writeback_mb_bump, whichever is smaller.
+        */
+       max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+       if (!range_cyclic && range_whole)
+               desired_nr_to_write = wbc->nr_to_write * 8;
+       else
+               desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+                                                          max_pages);
+       if (desired_nr_to_write > max_pages)
+               desired_nr_to_write = max_pages;
+
+       if (wbc->nr_to_write < desired_nr_to_write) {
+               nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+               wbc->nr_to_write = desired_nr_to_write;
+       }
+
        mpd.wbc = wbc;
        mpd.inode = mapping->host;
 
@@ -2914,7 +2994,8 @@ retry:
 out_writepages:
        if (!no_nrwrite_index_update)
                wbc->no_nrwrite_index_update = 0;
-       wbc->nr_to_write -= nr_to_writebump;
+       if (wbc->nr_to_write > nr_to_writebump)
+               wbc->nr_to_write -= nr_to_writebump;
        wbc->range_start = range_start;
        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
        return ret;