writeback: fix WB_SYNC_NONE writeback from umount
Jens Axboe [Mon, 17 May 2010 10:55:07 +0000 (12:55 +0200)]
When umount calls sync_filesystem(), we first do a WB_SYNC_NONE
writeback to kick off writeback of pending dirty inodes, then follow
that up with a WB_SYNC_ALL to wait for it. Since umount already holds
the sb s_umount mutex, WB_SYNC_NONE ends up doing nothing and all
writeback happens as WB_SYNC_ALL. This can greatly slow down umount,
since WB_SYNC_ALL writeback is a data integrity operation and thus
a bigger hammer than simple WB_SYNC_NONE. For barrier aware file systems
it's a lot slower.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

fs/fs-writeback.c
fs/sync.c
include/linux/backing-dev.h
include/linux/writeback.h
mm/page-writeback.c

index 760dc8d..67db897 100644 (file)
@@ -45,6 +45,7 @@ struct wb_writeback_args {
        int for_kupdate:1;
        int range_cyclic:1;
        int for_background:1;
+       int sb_pinned:1;
 };
 
 /*
@@ -230,6 +231,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
                .sync_mode      = WB_SYNC_ALL,
                .nr_pages       = LONG_MAX,
                .range_cyclic   = 0,
+               /*
+                * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+                * lets make it explicitly clear.
+                */
+               .sb_pinned      = 1,
        };
        struct bdi_work work;
 
@@ -245,21 +251,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * @bdi: the backing device to write from
  * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
+ * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
  *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   completion. Caller specifies whether sb umount sem is held already or not.
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-                        long nr_pages)
+                        long nr_pages, int sb_locked)
 {
        struct wb_writeback_args args = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_NONE,
                .nr_pages       = nr_pages,
                .range_cyclic   = 1,
+               .sb_pinned      = sb_locked,
        };
 
        /*
@@ -577,7 +585,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
        /*
         * Caller must already hold the ref for this
         */
-       if (wbc->sync_mode == WB_SYNC_ALL) {
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
                WARN_ON(!rwsem_is_locked(&sb->s_umount));
                return SB_NOT_PINNED;
        }
@@ -751,6 +759,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                .for_kupdate            = args->for_kupdate,
                .for_background         = args->for_background,
                .range_cyclic           = args->range_cyclic,
+               .sb_pinned              = args->sb_pinned,
        };
        unsigned long oldest_jif;
        long wrote = 0;
@@ -1193,6 +1202,18 @@ static void wait_sb_inodes(struct super_block *sb)
        iput(old_inode);
 }
 
+static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
+{
+       unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+       unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+       long nr_to_write;
+
+       nr_to_write = nr_dirty + nr_unstable +
+                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+       bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
+}
+
 /**
  * writeback_inodes_sb -       writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1204,18 +1225,23 @@ static void wait_sb_inodes(struct super_block *sb)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-       unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-       unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-       long nr_to_write;
-
-       nr_to_write = nr_dirty + nr_unstable +
-                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-       bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+       __writeback_inodes_sb(sb, 0);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
+ * writeback_inodes_sb_locked  - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+       __writeback_inodes_sb(sb, 1);
+}
+
+/**
  * writeback_inodes_sb_if_idle -       start writeback if none underway
  * @sb: the superblock
  *
index 92b2281..de6a441 100644 (file)
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
        if (wait)
                sync_inodes_sb(sb);
        else
-               writeback_inodes_sb(sb);
+               writeback_inodes_sb_locked(sb);
 
        if (sb->s_op->sync_fs)
                sb->s_op->sync_fs(sb, wait);
index 7534979..ff8bac6 100644 (file)
@@ -106,7 +106,7 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-                               long nr_pages);
+                               long nr_pages, int sb_locked);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
index eb38a2c..47e1c68 100644 (file)
@@ -65,6 +65,15 @@ struct writeback_control {
         * so we use a single control to update them
         */
        unsigned no_nrwrite_index_update:1;
+
+       /*
+        * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE,
+        * the writeback code will pin the sb for the caller. However,
+        * for eg umount, the caller does WB_SYNC_NONE but already has
+        * the sb pinned. If the below is set, caller already has the
+        * sb pinned.
+        */
+       unsigned sb_pinned:1;
 };
 
 /*
@@ -73,6 +82,7 @@ struct writeback_control {
 struct bdi_writeback;
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *);
+void writeback_inodes_sb_locked(struct super_block *);
 int writeback_inodes_sb_if_idle(struct super_block *);
 void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
index d0f2b37..53b2fcf 100644 (file)
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
            (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
                               + global_page_state(NR_UNSTABLE_NFS))
                                          > background_thresh)))
-               bdi_start_writeback(bdi, NULL, 0);
+               bdi_start_writeback(bdi, NULL, 0, 0);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -705,7 +705,7 @@ void laptop_mode_timer_fn(unsigned long data)
         */
 
        if (bdi_has_dirty_io(&q->backing_dev_info))
-               bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages);
+               bdi_start_writeback(&q->backing_dev_info, NULL, 0, nr_pages);
 }
 
 /*