Btrfs: break out of shrink_delalloc earlier
Chris Mason [Sat, 12 Mar 2011 12:08:42 +0000 (07:08 -0500)]
Josef had changed shrink_delalloc to exit after three shrink
attempts, which wasn't quite enough because new writers could
race in and steal free space.

But it also fixed deadlocks and stalls as we tried to recover
delalloc reservations.  The code was tweaked to loop 1024
times, and would reset the counter any time a small amount
of progress was made.  This was too drastic, and with a
lot of writers we can end up stuck in shrink_delalloc forever.

The shrink_delalloc loop is fairly complex because the caller is looping
too, and the caller will go ahead and force a transaction commit to make
sure we reclaim space.

This reworks things to exit shrink_delalloc when we've forced some
writeback and the delalloc reservations have gone down.  This means
the writeback has not just started but has also finished at
least some of the metadata changes required to reclaim delalloc
space.

If we've got this wrong, we're returning ENOSPC too early, which
is a big improvement over the current behavior of hanging the machine.

Test 224 in xfstests hammers on this nicely, and with 1000 writers
trying to fill a 1GB drive we get our first ENOSPC at 93% full.  The
other writers are able to continue until we get 100%.

This is a worst case test for btrfs because the 1000 writers are doing
small IO, and the small FS size means we don't have a lot of room
for metadata chunks.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

fs/btrfs/ctree.h
fs/btrfs/extent-tree.c

index 28188a7..8b4b9d1 100644 (file)
@@ -729,6 +729,15 @@ struct btrfs_space_info {
        u64 disk_total;         /* total bytes on disk, takes mirrors into
                                   account */
 
+       /*
+        * we bump reservation progress every time we decrement
+        * bytes_reserved.  This way people waiting for reservations
+        * know something good has happened and they can check
+        * for progress.  The number here isn't to be trusted, it
+        * just shows reclaim activity
+        */
+       unsigned long reservation_progress;
+
        int full;               /* indicates that we cannot allocate any more
                                   chunks for this space */
        int force_alloc;        /* set if we need to force a chunk alloc for
index 100e409..f1db57d 100644 (file)
@@ -3343,15 +3343,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
        u64 max_reclaim;
        u64 reclaimed = 0;
        long time_left;
-       int pause = 1;
        int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
+       unsigned long progress;
 
        block_rsv = &root->fs_info->delalloc_block_rsv;
        space_info = block_rsv->space_info;
 
        smp_mb();
        reserved = space_info->bytes_reserved;
+       progress = space_info->reservation_progress;
 
        if (reserved == 0)
                return 0;
@@ -3366,31 +3367,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
                spin_lock(&space_info->lock);
-               if (reserved > space_info->bytes_reserved) {
-                       loops = 0;
+               if (reserved > space_info->bytes_reserved)
                        reclaimed += reserved - space_info->bytes_reserved;
-               } else {
-                       loops++;
-               }
                reserved = space_info->bytes_reserved;
                spin_unlock(&space_info->lock);
 
+               loops++;
+
                if (reserved == 0 || reclaimed >= max_reclaim)
                        break;
 
                if (trans && trans->transaction->blocked)
                        return -EAGAIN;
 
-               __set_current_state(TASK_INTERRUPTIBLE);
-               time_left = schedule_timeout(pause);
+               time_left = schedule_timeout_interruptible(1);
 
                /* We were interrupted, exit */
                if (time_left)
                        break;
 
-               pause <<= 1;
-               if (pause > HZ / 10)
-                       pause = HZ / 10;
+               /* we've kicked the IO a few times, if anything has been freed,
+                * exit.  There is no sense in looping here for a long time
+                * when we really need to commit the transaction, or there are
+                * just too many writers without enough free space
+                */
+
+               if (loops > 3) {
+                       smp_mb();
+                       if (progress != space_info->reservation_progress)
+                               break;
+               }
 
        }
        return reclaimed >= to_reclaim;
@@ -3613,6 +3619,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
                if (num_bytes) {
                        spin_lock(&space_info->lock);
                        space_info->bytes_reserved -= num_bytes;
+                       space_info->reservation_progress++;
                        spin_unlock(&space_info->lock);
                }
        }
@@ -3845,6 +3852,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
        if (block_rsv->reserved >= block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
                sinfo->bytes_reserved -= num_bytes;
+               sinfo->reservation_progress++;
                block_rsv->reserved = block_rsv->size;
                block_rsv->full = 1;
        }
@@ -4006,7 +4014,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                to_reserve = 0;
        }
        spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
        to_reserve += calc_csum_metadata_size(inode, num_bytes);
        ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
        if (ret)
@@ -4134,6 +4141,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        btrfs_set_block_group_used(&cache->item, old_val);
                        cache->reserved -= num_bytes;
                        cache->space_info->bytes_reserved -= num_bytes;
+                       cache->space_info->reservation_progress++;
                        cache->space_info->bytes_used += num_bytes;
                        cache->space_info->disk_used += num_bytes * factor;
                        spin_unlock(&cache->lock);
@@ -4185,6 +4193,7 @@ static int pin_down_extent(struct btrfs_root *root,
        if (reserved) {
                cache->reserved -= num_bytes;
                cache->space_info->bytes_reserved -= num_bytes;
+               cache->space_info->reservation_progress++;
        }
        spin_unlock(&cache->lock);
        spin_unlock(&cache->space_info->lock);
@@ -4235,6 +4244,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                space_info->bytes_readonly += num_bytes;
                        cache->reserved -= num_bytes;
                        space_info->bytes_reserved -= num_bytes;
+                       space_info->reservation_progress++;
                }
                spin_unlock(&cache->lock);
                spin_unlock(&space_info->lock);
@@ -4713,6 +4723,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                if (ret) {
                        spin_lock(&cache->space_info->lock);
                        cache->space_info->bytes_reserved -= buf->len;
+                       cache->space_info->reservation_progress++;
                        spin_unlock(&cache->space_info->lock);
                }
                goto out;