Btrfs: cleanup scrub bio and worker wait code
[linux-3.10.git] fs/btrfs/extent-tree.c
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ec0328b..f8a358a 100644
@@ -33,6 +33,9 @@
 #include "volumes.h"
 #include "locking.h"
 #include "free-space-cache.h"
+#include "math.h"
+
+#undef SCRAMBLE_DELAYED_REFS
 
 /*
  * control flags for do_chunk_alloc's force field
@@ -92,8 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *extent_root, u64 alloc_bytes,
-                         u64 flags, int force);
+                         struct btrfs_root *extent_root, u64 flags,
+                         int force);
 static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -310,7 +313,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
        while (start < end) {
                ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
-                                           EXTENT_DIRTY | EXTENT_UPTODATE);
+                                           EXTENT_DIRTY | EXTENT_UPTODATE,
+                                           NULL);
                if (ret)
                        break;
 
@@ -646,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
        rcu_read_unlock();
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-       if (factor == 10)
-               return num;
-       num *= factor;
-       do_div(num, 10);
-       return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
-       if (factor == 100)
-               return num;
-       num *= factor;
-       do_div(num, 100);
-       return num;
-}
-
 u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
 {
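
(The div_factor() and div_factor_fine() helpers removed above are not lost: they now presumably come from the new "math.h" header added at the top of this diff, since callers remain. div_factor(n, f) computes n * f / 10 and div_factor_fine(n, f) computes n * f / 100, so the div_factor(num_bytes, 8) check added in should_alloc_chunk() below still means roughly 80% of num_bytes.)
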
@@ -2217,6 +2203,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        struct btrfs_delayed_extent_op *extent_op;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
        int count = 0;
        int must_insert_reserved = 0;
@@ -2249,13 +2236,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                }
 
                /*
+                * We need to try and merge add/drops of the same ref since we
+                * can run into issues with relocate dropping the implicit ref
+                * and then it being added back again before the drop can
+                * finish.  If we merged anything we need to re-loop so we can
+                * get a good ref.
+                */
+               btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+                                        locked_ref);
+
+               /*
                 * locked_ref is the head node, so we have to go one
                 * node back for any delayed ref updates
                 */
                ref = select_delayed_ref(locked_ref);
 
                if (ref && ref->seq &&
-                   btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+                   btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
                        /*
                         * there are still refs with lower seq numbers in the
                         * process of being added. Don't run this ref yet.
@@ -2300,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                                kfree(extent_op);
 
                                if (ret) {
+                                       list_del_init(&locked_ref->cluster);
+                                       mutex_unlock(&locked_ref->mutex);
+
                                        printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
                                        spin_lock(&delayed_refs->lock);
                                        return ret;
@@ -2315,12 +2315,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-               /*
-                * we modified num_entries, but as we're currently running
-                * delayed refs, skip
-                *     wake_up(&delayed_refs->seq_wait);
-                * here.
-                */
+               if (locked_ref) {
+                       /*
+                        * when we play the delayed ref, also correct the
+                        * ref_mod on head
+                        */
+                       switch (ref->action) {
+                       case BTRFS_ADD_DELAYED_REF:
+                       case BTRFS_ADD_DELAYED_EXTENT:
+                               locked_ref->node.ref_mod -= ref->ref_mod;
+                               break;
+                       case BTRFS_DROP_DELAYED_REF:
+                               locked_ref->node.ref_mod += ref->ref_mod;
+                               break;
+                       default:
+                               WARN_ON(1);
+                       }
+               }
                spin_unlock(&delayed_refs->lock);
 
                ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2331,35 +2342,97 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                count++;
 
                if (ret) {
+                       if (locked_ref) {
+                               list_del_init(&locked_ref->cluster);
+                               mutex_unlock(&locked_ref->mutex);
+                       }
                        printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
                        spin_lock(&delayed_refs->lock);
                        return ret;
                }
 
 next:
-               do_chunk_alloc(trans, root->fs_info->extent_root,
-                              2 * 1024 * 1024,
-                              btrfs_get_alloc_profile(root, 0),
-                              CHUNK_ALLOC_NO_FORCE);
                cond_resched();
                spin_lock(&delayed_refs->lock);
        }
        return count;
 }
 
-static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
-                              unsigned long num_refs,
-                              struct list_head *first_seq)
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
 {
-       spin_unlock(&delayed_refs->lock);
-       pr_debug("waiting for more refs (num %ld, first %p)\n",
-                num_refs, first_seq);
-       wait_event(delayed_refs->seq_wait,
-                  num_refs != delayed_refs->num_entries ||
-                  delayed_refs->seq_head.next != first_seq);
-       pr_debug("done waiting for more refs (num %ld, first %p)\n",
-                delayed_refs->num_entries, delayed_refs->seq_head.next);
-       spin_lock(&delayed_refs->lock);
+       struct rb_node *n = root->rb_node;
+       struct btrfs_delayed_ref_node *entry;
+       int alt = 1;
+       u64 middle;
+       u64 first = 0, last = 0;
+
+       n = rb_first(root);
+       if (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               first = entry->bytenr;
+       }
+       n = rb_last(root);
+       if (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               last = entry->bytenr;
+       }
+       n = root->rb_node;
+
+       while (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               WARN_ON(!entry->in_tree);
+
+               middle = entry->bytenr;
+
+               if (alt)
+                       n = n->rb_left;
+               else
+                       n = n->rb_right;
+
+               alt = 1 - alt;
+       }
+       return middle;
+}
+#endif
+
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info)
+{
+       struct qgroup_update *qgroup_update;
+       int ret = 0;
+
+       if (list_empty(&trans->qgroup_ref_list) !=
+           !trans->delayed_ref_elem.seq) {
+               /* list without seq or seq without list */
+               printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+                       list_empty(&trans->qgroup_ref_list) ? "" : " not",
+                       trans->delayed_ref_elem.seq);
+               BUG();
+       }
+
+       if (!trans->delayed_ref_elem.seq)
+               return 0;
+
+       while (!list_empty(&trans->qgroup_ref_list)) {
+               qgroup_update = list_first_entry(&trans->qgroup_ref_list,
+                                                struct qgroup_update, list);
+               list_del(&qgroup_update->list);
+               if (!ret)
+                       ret = btrfs_qgroup_account_ref(
+                                       trans, fs_info, qgroup_update->node,
+                                       qgroup_update->extent_op);
+               kfree(qgroup_update);
+       }
+
+       btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+       return ret;
 }
 
 /*
@@ -2379,13 +2452,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct list_head cluster;
-       struct list_head *first_seq = NULL;
        int ret;
        u64 delayed_start;
        int run_all = count == (unsigned long)-1;
        int run_most = 0;
-       unsigned long num_refs = 0;
-       int consider_waiting;
+       int loops;
 
        /* We'll clean this up in btrfs_cleanup_transaction */
        if (trans->aborted)
@@ -2394,15 +2465,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        if (root == root->fs_info->extent_root)
                root = root->fs_info->tree_root;
 
-       do_chunk_alloc(trans, root->fs_info->extent_root,
-                      2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
-                      CHUNK_ALLOC_NO_FORCE);
+       btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
 
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
 again:
-       consider_waiting = 0;
+       loops = 0;
        spin_lock(&delayed_refs->lock);
+
+#ifdef SCRAMBLE_DELAYED_REFS
+       delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+
        if (count == 0) {
                count = delayed_refs->num_entries * 2;
                run_most = 1;
@@ -2424,31 +2498,6 @@ again:
                if (ret)
                        break;
 
-               if (delayed_start >= delayed_refs->run_delayed_start) {
-                       if (consider_waiting == 0) {
-                               /*
-                                * btrfs_find_ref_cluster looped. let's do one
-                                * more cycle. if we don't run any delayed ref
-                                * during that cycle (because we can't because
-                                * all of them are blocked) and if the number of
-                                * refs doesn't change, we avoid busy waiting.
-                                */
-                               consider_waiting = 1;
-                               num_refs = delayed_refs->num_entries;
-                               first_seq = root->fs_info->tree_mod_seq_list.next;
-                       } else {
-                               wait_for_more_refs(delayed_refs,
-                                                  num_refs, first_seq);
-                               /*
-                                * after waiting, things have changed. we
-                                * dropped the lock and someone else might have
-                                * run some refs, built new clusters and so on.
-                                * therefore, we restart staleness detection.
-                                */
-                               consider_waiting = 0;
-                       }
-               }
-
                ret = run_clustered_refs(trans, root, &cluster);
                if (ret < 0) {
                        spin_unlock(&delayed_refs->lock);
@@ -2461,13 +2510,36 @@ again:
                if (count == 0)
                        break;
 
-               if (ret || delayed_refs->run_delayed_start == 0) {
+               if (delayed_start >= delayed_refs->run_delayed_start) {
+                       if (loops == 0) {
+                               /*
+                                * btrfs_find_ref_cluster looped. let's do one
+                                * more cycle. if we don't run any delayed ref
+                                * during that cycle (because we can't because
+                                * all of them are blocked), bail out.
+                                */
+                               loops = 1;
+                       } else {
+                               /*
+                                * no runnable refs left, stop trying
+                                */
+                               BUG_ON(run_all);
+                               break;
+                       }
+               }
+               if (ret) {
                        /* refs were run, let's reset staleness detection */
-                       consider_waiting = 0;
+                       loops = 0;
                }
        }
 
        if (run_all) {
+               if (!list_empty(&trans->new_bgs)) {
+                       spin_unlock(&delayed_refs->lock);
+                       btrfs_create_pending_block_groups(trans, root);
+                       spin_lock(&delayed_refs->lock);
+               }
+
                node = rb_first(&delayed_refs->root);
                if (!node)
                        goto out;
@@ -2502,6 +2574,7 @@ again:
        }
 out:
        spin_unlock(&delayed_refs->lock);
+       assert_qgroups_uptodate(trans);
        return 0;
 }
 
@@ -2581,8 +2654,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 
        node = rb_prev(node);
        if (node) {
+               int seq = ref->seq;
+
                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-               if (ref->bytenr == bytenr)
+               if (ref->bytenr == bytenr && ref->seq == seq)
                        goto out_unlock;
        }
 
@@ -2903,25 +2978,29 @@ again:
        }
 
        spin_lock(&block_group->lock);
-       if (block_group->cached != BTRFS_CACHE_FINISHED) {
-               /* We're not cached, don't bother trying to write stuff out */
+       if (block_group->cached != BTRFS_CACHE_FINISHED ||
+           !btrfs_test_opt(root, SPACE_CACHE)) {
+               /*
+                * don't bother trying to write stuff out _if_
+                * a) we're not cached,
+                * b) we're mounted with the nospace_cache option.
+                */
                dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                goto out_put;
        }
        spin_unlock(&block_group->lock);
 
-       num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+       /*
+        * Try to preallocate enough space based on how big the block group is.
+        * Keep in mind this has to include any pinned space which could end up
+        * taking up quite a bit since it's not folded into the other space
+        * cache.
+        */
+       num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
        if (!num_pages)
                num_pages = 1;
 
-       /*
-        * Just to make absolutely sure we have enough space, we're going to
-        * preallocate 12 pages worth of space for each block group.  In
-        * practice we ought to use at most 8, but we need extra space so we can
-        * add our header and have a terminator between the extents and the
-        * bitmaps.
-        */
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
 
@@ -3134,6 +3213,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        init_waitqueue_head(&found->wait);
        *space_info = found;
        list_add_rcu(&found->list, &info->space_info);
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               info->data_sinfo = found;
        return 0;
 }
 
@@ -3263,12 +3344,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
        return get_alloc_profile(root, flags);
 }
 
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
-       BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-                                                      BTRFS_BLOCK_GROUP_DATA);
-}
-
 /*
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
@@ -3277,6 +3352,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        u64 used;
        int ret = 0, committed = 0, alloc_chunk = 1;
 
@@ -3289,7 +3365,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
                committed = 1;
        }
 
-       data_sinfo = BTRFS_I(inode)->space_info;
+       data_sinfo = fs_info->data_sinfo;
        if (!data_sinfo)
                goto alloc;
 
@@ -3319,7 +3395,6 @@ alloc:
                                return PTR_ERR(trans);
 
                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                            bytes + 2 * 1024 * 1024,
                                             alloc_target,
                                             CHUNK_ALLOC_NO_FORCE);
                        btrfs_end_transaction(trans, root);
@@ -3330,10 +3405,9 @@ alloc:
                                        goto commit_trans;
                        }
 
-                       if (!data_sinfo) {
-                               btrfs_set_inode_space_info(root, inode);
-                               data_sinfo = BTRFS_I(inode)->space_info;
-                       }
+                       if (!data_sinfo)
+                               data_sinfo = fs_info->data_sinfo;
+
                        goto again;
                }
 
@@ -3380,7 +3454,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
        /* make sure bytes are sectorsize aligned */
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
-       data_sinfo = BTRFS_I(inode)->space_info;
+       data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
        data_sinfo->bytes_may_use -= bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
@@ -3402,8 +3476,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 }
 
 static int should_alloc_chunk(struct btrfs_root *root,
-                             struct btrfs_space_info *sinfo, u64 alloc_bytes,
-                             int force)
+                             struct btrfs_space_info *sinfo, int force)
 {
        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3418,7 +3491,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
         * and purposes it's used space.  Don't worry about locking the
         * global_rsv, it doesn't change except when the transaction commits.
         */
-       num_allocated += global_rsv->size;
+       if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+               num_allocated += global_rsv->size;
 
        /*
         * in limited mode, we want to have some free space up to
@@ -3432,15 +3506,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
                if (num_bytes - num_allocated < thresh)
                        return 1;
        }
-       thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
-       /* 256MB or 2% of the FS */
-       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
-       /* system chunks need a much small threshold */
-       if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               thresh = 32 * 1024 * 1024;
-
-       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
+       if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
                return 0;
        return 1;
 }
@@ -3490,8 +3557,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
 }
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *extent_root, u64 alloc_bytes,
-                         u64 flags, int force)
+                         struct btrfs_root *extent_root, u64 flags, int force)
 {
        struct btrfs_space_info *space_info;
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3515,7 +3581,7 @@ again:
                return 0;
        }
 
-       if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
+       if (!should_alloc_chunk(extent_root, space_info, force)) {
                spin_unlock(&space_info->lock);
                return 0;
        } else if (space_info->chunk_alloc) {
@@ -3583,92 +3649,110 @@ out:
        return ret;
 }
 
+static int can_overcommit(struct btrfs_root *root,
+                         struct btrfs_space_info *space_info, u64 bytes,
+                         enum btrfs_reserve_flush_enum flush)
+{
+       u64 profile = btrfs_get_alloc_profile(root, 0);
+       u64 avail;
+       u64 used;
+
+       used = space_info->bytes_used + space_info->bytes_reserved +
+               space_info->bytes_pinned + space_info->bytes_readonly +
+               space_info->bytes_may_use;
+
+       spin_lock(&root->fs_info->free_chunk_lock);
+       avail = root->fs_info->free_chunk_space;
+       spin_unlock(&root->fs_info->free_chunk_lock);
+
+       /*
+        * If we have dup, raid1 or raid10 then only half of the free
+        * space is actually useable.
+        */
+       if (profile & (BTRFS_BLOCK_GROUP_DUP |
+                      BTRFS_BLOCK_GROUP_RAID1 |
+                      BTRFS_BLOCK_GROUP_RAID10))
+               avail >>= 1;
+
+       /*
+        * If we aren't flushing all things, let us overcommit up to
+        * half of the space. If we can flush, don't let us overcommit
+        * too much, let it overcommit up to 1/8 of the space.
+        */
+       if (flush == BTRFS_RESERVE_FLUSH_ALL)
+               avail >>= 3;
+       else
+               avail >>= 1;
+
+       if (used + bytes < space_info->total_bytes + avail)
+               return 1;
+       return 0;
+}
+
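
(Worked example for can_overcommit(): with 8 GiB of free_chunk_space and a DUP, RAID1 or RAID10 profile, avail is first halved to 4 GiB; a BTRFS_RESERVE_FLUSH_ALL caller then shifts that down to 512 MiB, while any other caller only halves it again to 2 GiB. The reservation may overcommit only while used + bytes stays below total_bytes + avail.)
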
 /*
  * shrink metadata reservation for delalloc
  */
-static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
-                          bool wait_ordered)
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+                           bool wait_ordered)
 {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
-       u64 reserved;
+       u64 delalloc_bytes;
        u64 max_reclaim;
-       u64 reclaimed = 0;
        long time_left;
        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
-       unsigned long progress;
+       enum btrfs_reserve_flush_enum flush;
 
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
        space_info = block_rsv->space_info;
 
        smp_mb();
-       reserved = space_info->bytes_may_use;
-       progress = space_info->reservation_progress;
-
-       if (reserved == 0)
-               return 0;
-
-       smp_mb();
-       if (root->fs_info->delalloc_bytes == 0) {
+       delalloc_bytes = root->fs_info->delalloc_bytes;
+       if (delalloc_bytes == 0) {
                if (trans)
-                       return 0;
-               btrfs_wait_ordered_extents(root, 0, 0);
-               return 0;
+                       return;
+               btrfs_wait_ordered_extents(root, 0);
+               return;
        }
 
-       max_reclaim = min(reserved, to_reclaim);
-       nr_pages = max_t(unsigned long, nr_pages,
-                        max_reclaim >> PAGE_CACHE_SHIFT);
-       while (loops < 1024) {
-               /* have the flusher threads jump in and do some IO */
-               smp_mb();
-               nr_pages = min_t(unsigned long, nr_pages,
-                      root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
+       while (delalloc_bytes && loops < 3) {
+               max_reclaim = min(delalloc_bytes, to_reclaim);
+               nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
-                                               WB_REASON_FS_FREE_SPACE);
+                                              WB_REASON_FS_FREE_SPACE);
+
+               /*
+                * We need to wait for the async pages to actually start before
+                * we do anything.
+                */
+               wait_event(root->fs_info->async_submit_wait,
+                          !atomic_read(&root->fs_info->async_delalloc_pages));
 
+               if (!trans)
+                       flush = BTRFS_RESERVE_FLUSH_ALL;
+               else
+                       flush = BTRFS_RESERVE_NO_FLUSH;
                spin_lock(&space_info->lock);
-               if (reserved > space_info->bytes_may_use)
-                       reclaimed += reserved - space_info->bytes_may_use;
-               reserved = space_info->bytes_may_use;
+               if (can_overcommit(root, space_info, orig, flush)) {
+                       spin_unlock(&space_info->lock);
+                       break;
+               }
                spin_unlock(&space_info->lock);
 
                loops++;
-
-               if (reserved == 0 || reclaimed >= max_reclaim)
-                       break;
-
-               if (trans && trans->transaction->blocked)
-                       return -EAGAIN;
-
                if (wait_ordered && !trans) {
-                       btrfs_wait_ordered_extents(root, 0, 0);
+                       btrfs_wait_ordered_extents(root, 0);
                } else {
-                       time_left = schedule_timeout_interruptible(1);
-
-                       /* We were interrupted, exit */
+                       time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }
-
-               /* we've kicked the IO a few times, if anything has been freed,
-                * exit.  There is no sense in looping here for a long time
-                * when we really need to commit the transaction, or there are
-                * just too many writers without enough free space
-                */
-
-               if (loops > 3) {
-                       smp_mb();
-                       if (progress != space_info->reservation_progress)
-                               break;
-               }
-
+               smp_mb();
+               delalloc_bytes = root->fs_info->delalloc_bytes;
        }
-
-       return reclaimed >= to_reclaim;
 }
 
 /**
@@ -3729,11 +3813,12 @@ commit:
 }
 
 enum flush_state {
-       FLUSH_DELALLOC          =       1,
-       FLUSH_DELALLOC_WAIT     =       2,
-       FLUSH_DELAYED_ITEMS_NR  =       3,
-       FLUSH_DELAYED_ITEMS     =       4,
-       COMMIT_TRANS            =       5,
+       FLUSH_DELAYED_ITEMS_NR  =       1,
+       FLUSH_DELAYED_ITEMS     =       2,
+       FLUSH_DELALLOC          =       3,
+       FLUSH_DELALLOC_WAIT     =       4,
+       ALLOC_CHUNK             =       5,
+       COMMIT_TRANS            =       6,
 };
 
 static int flush_space(struct btrfs_root *root,
@@ -3742,16 +3827,9 @@ static int flush_space(struct btrfs_root *root,
 {
        struct btrfs_trans_handle *trans;
        int nr;
-       int ret;
+       int ret = 0;
 
        switch (state) {
-       case FLUSH_DELALLOC:
-       case FLUSH_DELALLOC_WAIT:
-               ret = shrink_delalloc(root, num_bytes,
-                                     state == FLUSH_DELALLOC_WAIT);
-               if (ret > 0)
-                       ret = 0;
-               break;
        case FLUSH_DELAYED_ITEMS_NR:
        case FLUSH_DELAYED_ITEMS:
                if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3772,6 +3850,24 @@ static int flush_space(struct btrfs_root *root,
                ret = btrfs_run_delayed_items_nr(trans, root, nr);
                btrfs_end_transaction(trans, root);
                break;
+       case FLUSH_DELALLOC:
+       case FLUSH_DELALLOC_WAIT:
+               shrink_delalloc(root, num_bytes, orig_bytes,
+                               state == FLUSH_DELALLOC_WAIT);
+               break;
+       case ALLOC_CHUNK:
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+               ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                                    btrfs_get_alloc_profile(root, 0),
+                                    CHUNK_ALLOC_NO_FORCE);
+               btrfs_end_transaction(trans, root);
+               if (ret == -ENOSPC)
+                       ret = 0;
+               break;
        case COMMIT_TRANS:
                ret = may_commit_transaction(root, space_info, orig_bytes, 0);
                break;
@@ -3798,24 +3894,25 @@ static int flush_space(struct btrfs_root *root,
  */
 static int reserve_metadata_bytes(struct btrfs_root *root,
                                  struct btrfs_block_rsv *block_rsv,
-                                 u64 orig_bytes, int flush)
+                                 u64 orig_bytes,
+                                 enum btrfs_reserve_flush_enum flush)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
        u64 num_bytes = orig_bytes;
-       int flush_state = FLUSH_DELALLOC;
+       int flush_state = FLUSH_DELAYED_ITEMS_NR;
        int ret = 0;
        bool flushing = false;
-       bool committed = false;
 
 again:
        ret = 0;
        spin_lock(&space_info->lock);
        /*
-        * We only want to wait if somebody other than us is flushing and we are
-        * actually alloed to flush.
+        * We only want to wait if somebody other than us is flushing and we
+        * are actually allowed to flush all things.
         */
-       while (flush && !flushing && space_info->flush) {
+       while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+              space_info->flush) {
                spin_unlock(&space_info->lock);
                /*
                 * If we have a trans handle we can't wait because the flusher
@@ -3869,80 +3966,52 @@ again:
                        (orig_bytes * 2);
        }
 
-       if (ret) {
-               u64 profile = btrfs_get_alloc_profile(root, 0);
-               u64 avail;
-
-               /*
-                * If we have a lot of space that's pinned, don't bother doing
-                * the overcommit dance yet and just commit the transaction.
-                */
-               avail = (space_info->total_bytes - space_info->bytes_used) * 8;
-               do_div(avail, 10);
-               if (space_info->bytes_pinned >= avail && flush && !committed) {
-                       space_info->flush = 1;
-                       flushing = true;
-                       spin_unlock(&space_info->lock);
-                       ret = may_commit_transaction(root, space_info,
-                                                    orig_bytes, 1);
-                       if (ret)
-                               goto out;
-                       committed = true;
-                       goto again;
-               }
-
-               spin_lock(&root->fs_info->free_chunk_lock);
-               avail = root->fs_info->free_chunk_space;
-
-               /*
-                * If we have dup, raid1 or raid10 then only half of the free
-                * space is actually useable.
-                */
-               if (profile & (BTRFS_BLOCK_GROUP_DUP |
-                              BTRFS_BLOCK_GROUP_RAID1 |
-                              BTRFS_BLOCK_GROUP_RAID10))
-                       avail >>= 1;
-
-               /*
-                * If we aren't flushing don't let us overcommit too much, say
-                * 1/8th of the space.  If we can flush, let it overcommit up to
-                * 1/2 of the space.
-                */
-               if (flush)
-                       avail >>= 3;
-               else
-                       avail >>= 1;
-                spin_unlock(&root->fs_info->free_chunk_lock);
-
-               if (used + num_bytes < space_info->total_bytes + avail) {
-                       space_info->bytes_may_use += orig_bytes;
-                       trace_btrfs_space_reservation(root->fs_info,
-                               "space_info", space_info->flags, orig_bytes, 1);
-                       ret = 0;
-               }
+       if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+               space_info->bytes_may_use += orig_bytes;
+               trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                             space_info->flags, orig_bytes,
+                                             1);
+               ret = 0;
        }
 
        /*
         * Couldn't make our reservation, save our place so while we're trying
         * to reclaim space we can actually use it instead of somebody else
         * stealing it from us.
+        *
+        * We make the other tasks wait for the flush only when we can flush
+        * all things.
         */
-       if (ret && flush) {
+       if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
                flushing = true;
                space_info->flush = 1;
        }
 
        spin_unlock(&space_info->lock);
 
-       if (!ret || !flush)
+       if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                goto out;
 
        ret = flush_space(root, space_info, num_bytes, orig_bytes,
                          flush_state);
        flush_state++;
+
+       /*
+        * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
+        * could occur. So skip the delalloc flush states.
+        */
+       if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+           (flush_state == FLUSH_DELALLOC ||
+            flush_state == FLUSH_DELALLOC_WAIT))
+               flush_state = ALLOC_CHUNK;
+
        if (!ret)
                goto again;
-       else if (flush_state <= COMMIT_TRANS)
+       else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+                flush_state < COMMIT_TRANS)
+               goto again;
+       else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+                flush_state <= COMMIT_TRANS)
                goto again;
 
 out:
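
(With the reordered flush states above, reserve_metadata_bytes() now tries delayed items first, then delalloc writeback, then a chunk allocation attempt, and finally a transaction commit. A BTRFS_RESERVE_FLUSH_LIMIT caller skips the two delalloc states, jumping straight to ALLOC_CHUNK, and gives up before COMMIT_TRANS; only BTRFS_RESERVE_FLUSH_ALL walks every state up to and including the commit.)
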
@@ -4061,13 +4130,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
        return 0;
 }
 
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 {
        memset(rsv, 0, sizeof(*rsv));
        spin_lock_init(&rsv->lock);
+       rsv->type = type;
 }
 
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+                                             unsigned short type)
 {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4076,7 +4147,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
        if (!block_rsv)
                return NULL;
 
-       btrfs_init_block_rsv(block_rsv);
+       btrfs_init_block_rsv(block_rsv, type);
        block_rsv->space_info = __find_space_info(fs_info,
                                                  BTRFS_BLOCK_GROUP_METADATA);
        return block_rsv;
@@ -4085,13 +4156,15 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 void btrfs_free_block_rsv(struct btrfs_root *root,
                          struct btrfs_block_rsv *rsv)
 {
+       if (!rsv)
+               return;
        btrfs_block_rsv_release(root, rsv, (u64)-1);
        kfree(rsv);
 }
 
-static inline int __block_rsv_add(struct btrfs_root *root,
-                                 struct btrfs_block_rsv *block_rsv,
-                                 u64 num_bytes, int flush)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+                       enum btrfs_reserve_flush_enum flush)
 {
        int ret;
 
@@ -4107,20 +4180,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
        return ret;
 }
 
-int btrfs_block_rsv_add(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes)
-{
-       return __block_rsv_add(root, block_rsv, num_bytes, 1);
-}
-
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-                               struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes)
-{
-       return __block_rsv_add(root, block_rsv, num_bytes, 0);
-}
-
 int btrfs_block_rsv_check(struct btrfs_root *root,
                          struct btrfs_block_rsv *block_rsv, int min_factor)
 {
@@ -4139,9 +4198,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
        return ret;
 }
 
-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
-                                          struct btrfs_block_rsv *block_rsv,
-                                          u64 min_reserved, int flush)
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+                          enum btrfs_reserve_flush_enum flush)
 {
        u64 num_bytes = 0;
        int ret = -ENOSPC;
@@ -4169,20 +4228,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
        return ret;
 }
 
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-                          struct btrfs_block_rsv *block_rsv,
-                          u64 min_reserved)
-{
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
-}
-
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-                                  struct btrfs_block_rsv *block_rsv,
-                                  u64 min_reserved)
-{
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
-}
-
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv,
                            u64 num_bytes)
@@ -4363,10 +4408,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
        struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
        /*
-        * two for root back/forward refs, two for directory entries
-        * and one for root of the snapshot.
+        * two for root back/forward refs, two for directory entries,
+        * one for root of the snapshot and one for parent inode.
         */
-       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
        dst_rsv->space_info = src_rsv->space_info;
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
@@ -4473,14 +4518,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        u64 csum_bytes;
        unsigned nr_extents = 0;
        int extra_reserve = 0;
-       int flush = 1;
+       enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret;
 
        /* Need to be holding the i_mutex here if we aren't free space cache */
-       if (btrfs_is_free_space_inode(root, inode))
-               flush = 0;
+       if (btrfs_is_free_space_inode(inode))
+               flush = BTRFS_RESERVE_NO_FLUSH;
 
-       if (flush && btrfs_transaction_in_commit(root->fs_info))
+       if (flush != BTRFS_RESERVE_NO_FLUSH &&
+           btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
 
        mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
@@ -4509,6 +4555,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        csum_bytes = BTRFS_I(inode)->csum_bytes;
        spin_unlock(&BTRFS_I(inode)->lock);
 
+       if (root->fs_info->quota_enabled) {
+               ret = btrfs_qgroup_reserve(root, num_bytes +
+                                          nr_extents * root->leafsize);
+               if (ret) {
+                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+                       return ret;
+               }
+       }
+
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (ret) {
                u64 to_free = 0;
@@ -4587,6 +4642,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
        trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                      btrfs_ino(inode), to_free, 0);
+       if (root->fs_info->quota_enabled) {
+               btrfs_qgroup_free(root, num_bytes +
+                                       dropped * root->leafsize);
+       }
+
        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                to_free);
 }
@@ -4896,9 +4956,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_space_info *space_info;
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 len;
+       bool readonly;
 
        while (start <= end) {
+               readonly = false;
                if (!cache ||
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
@@ -4916,15 +4980,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
                }
 
                start += len;
+               space_info = cache->space_info;
 
-               spin_lock(&cache->space_info->lock);
+               spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
-               cache->space_info->bytes_pinned -= len;
-               if (cache->ro)
-                       cache->space_info->bytes_readonly += len;
+               space_info->bytes_pinned -= len;
+               if (cache->ro) {
+                       space_info->bytes_readonly += len;
+                       readonly = true;
+               }
                spin_unlock(&cache->lock);
-               spin_unlock(&cache->space_info->lock);
+               if (!readonly && global_rsv->space_info == space_info) {
+                       spin_lock(&global_rsv->lock);
+                       if (!global_rsv->full) {
+                               len = min(len, global_rsv->size -
+                                         global_rsv->reserved);
+                               global_rsv->reserved += len;
+                               space_info->bytes_may_use += len;
+                               if (global_rsv->reserved >= global_rsv->size)
+                                       global_rsv->full = 1;
+                       }
+                       spin_unlock(&global_rsv->lock);
+               }
+               spin_unlock(&space_info->lock);
        }
 
        if (cache)
@@ -4951,7 +5030,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
        while (1) {
                ret = find_first_extent_bit(unpin, 0, &start, &end,
-                                           EXTENT_DIRTY);
+                                           EXTENT_DIRTY, NULL);
                if (ret)
                        break;
 
@@ -5029,8 +5108,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    NULL, refs_to_drop,
                                                    is_data);
-                       if (ret)
-                               goto abort;
+                       if (ret) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                        btrfs_release_path(path);
                        path->leave_spinning = 1;
 
@@ -5048,8 +5129,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                        btrfs_print_leaf(extent_root,
                                                         path->nodes[0]);
                        }
-                       if (ret < 0)
-                               goto abort;
+                       if (ret < 0) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                        extent_slot = path->slots[0];
                }
        } else if (ret == -ENOENT) {
@@ -5063,7 +5146,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                       (unsigned long long)owner_objectid,
                       (unsigned long long)owner_offset);
        } else {
-               goto abort;
+               btrfs_abort_transaction(trans, extent_root, ret);
+               goto out;
        }
 
        leaf = path->nodes[0];
@@ -5073,8 +5157,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                BUG_ON(found_extent || extent_slot != path->slots[0]);
                ret = convert_extent_item_v0(trans, extent_root, path,
                                             owner_objectid, 0);
-               if (ret < 0)
-                       goto abort;
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
 
                btrfs_release_path(path);
                path->leave_spinning = 1;
@@ -5091,8 +5177,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               (unsigned long long)bytenr);
                        btrfs_print_leaf(extent_root, path->nodes[0]);
                }
-               if (ret < 0)
-                       goto abort;
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
+
                extent_slot = path->slots[0];
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5129,8 +5218,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    iref, refs_to_drop,
                                                    is_data);
-                       if (ret)
-                               goto abort;
+                       if (ret) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                }
        } else {
                if (found_extent) {
@@ -5147,27 +5238,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
-               if (ret)
-                       goto abort;
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
                btrfs_release_path(path);
 
                if (is_data) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
-                       if (ret)
-                               goto abort;
+                       if (ret) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                }
 
                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
-               if (ret)
-                       goto abort;
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
        }
 out:
        btrfs_free_path(path);
        return ret;
-
-abort:
-       btrfs_abort_transaction(trans, extent_root, ret);
-       goto out;
 }
 
 /*
@@ -5223,8 +5316,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        rb_erase(&head->node.rb_node, &delayed_refs->root);
 
        delayed_refs->num_entries--;
-       if (waitqueue_active(&delayed_refs->seq_wait))
-               wake_up(&delayed_refs->seq_wait);
 
        /*
         * we don't take a ref on the node because we're removing it from the
@@ -5432,8 +5523,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_cache *used_block_group;
        u64 search_start = 0;
        int empty_cluster = 2 * 1024 * 1024;
-       int allowed_chunk_alloc = 0;
-       int done_chunk_alloc = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = 0;
@@ -5465,9 +5554,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        if (btrfs_mixed_space_info(space_info))
                use_cluster = false;
 
-       if (orig_root->ref_cows || empty_size)
-               allowed_chunk_alloc = 1;
-
        if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
                last_ptr = &root->fs_info->meta_alloc_cluster;
                if (!btrfs_test_opt(root, SSD))
@@ -5741,10 +5827,6 @@ checks:
 
                trace_btrfs_reserve_extent(orig_root, block_group,
                                           search_start, num_bytes);
-               if (offset < search_start)
-                       btrfs_add_free_space(used_block_group, offset,
-                                            search_start - offset);
-               BUG_ON(offset > search_start);
                if (used_block_group != block_group)
                        btrfs_put_block_group(used_block_group);
                btrfs_put_block_group(block_group);
@@ -5777,30 +5859,17 @@ loop:
                index = 0;
                loop++;
                if (loop == LOOP_ALLOC_CHUNK) {
-                      if (allowed_chunk_alloc) {
-                               ret = do_chunk_alloc(trans, root, num_bytes +
-                                                    2 * 1024 * 1024, data,
-                                                    CHUNK_ALLOC_LIMITED);
-                               if (ret < 0) {
-                                       btrfs_abort_transaction(trans,
-                                                               root, ret);
-                                       goto out;
-                               }
-                               allowed_chunk_alloc = 0;
-                               if (ret == 1)
-                                       done_chunk_alloc = 1;
-                       } else if (!done_chunk_alloc &&
-                                  space_info->force_alloc ==
-                                  CHUNK_ALLOC_NO_FORCE) {
-                               space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+                       ret = do_chunk_alloc(trans, root, data,
+                                            CHUNK_ALLOC_FORCE);
+                       /*
+                        * Do not bail out on ENOSPC since we
+                        * can do more things.
+                        */
+                       if (ret < 0 && ret != -ENOSPC) {
+                               btrfs_abort_transaction(trans,
+                                                       root, ret);
+                               goto out;
                        }
-
-                      /*
-                       * We didn't allocate a chunk, go ahead and drop the
-                       * empty size and loop again.
-                       */
-                      if (!done_chunk_alloc)
-                              loop = LOOP_NO_EMPTY_SIZE;
                }
 
                if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5849,13 +5918,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 again:
        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
-               printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
-                      "%llu pinned %llu reserved\n",
+               printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
                       (unsigned long long)cache->key.objectid,
                       (unsigned long long)cache->key.offset,
                       (unsigned long long)btrfs_block_group_used(&cache->item),
                       (unsigned long long)cache->pinned,
-                      (unsigned long long)cache->reserved);
+                      (unsigned long long)cache->reserved,
+                      cache->ro ? "[readonly]" : "");
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
@@ -5875,20 +5944,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 
        data = btrfs_get_alloc_profile(root, data);
 again:
-       /*
-        * the only place that sets empty_size is btrfs_realloc_node, which
-        * is not called recursively on allocations
-        */
-       if (empty_size || root->ref_cows) {
-               ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                    num_bytes + 2 * 1024 * 1024, data,
-                                    CHUNK_ALLOC_NO_FORCE);
-               if (ret < 0 && ret != -ENOSPC) {
-                       btrfs_abort_transaction(trans, root, ret);
-                       return ret;
-               }
-       }
-
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
                               hint_byte, ins, data);
@@ -5898,12 +5953,6 @@ again:
                        num_bytes = num_bytes >> 1;
                        num_bytes = num_bytes & ~(root->sectorsize - 1);
                        num_bytes = max(num_bytes, min_alloc_size);
-                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                      num_bytes, data, CHUNK_ALLOC_FORCE);
-                       if (ret < 0 && ret != -ENOSPC) {
-                               btrfs_abort_transaction(trans, root, ret);
-                               return ret;
-                       }
                        if (num_bytes == min_alloc_size)
                                final_tried = true;
                        goto again;
@@ -6226,7 +6275,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        block_rsv = get_block_rsv(trans, root);
 
        if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                /*
                 * If we couldn't reserve metadata bytes try and use some from
                 * the global reserve.
@@ -6245,15 +6295,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        ret = block_rsv_use_bytes(block_rsv, blocksize);
        if (!ret)
                return block_rsv;
-       if (ret) {
+       if (ret && !block_rsv->failfast) {
                static DEFINE_RATELIMIT_STATE(_rs,
                                DEFAULT_RATELIMIT_INTERVAL,
                                /*DEFAULT_RATELIMIT_BURST*/ 2);
-               if (__ratelimit(&_rs)) {
-                       printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
-                       WARN_ON(1);
-               }
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               if (__ratelimit(&_rs))
+                       WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+                            ret);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                if (!ret) {
                        return block_rsv;
                } else if (ret && block_rsv != global_rsv) {
@@ -7210,7 +7260,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
        alloc_flags = update_block_group_flags(root, cache->flags);
        if (alloc_flags != cache->flags) {
-               ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+               ret = do_chunk_alloc(trans, root, alloc_flags,
                                     CHUNK_ALLOC_FORCE);
                if (ret < 0)
                        goto out;
@@ -7220,7 +7270,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        if (!ret)
                goto out;
        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
-       ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+       ret = do_chunk_alloc(trans, root, alloc_flags,
                             CHUNK_ALLOC_FORCE);
        if (ret < 0)
                goto out;
@@ -7234,7 +7284,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 type)
 {
        u64 alloc_flags = get_alloc_profile(root, type);
-       return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+       return do_chunk_alloc(trans, root, alloc_flags,
                              CHUNK_ALLOC_FORCE);
 }
 
@@ -7643,8 +7693,21 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
 
-               if (need_clear)
+               if (need_clear) {
+                       /*
+                        * When we mount with an old space cache, we need to
+                        * set BTRFS_DC_CLEAR and set the dirty flag.
+                        *
+                        * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+                        *    truncate the old free space cache inode and
+                        *    setup a new one.
+                        * b) Setting 'dirty flag' makes sure that we flush
+                        *    the new space cache info onto disk.
+                        */
                        cache->disk_cache_state = BTRFS_DC_CLEAR;
+                       if (btrfs_test_opt(root, SPACE_CACHE))
+                               cache->dirty = 1;
+               }
 
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -7728,6 +7791,34 @@ error:
        return ret;
 }
 
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root)
+{
+       struct btrfs_block_group_cache *block_group, *tmp;
+       struct btrfs_root *extent_root = root->fs_info->extent_root;
+       struct btrfs_block_group_item item;
+       struct btrfs_key key;
+       int ret = 0;
+
+       list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+                                new_bg_list) {
+               list_del_init(&block_group->new_bg_list);
+
+               if (ret)
+                       continue;
+
+               spin_lock(&block_group->lock);
+               memcpy(&item, &block_group->item, sizeof(item));
+               memcpy(&key, &block_group->key, sizeof(key));
+               spin_unlock(&block_group->lock);
+
+               ret = btrfs_insert_item(trans, extent_root, &key, &item,
+                                       sizeof(item));
+               if (ret)
+                       btrfs_abort_transaction(trans, extent_root, ret);
+       }
+}
+
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, u64 bytes_used,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7761,6 +7852,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        spin_lock_init(&cache->lock);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
+       INIT_LIST_HEAD(&cache->new_bg_list);
 
        btrfs_init_free_space_ctl(cache);
 
@@ -7792,12 +7884,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        ret = btrfs_add_block_group_cache(root->fs_info, cache);
        BUG_ON(ret); /* Logic error */
 
-       ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
-                               sizeof(cache->item));
-       if (ret) {
-               btrfs_abort_transaction(trans, extent_root, ret);
-               return ret;
-       }
+       list_add_tail(&cache->new_bg_list, &trans->new_bgs);
 
        set_avail_alloc_bits(extent_root->fs_info, type);