Btrfs: cleanup scrub bio and worker wait code
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 15a2294..f8a358a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
 #include "volumes.h"
 #include "locking.h"
 #include "free-space-cache.h"
+#include "math.h"
 
-/* control flags for do_chunk_alloc's force field
+#undef SCRAMBLE_DELAYED_REFS
+
+/*
+ * control flags for do_chunk_alloc's force field
  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  * if we really need one.
  *
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  * if we have very few chunks already allocated.  This is
  * used as part of the clustering code to help make sure
  * we have a good pool of storage to cluster in, without
  * filling the FS with empty chunks
  *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
  */
 enum {
        CHUNK_ALLOC_NO_FORCE = 0,
-       CHUNK_ALLOC_FORCE = 1,
-       CHUNK_ALLOC_LIMITED = 2,
+       CHUNK_ALLOC_LIMITED = 1,
+       CHUNK_ALLOC_FORCE = 2,
 };
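
The reordering above makes the three force levels monotonically comparable (NO_FORCE < LIMITED < FORCE), which the do_chunk_alloc() hunk further down relies on when it upgrades a caller's request via "if (force < space_info->force_alloc)". A minimal sketch of that ordering in isolation; effective_force() is illustrative only and not part of the patch:

        enum {
                CHUNK_ALLOC_NO_FORCE = 0,       /* only if really needed */
                CHUNK_ALLOC_LIMITED  = 1,       /* only if few chunks exist */
                CHUNK_ALLOC_FORCE    = 2,       /* must try to allocate */
        };

        /* A weaker request never downgrades a stronger pending one. */
        static int effective_force(int requested, int pending)
        {
                return requested < pending ? pending : requested;
        }
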
 
 /*
@@ -91,8 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *extent_root, u64 alloc_bytes,
-                         u64 flags, int force);
+                         struct btrfs_root *extent_root, u64 flags,
+                         int force);
 static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -244,7 +248,7 @@ static int exclude_super_stripes(struct btrfs_root *root,
                cache->bytes_super += stripe_len;
                ret = add_excluded_extent(root, cache->key.objectid,
                                          stripe_len);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
        }
 
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -252,13 +256,13 @@ static int exclude_super_stripes(struct btrfs_root *root,
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
                                       cache->key.objectid, bytenr,
                                       0, &logical, &nr, &stripe_len);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
 
                while (nr--) {
                        cache->bytes_super += stripe_len;
                        ret = add_excluded_extent(root, logical[nr],
                                                  stripe_len);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM */
                }
 
                kfree(logical);
@@ -309,7 +313,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
        while (start < end) {
                ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
-                                           EXTENT_DIRTY | EXTENT_UPTODATE);
+                                           EXTENT_DIRTY | EXTENT_UPTODATE,
+                                           NULL);
                if (ret)
                        break;
 
@@ -320,7 +325,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM or logic error */
                        start = extent_end + 1;
                } else {
                        break;
@@ -331,7 +336,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                size = end - start;
                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM or logic error */
        }
 
        return total_added;
@@ -473,7 +478,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        int ret = 0;
 
        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-       BUG_ON(!caching_ctl);
+       if (!caching_ctl)
+               return -ENOMEM;
 
        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
@@ -527,9 +533,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
         * allocate blocks for the tree root we can't do the fast caching since
         * we likely hold important locks.
         */
-       if (trans && (!trans->transaction->in_commit) &&
-           (root && root != root->fs_info->tree_root) &&
-           btrfs_test_opt(root, SPACE_CACHE)) {
+       if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
                ret = load_free_space_cache(fs_info, cache);
 
                spin_lock(&cache->lock);
@@ -646,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
        rcu_read_unlock();
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-       if (factor == 10)
-               return num;
-       num *= factor;
-       do_div(num, 10);
-       return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
-       if (factor == 100)
-               return num;
-       num *= factor;
-       do_div(num, 100);
-       return num;
-}
-
 u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
 {
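
The two div_factor helpers deleted above are not simply dropped: the new #include "math.h" at the top of this diff suggests they now live in a shared header. A sketch of what that header is assumed to provide, reconstructed verbatim from the removed lines (factor is in tenths for div_factor and in hundredths for div_factor_fine):

        #include <linux/types.h>
        #include <asm/div64.h>

        static inline u64 div_factor(u64 num, int factor)
        {
                if (factor == 10)
                        return num;
                num *= factor;
                do_div(num, 10);
                return num;
        }

        static inline u64 div_factor_fine(u64 num, int factor)
        {
                if (factor == 100)
                        return num;
                num *= factor;
                do_div(num, 100);
                return num;
        }
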
@@ -981,7 +967,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
-                               BUG_ON(ret > 0);
+                               BUG_ON(ret > 0); /* Corruption */
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1007,9 +993,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
-       BUG_ON(ret);
+       BUG_ON(ret); /* Corruption */
 
-       ret = btrfs_extend_item(trans, root, path, new_size);
+       btrfs_extend_item(trans, root, path, new_size);
 
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1477,7 +1463,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                err = ret;
                goto out;
        }
-       BUG_ON(ret);
+       if (ret && !insert) {
+               err = -ENOENT;
+               goto out;
+       }
+       BUG_ON(ret); /* Corruption */
 
        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -1591,13 +1581,13 @@ out:
  * helper to add new inline back ref
  */
 static noinline_for_stack
-int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
-                               struct btrfs_path *path,
-                               struct btrfs_extent_inline_ref *iref,
-                               u64 parent, u64 root_objectid,
-                               u64 owner, u64 offset, int refs_to_add,
-                               struct btrfs_delayed_extent_op *extent_op)
+void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct btrfs_path *path,
+                                struct btrfs_extent_inline_ref *iref,
+                                u64 parent, u64 root_objectid,
+                                u64 owner, u64 offset, int refs_to_add,
+                                struct btrfs_delayed_extent_op *extent_op)
 {
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
@@ -1607,7 +1597,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
        u64 refs;
        int size;
        int type;
-       int ret;
 
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1616,7 +1605,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
        type = extent_ref_type(parent, owner);
        size = btrfs_extent_inline_ref_size(type);
 
-       ret = btrfs_extend_item(trans, root, path, size);
+       btrfs_extend_item(trans, root, path, size);
 
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
@@ -1651,7 +1640,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
        }
        btrfs_mark_buffer_dirty(leaf);
-       return 0;
 }
 
 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1686,12 +1674,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
  * helper to update/remove inline back ref
  */
 static noinline_for_stack
-int update_inline_extent_backref(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
-                                struct btrfs_path *path,
-                                struct btrfs_extent_inline_ref *iref,
-                                int refs_to_mod,
-                                struct btrfs_delayed_extent_op *extent_op)
+void update_inline_extent_backref(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_path *path,
+                                 struct btrfs_extent_inline_ref *iref,
+                                 int refs_to_mod,
+                                 struct btrfs_delayed_extent_op *extent_op)
 {
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
@@ -1702,7 +1690,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
        u32 item_size;
        int size;
        int type;
-       int ret;
        u64 refs;
 
        leaf = path->nodes[0];
@@ -1744,10 +1731,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
                        memmove_extent_buffer(leaf, ptr, ptr + size,
                                              end - ptr - size);
                item_size -= size;
-               ret = btrfs_truncate_item(trans, root, path, item_size, 1);
+               btrfs_truncate_item(trans, root, path, item_size, 1);
        }
        btrfs_mark_buffer_dirty(leaf);
-       return 0;
 }
 
 static noinline_for_stack
@@ -1767,13 +1753,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
                                           root_objectid, owner, offset, 1);
        if (ret == 0) {
                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
-               ret = update_inline_extent_backref(trans, root, path, iref,
-                                                  refs_to_add, extent_op);
+               update_inline_extent_backref(trans, root, path, iref,
+                                            refs_to_add, extent_op);
        } else if (ret == -ENOENT) {
-               ret = setup_inline_extent_backref(trans, root, path, iref,
-                                                 parent, root_objectid,
-                                                 owner, offset, refs_to_add,
-                                                 extent_op);
+               setup_inline_extent_backref(trans, root, path, iref, parent,
+                                           root_objectid, owner, offset,
+                                           refs_to_add, extent_op);
+               ret = 0;
        }
        return ret;
 }
@@ -1803,12 +1789,12 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_extent_inline_ref *iref,
                                 int refs_to_drop, int is_data)
 {
-       int ret;
+       int ret = 0;
 
        BUG_ON(!is_data && refs_to_drop != 1);
        if (iref) {
-               ret = update_inline_extent_backref(trans, root, path, iref,
-                                                  -refs_to_drop, NULL);
+               update_inline_extent_backref(trans, root, path, iref,
+                                            -refs_to_drop, NULL);
        } else if (is_data) {
                ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
        } else {
@@ -1834,6 +1820,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
        /* Tell the block device(s) that the sectors can be discarded */
        ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
                              bytenr, &num_bytes, &bbio, 0);
+       /* Error condition is -ENOMEM */
        if (!ret) {
                struct btrfs_bio_stripe *stripe = bbio->stripes;
                int i;
@@ -1849,7 +1836,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                        if (!ret)
                                discarded_bytes += stripe->length;
                        else if (ret != -EOPNOTSUPP)
-                               break;
+                               break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 
                        /*
                         * Just in case we get back EOPNOTSUPP for some reason,
@@ -1868,23 +1855,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
        return ret;
 }
 
+/* Can return -ENOMEM */
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset)
+                        u64 root_objectid, u64 owner, u64 offset, int for_cow)
 {
        int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
               root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
        } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, owner, offset,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
        }
        return ret;
 }
@@ -1939,7 +1931,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        ret = insert_extent_backref(trans, root->fs_info->extent_root,
                                    path, bytenr, parent, root_objectid,
                                    owner, offset, refs_to_add);
-       BUG_ON(ret);
+       if (ret)
+               btrfs_abort_transaction(trans, root, ret);
 out:
        btrfs_free_path(path);
        return err;
@@ -2026,6 +2019,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
        int ret;
        int err = 0;
 
+       if (trans->aborted)
+               return 0;
+
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -2123,7 +2119,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                               struct btrfs_delayed_extent_op *extent_op,
                               int insert_reserved)
 {
-       int ret;
+       int ret = 0;
+
+       if (trans->aborted)
+               return 0;
+
        if (btrfs_delayed_ref_is_head(node)) {
                struct btrfs_delayed_ref_head *head;
                /*
@@ -2141,11 +2141,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                ret = btrfs_del_csums(trans, root,
                                                      node->bytenr,
                                                      node->num_bytes);
-                               BUG_ON(ret);
                        }
                }
                mutex_unlock(&head->mutex);
-               return 0;
+               return ret;
        }
 
        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -2192,6 +2191,10 @@ again:
        return NULL;
 }
 
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct list_head *cluster)
@@ -2200,6 +2203,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        struct btrfs_delayed_extent_op *extent_op;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
        int count = 0;
        int must_insert_reserved = 0;
@@ -2232,6 +2236,38 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                }
 
                /*
+                * We need to try and merge add/drops of the same ref since we
+                * can run into issues with relocate dropping the implicit ref
+                * and then it being added back again before the drop can
+                * finish.  If we merged anything we need to re-loop so we can
+                * get a good ref.
+                */
+               btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+                                        locked_ref);
+
+               /*
+                * locked_ref is the head node, so we have to go one
+                * node back for any delayed ref updates
+                */
+               ref = select_delayed_ref(locked_ref);
+
+               if (ref && ref->seq &&
+                   btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
+                       /*
+                        * there are still refs with lower seq numbers in the
+                        * process of being added. Don't run this ref yet.
+                        */
+                       list_del_init(&locked_ref->cluster);
+                       mutex_unlock(&locked_ref->mutex);
+                       locked_ref = NULL;
+                       delayed_refs->num_heads_ready++;
+                       spin_unlock(&delayed_refs->lock);
+                       cond_resched();
+                       spin_lock(&delayed_refs->lock);
+                       continue;
+               }
+
+               /*
                 * record the must insert reserved flag before we
                 * drop the spin lock.
                 */
@@ -2241,11 +2277,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                extent_op = locked_ref->extent_op;
                locked_ref->extent_op = NULL;
 
-               /*
-                * locked_ref is the head node, so we have to go one
-                * node back for any delayed ref updates
-                */
-               ref = select_delayed_ref(locked_ref);
                if (!ref) {
                        /* All delayed refs have been processed, Go ahead
                         * and send the head node to run_one_delayed_ref,
@@ -2263,12 +2294,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
                                ret = run_delayed_extent_op(trans, root,
                                                            ref, extent_op);
-                               BUG_ON(ret);
                                kfree(extent_op);
 
-                               cond_resched();
-                               spin_lock(&delayed_refs->lock);
-                               continue;
+                               if (ret) {
+                                       list_del_init(&locked_ref->cluster);
+                                       mutex_unlock(&locked_ref->mutex);
+
+                                       printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+                                       spin_lock(&delayed_refs->lock);
+                                       return ret;
+                               }
+
+                               goto next;
                        }
 
                        list_del_init(&locked_ref->cluster);
@@ -2278,29 +2315,135 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-
+               if (locked_ref) {
+                       /*
+                        * when we play the delayed ref, also correct the
+                        * ref_mod on head
+                        */
+                       switch (ref->action) {
+                       case BTRFS_ADD_DELAYED_REF:
+                       case BTRFS_ADD_DELAYED_EXTENT:
+                               locked_ref->node.ref_mod -= ref->ref_mod;
+                               break;
+                       case BTRFS_DROP_DELAYED_REF:
+                               locked_ref->node.ref_mod += ref->ref_mod;
+                               break;
+                       default:
+                               WARN_ON(1);
+                       }
+               }
                spin_unlock(&delayed_refs->lock);
 
                ret = run_one_delayed_ref(trans, root, ref, extent_op,
                                          must_insert_reserved);
-               BUG_ON(ret);
 
                btrfs_put_delayed_ref(ref);
                kfree(extent_op);
                count++;
 
+               if (ret) {
+                       if (locked_ref) {
+                               list_del_init(&locked_ref->cluster);
+                               mutex_unlock(&locked_ref->mutex);
+                       }
+                       printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+                       spin_lock(&delayed_refs->lock);
+                       return ret;
+               }
+
+next:
                cond_resched();
                spin_lock(&delayed_refs->lock);
        }
        return count;
 }
 
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
+{
+       struct rb_node *n = root->rb_node;
+       struct btrfs_delayed_ref_node *entry;
+       int alt = 1;
+       u64 middle;
+       u64 first = 0, last = 0;
+
+       n = rb_first(root);
+       if (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               first = entry->bytenr;
+       }
+       n = rb_last(root);
+       if (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               last = entry->bytenr;
+       }
+       n = root->rb_node;
+
+       while (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               WARN_ON(!entry->in_tree);
+
+               middle = entry->bytenr;
+
+               if (alt)
+                       n = n->rb_left;
+               else
+                       n = n->rb_right;
+
+               alt = 1 - alt;
+       }
+       return middle;
+}
+#endif
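
find_middle() above is compiled out by the #undef SCRAMBLE_DELAYED_REFS added at the top of the file; it is purely a debugging aid that makes delayed refs get processed from the middle of the rb-tree instead of in ascending bytenr order, to expose hidden ordering dependencies. Enabling it is a local one-line change (sketch):

        /* Replace the "#undef SCRAMBLE_DELAYED_REFS" near the top of
         * extent-tree.c with this to exercise out-of-order processing. */
        #define SCRAMBLE_DELAYED_REFS 1
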
+
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info)
+{
+       struct qgroup_update *qgroup_update;
+       int ret = 0;
+
+       if (list_empty(&trans->qgroup_ref_list) !=
+           !trans->delayed_ref_elem.seq) {
+               /* list without seq or seq without list */
+               printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+                       list_empty(&trans->qgroup_ref_list) ? "" : " not",
+                       trans->delayed_ref_elem.seq);
+               BUG();
+       }
+
+       if (!trans->delayed_ref_elem.seq)
+               return 0;
+
+       while (!list_empty(&trans->qgroup_ref_list)) {
+               qgroup_update = list_first_entry(&trans->qgroup_ref_list,
+                                                struct qgroup_update, list);
+               list_del(&qgroup_update->list);
+               if (!ret)
+                       ret = btrfs_qgroup_account_ref(
+                                       trans, fs_info, qgroup_update->node,
+                                       qgroup_update->extent_op);
+               kfree(qgroup_update);
+       }
+
+       btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+       return ret;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
  * 0, which means to process everything in the tree at the start
  * of the run (but not newly added entries), or it can be some target
  * number you'd like to process.
+ *
+ * Returns 0 on success or if called with an aborted transaction
+ * Returns <0 on error and aborts the transaction
  */
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, unsigned long count)
@@ -2310,16 +2453,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct list_head cluster;
        int ret;
+       u64 delayed_start;
        int run_all = count == (unsigned long)-1;
        int run_most = 0;
+       int loops;
+
+       /* We'll clean this up in btrfs_cleanup_transaction */
+       if (trans->aborted)
+               return 0;
 
        if (root == root->fs_info->extent_root)
                root = root->fs_info->tree_root;
 
+       btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
 again:
+       loops = 0;
        spin_lock(&delayed_refs->lock);
+
+#ifdef SCRAMBLE_DELAYED_REFS
+       delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+
        if (count == 0) {
                count = delayed_refs->num_entries * 2;
                run_most = 1;
@@ -2335,21 +2492,54 @@ again:
                 * of refs to process starting at the first one we are able to
                 * lock
                 */
+               delayed_start = delayed_refs->run_delayed_start;
                ret = btrfs_find_ref_cluster(trans, &cluster,
                                             delayed_refs->run_delayed_start);
                if (ret)
                        break;
 
                ret = run_clustered_refs(trans, root, &cluster);
-               BUG_ON(ret < 0);
+               if (ret < 0) {
+                       spin_unlock(&delayed_refs->lock);
+                       btrfs_abort_transaction(trans, root, ret);
+                       return ret;
+               }
 
                count -= min_t(unsigned long, ret, count);
 
                if (count == 0)
                        break;
+
+               if (delayed_start >= delayed_refs->run_delayed_start) {
+                       if (loops == 0) {
+                               /*
+                                * btrfs_find_ref_cluster looped. let's do one
+                                * more cycle. if we don't run any delayed ref
+                                * during that cycle (because we can't because
+                                * all of them are blocked), bail out.
+                                */
+                               loops = 1;
+                       } else {
+                               /*
+                                * no runnable refs left, stop trying
+                                */
+                               BUG_ON(run_all);
+                               break;
+                       }
+               }
+               if (ret) {
+                       /* refs were run, let's reset staleness detection */
+                       loops = 0;
+               }
        }
 
        if (run_all) {
+               if (!list_empty(&trans->new_bgs)) {
+                       spin_unlock(&delayed_refs->lock);
+                       btrfs_create_pending_block_groups(trans, root);
+                       spin_lock(&delayed_refs->lock);
+               }
+
                node = rb_first(&delayed_refs->root);
                if (!node)
                        goto out;
@@ -2384,6 +2574,7 @@ again:
        }
 out:
        spin_unlock(&delayed_refs->lock);
+       assert_qgroups_uptodate(trans);
        return 0;
 }
 
@@ -2404,7 +2595,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
        extent_op->update_key = 0;
        extent_op->is_data = is_data ? 1 : 0;
 
-       ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+       ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+                                         num_bytes, extent_op);
        if (ret)
                kfree(extent_op);
        return ret;
@@ -2462,8 +2654,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 
        node = rb_prev(node);
        if (node) {
+               int seq = ref->seq;
+
                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-               if (ref->bytenr == bytenr)
+               if (ref->bytenr == bytenr && ref->seq == seq)
                        goto out_unlock;
        }
 
@@ -2500,7 +2694,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
-       BUG_ON(ret == 0);
+       BUG_ON(ret == 0); /* Corruption */
 
        ret = -ENOENT;
        if (path->slots[0] == 0)
@@ -2589,7 +2783,7 @@ out:
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          int full_backref, int inc)
+                          int full_backref, int inc, int for_cow)
 {
        u64 bytenr;
        u64 num_bytes;
@@ -2602,7 +2796,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64);
+                           u64, u64, u64, u64, u64, u64, int);
 
        ref_root = btrfs_header_owner(buf);
        nritems = btrfs_header_nritems(buf);
@@ -2639,34 +2833,34 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
-                                          key.offset);
+                                          key.offset, for_cow);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = btrfs_level_size(root, level - 1);
                        ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, level - 1, 0);
+                                          parent, ref_root, level - 1, 0,
+                                          for_cow);
                        if (ret)
                                goto fail;
                }
        }
        return 0;
 fail:
-       BUG();
        return ret;
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
 }
 
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2682,7 +2876,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
        if (ret < 0)
                goto fail;
-       BUG_ON(ret);
+       BUG_ON(ret); /* Corruption */
 
        leaf = path->nodes[0];
        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -2690,8 +2884,10 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);
 fail:
-       if (ret)
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
                return ret;
+       }
        return 0;
 
 }
@@ -2782,25 +2978,29 @@ again:
        }
 
        spin_lock(&block_group->lock);
-       if (block_group->cached != BTRFS_CACHE_FINISHED) {
-               /* We're not cached, don't bother trying to write stuff out */
+       if (block_group->cached != BTRFS_CACHE_FINISHED ||
+           !btrfs_test_opt(root, SPACE_CACHE)) {
+               /*
+                * don't bother trying to write stuff out _if_
+                * a) we're not cached,
+                * b) we're with nospace_cache mount option.
+                */
                dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                goto out_put;
        }
        spin_unlock(&block_group->lock);
 
-       num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+       /*
+        * Try to preallocate enough space based on how big the block group is.
+        * Keep in mind this has to include any pinned space which could end up
+        * taking up quite a bit since it's not folded into the other space
+        * cache.
+        */
+       num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
        if (!num_pages)
                num_pages = 1;
 
-       /*
-        * Just to make absolutely sure we have enough space, we're going to
-        * preallocate 12 pages worth of space for each block group.  In
-        * practice we ought to use at most 8, but we need extra space so we can
-        * add our header and have a terminator between the extents and the
-        * bitmaps.
-        */
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
 
@@ -2864,7 +3064,8 @@ again:
                if (last == 0) {
                        err = btrfs_run_delayed_refs(trans, root,
                                                     (unsigned long)-1);
-                       BUG_ON(err);
+                       if (err) /* File system offline */
+                               goto out;
                }
 
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2891,7 +3092,9 @@ again:
                last = cache->key.objectid + cache->key.offset;
 
                err = write_one_cache_group(trans, root, path, cache);
-               BUG_ON(err);
+               if (err) /* File system offline */
+                       goto out;
+
                btrfs_put_block_group(cache);
        }
 
@@ -2904,7 +3107,8 @@ again:
                if (last == 0) {
                        err = btrfs_run_delayed_refs(trans, root,
                                                     (unsigned long)-1);
-                       BUG_ON(err);
+                       if (err) /* File system offline */
+                               goto out;
                }
 
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2929,20 +3133,21 @@ again:
                        continue;
                }
 
-               btrfs_write_out_cache(root, trans, cache, path);
+               err = btrfs_write_out_cache(root, trans, cache, path);
 
                /*
                 * If we didn't have an error then the cache state is still
                 * NEED_WRITE, so we can set it to WRITTEN.
                 */
-               if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+               if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
                        cache->disk_cache_state = BTRFS_DC_WRITTEN;
                last = cache->key.objectid + cache->key.offset;
                btrfs_put_block_group(cache);
        }
+out:
 
        btrfs_free_path(path);
-       return 0;
+       return err;
 }
 
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -3008,16 +3213,15 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        init_waitqueue_head(&found->wait);
        *space_info = found;
        list_add_rcu(&found->list, &info->space_info);
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               info->data_sinfo = found;
        return 0;
 }
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
-       u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
-
-       /* chunk -> extended profile */
-       if (extra_flags == 0)
-               extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       u64 extra_flags = chunk_to_extended(flags) &
+                               BTRFS_EXTENDED_PROFILE_MASK;
 
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                fs_info->avail_data_alloc_bits |= extra_flags;
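
The open-coded handling of the single profile is replaced here and below by chunk_to_extended()/extended_to_chunk(), which are defined outside this diff. Their assumed behaviour, reconstructed from the code they replace in this file: the on-disk chunk format expresses "single" as having no profile bits set, while the extended format gives it an explicit BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, so the two helpers simply add or strip that bit:

        /* Sketch of the helpers assumed to live in ctree.h. */
        static inline u64 chunk_to_extended(u64 flags)
        {
                if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
                        flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
                return flags;
        }

        static inline u64 extended_to_chunk(u64 flags)
        {
                return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
        }
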
@@ -3028,9 +3232,39 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 }
 
 /*
+ * returns target flags in extended format or 0 if restripe for this
+ * chunk_type is not in progress
+ *
+ * should be called with either volume_mutex or balance_lock held
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+       u64 target = 0;
+
+       if (!bctl)
+               return 0;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA &&
+           bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+               target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+       } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+                  bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+               target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+       } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+                  bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+               target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+       }
+
+       return target;
+}
+
+/*
  * @flags: available profiles in extended format (see ctree.h)
  *
- * Returns reduced profile in chunk format.
+ * Returns reduced profile in chunk format.  If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
  */
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
@@ -3041,6 +3275,22 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
         */
        u64 num_devices = root->fs_info->fs_devices->rw_devices +
                root->fs_info->fs_devices->missing_devices;
+       u64 target;
+
+       /*
+        * see if restripe for this chunk_type is in progress, if so
+        * try to reduce to the target profile
+        */
+       spin_lock(&root->fs_info->balance_lock);
+       target = get_restripe_target(root->fs_info, flags);
+       if (target) {
+               /* pick target profile only if it's already available */
+               if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+                       spin_unlock(&root->fs_info->balance_lock);
+                       return extended_to_chunk(target);
+               }
+       }
+       spin_unlock(&root->fs_info->balance_lock);
 
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -3065,9 +3315,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
        }
 
-       /* extended -> chunk profile */
-       flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-       return flags;
+       return extended_to_chunk(flags);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
@@ -3096,12 +3344,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
        return get_alloc_profile(root, flags);
 }
 
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
-       BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-                                                      BTRFS_BLOCK_GROUP_DATA);
-}
-
 /*
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
@@ -3110,6 +3352,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        u64 used;
        int ret = 0, committed = 0, alloc_chunk = 1;
 
@@ -3122,7 +3365,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
                committed = 1;
        }
 
-       data_sinfo = BTRFS_I(inode)->space_info;
+       data_sinfo = fs_info->data_sinfo;
        if (!data_sinfo)
                goto alloc;
 
@@ -3152,7 +3395,6 @@ alloc:
                                return PTR_ERR(trans);
 
                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                            bytes + 2 * 1024 * 1024,
                                             alloc_target,
                                             CHUNK_ALLOC_NO_FORCE);
                        btrfs_end_transaction(trans, root);
@@ -3163,10 +3405,9 @@ alloc:
                                        goto commit_trans;
                        }
 
-                       if (!data_sinfo) {
-                               btrfs_set_inode_space_info(root, inode);
-                               data_sinfo = BTRFS_I(inode)->space_info;
-                       }
+                       if (!data_sinfo)
+                               data_sinfo = fs_info->data_sinfo;
+
                        goto again;
                }
 
@@ -3195,6 +3436,8 @@ commit_trans:
                return -ENOSPC;
        }
        data_sinfo->bytes_may_use += bytes;
+       trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                     data_sinfo->flags, bytes, 1);
        spin_unlock(&data_sinfo->lock);
 
        return 0;
@@ -3211,9 +3454,11 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
        /* make sure bytes are sectorsize aligned */
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
-       data_sinfo = BTRFS_I(inode)->space_info;
+       data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
        data_sinfo->bytes_may_use -= bytes;
+       trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                     data_sinfo->flags, bytes, 0);
        spin_unlock(&data_sinfo->lock);
 }
 
@@ -3231,8 +3476,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 }
 
 static int should_alloc_chunk(struct btrfs_root *root,
-                             struct btrfs_space_info *sinfo, u64 alloc_bytes,
-                             int force)
+                             struct btrfs_space_info *sinfo, int force)
 {
        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3247,7 +3491,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
         * and purposes it's used space.  Don't worry about locking the
         * global_rsv, it doesn't change except when the transaction commits.
         */
-       num_allocated += global_rsv->size;
+       if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+               num_allocated += global_rsv->size;
 
        /*
         * in limited mode, we want to have some free space up to
@@ -3262,59 +3507,81 @@ static int should_alloc_chunk(struct btrfs_root *root,
                        return 1;
        }
 
-       /*
-        * we have two similar checks here, one based on percentage
-        * and once based on a hard number of 256MB.  The idea
-        * is that if we have a good amount of free
-        * room, don't allocate a chunk.  A good mount is
-        * less than 80% utilized of the chunks we have allocated,
-        * or more than 256MB free
-        */
-       if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+       if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
                return 0;
+       return 1;
+}
 
-       if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
-               return 0;
+static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+{
+       u64 num_dev;
 
-       thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
+       if (type & BTRFS_BLOCK_GROUP_RAID10 ||
+           type & BTRFS_BLOCK_GROUP_RAID0)
+               num_dev = root->fs_info->fs_devices->rw_devices;
+       else if (type & BTRFS_BLOCK_GROUP_RAID1)
+               num_dev = 2;
+       else
+               num_dev = 1;    /* DUP or single */
 
-       /* 256MB or 5% of the FS */
-       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+       /* metadata for updating devices and chunk tree */
+       return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+}
 
-       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
-               return 0;
-       return 1;
+static void check_system_chunk(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root, u64 type)
+{
+       struct btrfs_space_info *info;
+       u64 left;
+       u64 thresh;
+
+       info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+       spin_lock(&info->lock);
+       left = info->total_bytes - info->bytes_used - info->bytes_pinned -
+               info->bytes_reserved - info->bytes_readonly;
+       spin_unlock(&info->lock);
+
+       thresh = get_system_chunk_thresh(root, type);
+       if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+               printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
+                      left, thresh, type);
+               dump_space_info(info, 0, 0);
+       }
+
+       if (left < thresh) {
+               u64 flags;
+
+               flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+               btrfs_alloc_chunk(trans, root, flags);
+       }
 }
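
The old percentage-plus-256MB heuristics in should_alloc_chunk() collapse into a single rule: in the default case a new chunk is allocated once the space_info is roughly 80% allocated, since div_factor(num_bytes, 8) is num_bytes * 8 / 10 and the extra 2MB is just slack. check_system_chunk() then makes sure the SYSTEM space can absorb the device and chunk tree updates that the allocation itself will trigger; for a RAID1 chunk, for example, num_dev is 2 and the threshold is btrfs_calc_trans_metadata_size(root, 3). A small sketch of the new trigger point (illustrative helper, not part of the patch):

        #include <linux/types.h>
        #include <asm/div64.h>

        /* Amount of allocated space at which should_alloc_chunk() starts
         * returning 1 in the non-forced, non-limited case. */
        static u64 chunk_alloc_threshold(u64 num_bytes)
        {
                u64 thresh = num_bytes;

                thresh *= 8;            /* div_factor(num_bytes, 8) ...   */
                do_div(thresh, 10);     /* ... i.e. 80% of the space_info */
                return thresh - 2 * 1024 * 1024;        /* minus the 2MB slack */
        }
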
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *extent_root, u64 alloc_bytes,
-                         u64 flags, int force)
+                         struct btrfs_root *extent_root, u64 flags, int force)
 {
        struct btrfs_space_info *space_info;
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
        int wait_for_alloc = 0;
        int ret = 0;
 
-       flags = btrfs_reduce_alloc_profile(extent_root, flags);
-
        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
                ret = update_space_info(extent_root->fs_info, flags,
                                        0, 0, &space_info);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
        }
-       BUG_ON(!space_info);
+       BUG_ON(!space_info); /* Logic error */
 
 again:
        spin_lock(&space_info->lock);
-       if (space_info->force_alloc)
+       if (force < space_info->force_alloc)
                force = space_info->force_alloc;
        if (space_info->full) {
                spin_unlock(&space_info->lock);
                return 0;
        }
 
-       if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
+       if (!should_alloc_chunk(extent_root, space_info, force)) {
                spin_unlock(&space_info->lock);
                return 0;
        } else if (space_info->chunk_alloc) {
@@ -3358,6 +3625,12 @@ again:
                        force_metadata_allocation(fs_info);
        }
 
+       /*
+        * Check if we have enough space in SYSTEM chunk because we may need
+        * to update devices.
+        */
+       check_system_chunk(trans, extent_root, flags);
+
        ret = btrfs_alloc_chunk(trans, extent_root, flags);
        if (ret < 0 && ret != -ENOSPC)
                goto out;
@@ -3372,95 +3645,114 @@ again:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
 out:
-       mutex_unlock(&extent_root->fs_info->chunk_mutex);
+       mutex_unlock(&fs_info->chunk_mutex);
        return ret;
 }
 
+static int can_overcommit(struct btrfs_root *root,
+                         struct btrfs_space_info *space_info, u64 bytes,
+                         enum btrfs_reserve_flush_enum flush)
+{
+       u64 profile = btrfs_get_alloc_profile(root, 0);
+       u64 avail;
+       u64 used;
+
+       used = space_info->bytes_used + space_info->bytes_reserved +
+               space_info->bytes_pinned + space_info->bytes_readonly +
+               space_info->bytes_may_use;
+
+       spin_lock(&root->fs_info->free_chunk_lock);
+       avail = root->fs_info->free_chunk_space;
+       spin_unlock(&root->fs_info->free_chunk_lock);
+
+       /*
+        * If we have dup, raid1 or raid10 then only half of the free
+        * space is actually useable.
+        */
+       if (profile & (BTRFS_BLOCK_GROUP_DUP |
+                      BTRFS_BLOCK_GROUP_RAID1 |
+                      BTRFS_BLOCK_GROUP_RAID10))
+               avail >>= 1;
+
+       /*
+        * If we aren't flushing all things, let us overcommit up to
+        * 1/2th of the space. If we can flush, don't let us overcommit
+        * too much, let it overcommit up to 1/8 of the space.
+        */
+       if (flush == BTRFS_RESERVE_FLUSH_ALL)
+               avail >>= 3;
+       else
+               avail >>= 1;
+
+       if (used + bytes < space_info->total_bytes + avail)
+               return 1;
+       return 0;
+}
+
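
A worked example of the new overcommit limit, with assumed numbers: given 8GB of unallocated device space and a DUP or RAID1 profile, avail is first halved to 4GB; a BTRFS_RESERVE_FLUSH_ALL reservation may then overcommit by at most 4GB >> 3 = 512MB beyond space_info->total_bytes, while a reservation that cannot flush everything may overcommit by up to 4GB >> 1 = 2GB.
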
 /*
  * shrink metadata reservation for delalloc
  */
-static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
-                          bool wait_ordered)
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+                           bool wait_ordered)
 {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
-       u64 reserved;
+       u64 delalloc_bytes;
        u64 max_reclaim;
-       u64 reclaimed = 0;
        long time_left;
        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
-       unsigned long progress;
+       enum btrfs_reserve_flush_enum flush;
 
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
        space_info = block_rsv->space_info;
 
        smp_mb();
-       reserved = space_info->bytes_may_use;
-       progress = space_info->reservation_progress;
-
-       if (reserved == 0)
-               return 0;
-
-       smp_mb();
-       if (root->fs_info->delalloc_bytes == 0) {
+       delalloc_bytes = root->fs_info->delalloc_bytes;
+       if (delalloc_bytes == 0) {
                if (trans)
-                       return 0;
-               btrfs_wait_ordered_extents(root, 0, 0);
-               return 0;
+                       return;
+               btrfs_wait_ordered_extents(root, 0);
+               return;
        }
 
-       max_reclaim = min(reserved, to_reclaim);
-       nr_pages = max_t(unsigned long, nr_pages,
-                        max_reclaim >> PAGE_CACHE_SHIFT);
-       while (loops < 1024) {
-               /* have the flusher threads jump in and do some IO */
-               smp_mb();
-               nr_pages = min_t(unsigned long, nr_pages,
-                      root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
-               writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
+       while (delalloc_bytes && loops < 3) {
+               max_reclaim = min(delalloc_bytes, to_reclaim);
+               nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+               writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
+                                              WB_REASON_FS_FREE_SPACE);
+
+               /*
+                * We need to wait for the async pages to actually start before
+                * we do anything.
+                */
+               wait_event(root->fs_info->async_submit_wait,
+                          !atomic_read(&root->fs_info->async_delalloc_pages));
 
+               if (!trans)
+                       flush = BTRFS_RESERVE_FLUSH_ALL;
+               else
+                       flush = BTRFS_RESERVE_NO_FLUSH;
                spin_lock(&space_info->lock);
-               if (reserved > space_info->bytes_may_use)
-                       reclaimed += reserved - space_info->bytes_may_use;
-               reserved = space_info->bytes_may_use;
+               if (can_overcommit(root, space_info, orig, flush)) {
+                       spin_unlock(&space_info->lock);
+                       break;
+               }
                spin_unlock(&space_info->lock);
 
                loops++;
-
-               if (reserved == 0 || reclaimed >= max_reclaim)
-                       break;
-
-               if (trans && trans->transaction->blocked)
-                       return -EAGAIN;
-
                if (wait_ordered && !trans) {
-                       btrfs_wait_ordered_extents(root, 0, 0);
+                       btrfs_wait_ordered_extents(root, 0);
                } else {
-                       time_left = schedule_timeout_interruptible(1);
-
-                       /* We were interrupted, exit */
+                       time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }
-
-               /* we've kicked the IO a few times, if anything has been freed,
-                * exit.  There is no sense in looping here for a long time
-                * when we really need to commit the transaction, or there are
-                * just too many writers without enough free space
-                */
-
-               if (loops > 3) {
-                       smp_mb();
-                       if (progress != space_info->reservation_progress)
-                               break;
-               }
-
+               smp_mb();
+               delalloc_bytes = root->fs_info->delalloc_bytes;
        }
-
-       return reclaimed >= to_reclaim;
 }
 
 /**
@@ -3502,12 +3794,15 @@ static int may_commit_transaction(struct btrfs_root *root,
        if (space_info != delayed_rsv->space_info)
                return -ENOSPC;
 
+       spin_lock(&space_info->lock);
        spin_lock(&delayed_rsv->lock);
-       if (delayed_rsv->size < bytes) {
+       if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
                spin_unlock(&delayed_rsv->lock);
+               spin_unlock(&space_info->lock);
                return -ENOSPC;
        }
        spin_unlock(&delayed_rsv->lock);
+       spin_unlock(&space_info->lock);
 
 commit:
        trans = btrfs_join_transaction(root);
@@ -3517,6 +3812,72 @@ commit:
        return btrfs_commit_transaction(trans, root);
 }
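
The hunk above changes may_commit_transaction() to also count pinned bytes: committing a transaction moves bytes_pinned back to free space, so a commit is only refused when even the pinned space plus the delayed-inode reservation cannot cover the request. A tiny stand-alone model of just that check (may_commit_model() and the sizes are made up for illustration):

#include <stdint.h>
#include <stdio.h>

static int may_commit_model(uint64_t bytes_pinned, uint64_t delayed_rsv_size,
			    uint64_t bytes_needed)
{
	if (bytes_pinned + delayed_rsv_size < bytes_needed)
		return -1;	/* -ENOSPC: a commit would not free enough */
	return 0;		/* worth committing */
}

int main(void)
{
	/* The old check looked at the delayed rsv alone and would give up. */
	printf("%d\n", may_commit_model(8 << 20, 1 << 20, 4 << 20));	/* 0  */
	printf("%d\n", may_commit_model(0,       1 << 20, 4 << 20));	/* -1 */
	return 0;
}
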
 
+enum flush_state {
+       FLUSH_DELAYED_ITEMS_NR  =       1,
+       FLUSH_DELAYED_ITEMS     =       2,
+       FLUSH_DELALLOC          =       3,
+       FLUSH_DELALLOC_WAIT     =       4,
+       ALLOC_CHUNK             =       5,
+       COMMIT_TRANS            =       6,
+};
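
The states above are tried in ascending order by the reworked reserve_metadata_bytes() further down: cheap work first (running a handful of delayed items), a transaction commit last. A minimal user-space sketch of that escalation loop, with try_reserve() and do_flush() as made-up stubs; only the control flow is intended to mirror the patch:

#include <stdio.h>

enum flush_state {			/* same values as in the patch */
	FLUSH_DELAYED_ITEMS_NR	= 1,
	FLUSH_DELAYED_ITEMS	= 2,
	FLUSH_DELALLOC		= 3,
	FLUSH_DELALLOC_WAIT	= 4,
	ALLOC_CHUNK		= 5,
	COMMIT_TRANS		= 6,
};

/* Hypothetical stubs: always fail the reservation, do no real flushing. */
static int try_reserve(void) { return -1; }
static void do_flush(int state) { printf("flush state %d\n", state); }

int main(void)
{
	int state = FLUSH_DELAYED_ITEMS_NR;
	int ret = try_reserve();

	/* Escalate one state per failed attempt, cheapest first. */
	while (ret && state <= COMMIT_TRANS) {
		do_flush(state++);
		ret = try_reserve();
	}
	return ret ? 1 : 0;	/* still failing == -ENOSPC analogue */
}
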
+
+static int flush_space(struct btrfs_root *root,
+                      struct btrfs_space_info *space_info, u64 num_bytes,
+                      u64 orig_bytes, int state)
+{
+       struct btrfs_trans_handle *trans;
+       int nr;
+       int ret = 0;
+
+       switch (state) {
+       case FLUSH_DELAYED_ITEMS_NR:
+       case FLUSH_DELAYED_ITEMS:
+               if (state == FLUSH_DELAYED_ITEMS_NR) {
+                       u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+                       nr = (int)div64_u64(num_bytes, bytes);
+                       if (!nr)
+                               nr = 1;
+                       nr *= 2;
+               } else {
+                       nr = -1;
+               }
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+               ret = btrfs_run_delayed_items_nr(trans, root, nr);
+               btrfs_end_transaction(trans, root);
+               break;
+       case FLUSH_DELALLOC:
+       case FLUSH_DELALLOC_WAIT:
+               shrink_delalloc(root, num_bytes, orig_bytes,
+                               state == FLUSH_DELALLOC_WAIT);
+               break;
+       case ALLOC_CHUNK:
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+               ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                                    btrfs_get_alloc_profile(root, 0),
+                                    CHUNK_ALLOC_NO_FORCE);
+               btrfs_end_transaction(trans, root);
+               if (ret == -ENOSPC)
+                       ret = 0;
+               break;
+       case COMMIT_TRANS:
+               ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+               break;
+       default:
+               ret = -ENOSPC;
+               break;
+       }
+
+       return ret;
+}
 /**
  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
  * @root - the root we're allocating for
@@ -3533,25 +3894,25 @@ commit:
  */
 static int reserve_metadata_bytes(struct btrfs_root *root,
                                  struct btrfs_block_rsv *block_rsv,
-                                 u64 orig_bytes, int flush)
+                                 u64 orig_bytes,
+                                 enum btrfs_reserve_flush_enum flush)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
        u64 num_bytes = orig_bytes;
-       int retries = 0;
+       int flush_state = FLUSH_DELAYED_ITEMS_NR;
        int ret = 0;
-       bool committed = false;
        bool flushing = false;
-       bool wait_ordered = false;
 
 again:
        ret = 0;
        spin_lock(&space_info->lock);
        /*
-        * We only want to wait if somebody other than us is flushing and we are
-        * actually alloed to flush.
+        * We only want to wait if somebody other than us is flushing and we
+        * are actually allowed to flush everything.
         */
-       while (flush && !flushing && space_info->flush) {
+       while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+              space_info->flush) {
                spin_unlock(&space_info->lock);
                /*
                 * If we have a trans handle we can't wait because the flusher
@@ -3561,9 +3922,8 @@ again:
                 */
                if (current->journal_info)
                        return -EAGAIN;
-               ret = wait_event_interruptible(space_info->wait,
-                                              !space_info->flush);
-               /* Must have been interrupted, return */
+               ret = wait_event_killable(space_info->wait, !space_info->flush);
+               /* Must have been killed, return */
                if (ret)
                        return -EINTR;
 
@@ -3585,6 +3945,8 @@ again:
        if (used <= space_info->total_bytes) {
                if (used + orig_bytes <= space_info->total_bytes) {
                        space_info->bytes_may_use += orig_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                               "space_info", space_info->flags, orig_bytes, 1);
                        ret = 0;
                } else {
                        /*
@@ -3600,109 +3962,57 @@ again:
                 * amount plus the amount of bytes that we need for this
                 * reservation.
                 */
-               wait_ordered = true;
                num_bytes = used - space_info->total_bytes +
-                       (orig_bytes * (retries + 1));
+                       (orig_bytes * 2);
        }
 
-       if (ret) {
-               u64 profile = btrfs_get_alloc_profile(root, 0);
-               u64 avail;
-
-               /*
-                * If we have a lot of space that's pinned, don't bother doing
-                * the overcommit dance yet and just commit the transaction.
-                */
-               avail = (space_info->total_bytes - space_info->bytes_used) * 8;
-               do_div(avail, 10);
-               if (space_info->bytes_pinned >= avail && flush && !committed) {
-                       space_info->flush = 1;
-                       flushing = true;
-                       spin_unlock(&space_info->lock);
-                       ret = may_commit_transaction(root, space_info,
-                                                    orig_bytes, 1);
-                       if (ret)
-                               goto out;
-                       committed = true;
-                       goto again;
-               }
-
-               spin_lock(&root->fs_info->free_chunk_lock);
-               avail = root->fs_info->free_chunk_space;
-
-               /*
-                * If we have dup, raid1 or raid10 then only half of the free
-                * space is actually useable.
-                */
-               if (profile & (BTRFS_BLOCK_GROUP_DUP |
-                              BTRFS_BLOCK_GROUP_RAID1 |
-                              BTRFS_BLOCK_GROUP_RAID10))
-                       avail >>= 1;
-
-               /*
-                * If we aren't flushing don't let us overcommit too much, say
-                * 1/8th of the space.  If we can flush, let it overcommit up to
-                * 1/2 of the space.
-                */
-               if (flush)
-                       avail >>= 3;
-               else
-                       avail >>= 1;
-                spin_unlock(&root->fs_info->free_chunk_lock);
-
-               if (used + num_bytes < space_info->total_bytes + avail) {
-                       space_info->bytes_may_use += orig_bytes;
-                       ret = 0;
-               } else {
-                       wait_ordered = true;
-               }
+       if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+               space_info->bytes_may_use += orig_bytes;
+               trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                             space_info->flags, orig_bytes,
+                                             1);
+               ret = 0;
        }
 
        /*
         * Couldn't make our reservation, save our place so while we're trying
         * to reclaim space we can actually use it instead of somebody else
         * stealing it from us.
+        *
+        * We only make the other tasks wait for the flush when we are
+        * allowed to flush everything.
         */
-       if (ret && flush) {
+       if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
                flushing = true;
                space_info->flush = 1;
        }
 
        spin_unlock(&space_info->lock);
 
-       if (!ret || !flush)
-               goto out;
-
-       /*
-        * We do synchronous shrinking since we don't actually unreserve
-        * metadata until after the IO is completed.
-        */
-       ret = shrink_delalloc(root, num_bytes, wait_ordered);
-       if (ret < 0)
+       if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                goto out;
 
-       ret = 0;
+       ret = flush_space(root, space_info, num_bytes, orig_bytes,
+                         flush_state);
+       flush_state++;
 
        /*
-        * So if we were overcommitted it's possible that somebody else flushed
-        * out enough space and we simply didn't have enough space to reclaim,
-        * so go back around and try again.
+        * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
+        * could occur, so skip the delalloc flush states.
         */
-       if (retries < 2) {
-               wait_ordered = true;
-               retries++;
-               goto again;
-       }
+       if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+           (flush_state == FLUSH_DELALLOC ||
+            flush_state == FLUSH_DELALLOC_WAIT))
+               flush_state = ALLOC_CHUNK;
 
-       ret = -ENOSPC;
-       if (committed)
-               goto out;
-
-       ret = may_commit_transaction(root, space_info, orig_bytes, 0);
-       if (!ret) {
-               committed = true;
+       if (!ret)
+               goto again;
+       else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+                flush_state < COMMIT_TRANS)
+               goto again;
+       else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+                flush_state <= COMMIT_TRANS)
                goto again;
-       }
 
 out:
        if (flushing) {
@@ -3714,12 +4024,16 @@ out:
        return ret;
 }
 
-static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
-                                            struct btrfs_root *root)
+static struct btrfs_block_rsv *get_block_rsv(
+                                       const struct btrfs_trans_handle *trans,
+                                       const struct btrfs_root *root)
 {
        struct btrfs_block_rsv *block_rsv = NULL;
 
-       if (root->ref_cows || root == root->fs_info->csum_root)
+       if (root->ref_cows)
+               block_rsv = trans->block_rsv;
+
+       if (root == root->fs_info->csum_root && trans->adding_csums)
                block_rsv = trans->block_rsv;
 
        if (!block_rsv)
@@ -3758,7 +4072,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
        spin_unlock(&block_rsv->lock);
 }
 
-static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+                                   struct btrfs_block_rsv *block_rsv,
                                    struct btrfs_block_rsv *dest, u64 num_bytes)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3794,6 +4109,8 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
                if (num_bytes) {
                        spin_lock(&space_info->lock);
                        space_info->bytes_may_use -= num_bytes;
+                       trace_btrfs_space_reservation(fs_info, "space_info",
+                                       space_info->flags, num_bytes, 0);
                        space_info->reservation_progress++;
                        spin_unlock(&space_info->lock);
                }
@@ -3813,13 +4130,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
        return 0;
 }
 
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 {
        memset(rsv, 0, sizeof(*rsv));
        spin_lock_init(&rsv->lock);
+       rsv->type = type;
 }
 
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+                                             unsigned short type)
 {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3828,7 +4147,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
        if (!block_rsv)
                return NULL;
 
-       btrfs_init_block_rsv(block_rsv);
+       btrfs_init_block_rsv(block_rsv, type);
        block_rsv->space_info = __find_space_info(fs_info,
                                                  BTRFS_BLOCK_GROUP_METADATA);
        return block_rsv;
@@ -3837,13 +4156,15 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 void btrfs_free_block_rsv(struct btrfs_root *root,
                          struct btrfs_block_rsv *rsv)
 {
+       if (!rsv)
+               return;
        btrfs_block_rsv_release(root, rsv, (u64)-1);
        kfree(rsv);
 }
 
-static inline int __block_rsv_add(struct btrfs_root *root,
-                                 struct btrfs_block_rsv *block_rsv,
-                                 u64 num_bytes, int flush)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+                       enum btrfs_reserve_flush_enum flush)
 {
        int ret;
 
@@ -3859,20 +4180,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
        return ret;
 }
 
-int btrfs_block_rsv_add(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes)
-{
-       return __block_rsv_add(root, block_rsv, num_bytes, 1);
-}
-
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-                               struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes)
-{
-       return __block_rsv_add(root, block_rsv, num_bytes, 0);
-}
-
 int btrfs_block_rsv_check(struct btrfs_root *root,
                          struct btrfs_block_rsv *block_rsv, int min_factor)
 {
@@ -3891,9 +4198,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
        return ret;
 }
 
-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
-                                          struct btrfs_block_rsv *block_rsv,
-                                          u64 min_reserved, int flush)
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+                          enum btrfs_reserve_flush_enum flush)
 {
        u64 num_bytes = 0;
        int ret = -ENOSPC;
@@ -3921,20 +4228,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
        return ret;
 }
 
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-                          struct btrfs_block_rsv *block_rsv,
-                          u64 min_reserved)
-{
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
-}
-
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-                                  struct btrfs_block_rsv *block_rsv,
-                                  u64 min_reserved)
-{
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
-}
-
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv,
                            u64 num_bytes)
@@ -3950,7 +4243,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
        if (global_rsv->full || global_rsv == block_rsv ||
            block_rsv->space_info != global_rsv->space_info)
                global_rsv = NULL;
-       block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+       block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+                               num_bytes);
 }
 
 /*
@@ -3996,8 +4290,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 
        num_bytes = calc_global_metadata_size(fs_info);
 
-       spin_lock(&block_rsv->lock);
        spin_lock(&sinfo->lock);
+       spin_lock(&block_rsv->lock);
 
        block_rsv->size = num_bytes;
 
@@ -4009,18 +4303,22 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
                num_bytes = sinfo->total_bytes - num_bytes;
                block_rsv->reserved += num_bytes;
                sinfo->bytes_may_use += num_bytes;
+               trace_btrfs_space_reservation(fs_info, "space_info",
+                                     sinfo->flags, num_bytes, 1);
        }
 
        if (block_rsv->reserved >= block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
                sinfo->bytes_may_use -= num_bytes;
+               trace_btrfs_space_reservation(fs_info, "space_info",
+                                     sinfo->flags, num_bytes, 0);
                sinfo->reservation_progress++;
                block_rsv->reserved = block_rsv->size;
                block_rsv->full = 1;
        }
 
-       spin_unlock(&sinfo->lock);
        spin_unlock(&block_rsv->lock);
+       spin_unlock(&sinfo->lock);
 }
 
 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4048,7 +4346,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
-       block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+       block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+                               (u64)-1);
        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
        WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,13 +4361,19 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
 {
+       if (!trans->block_rsv)
+               return;
+
        if (!trans->bytes_reserved)
                return;
 
+       trace_btrfs_space_reservation(root->fs_info, "transaction",
+                                     trans->transid, trans->bytes_reserved, 0);
        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
        trans->bytes_reserved = 0;
 }
 
+/* Can only return 0 or -ENOSPC */
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
                                  struct inode *inode)
 {
@@ -4082,6 +4387,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
         * when we are truly done with the orphan item.
         */
        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+       trace_btrfs_space_reservation(root->fs_info, "orphan",
+                                     btrfs_ino(inode), num_bytes, 1);
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
@@ -4089,6 +4396,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+       trace_btrfs_space_reservation(root->fs_info, "orphan",
+                                     btrfs_ino(inode), num_bytes, 0);
        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
 
@@ -4099,10 +4408,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
        struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
        /*
-        * two for root back/forward refs, two for directory entries
-        * and one for root of the snapshot.
+        * two for root back/forward refs, two for directory entries,
+        * one for root of the snapshot and one for parent inode.
         */
-       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+       u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
        dst_rsv->space_info = src_rsv->space_info;
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
@@ -4125,10 +4434,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
        BTRFS_I(inode)->outstanding_extents--;
 
        if (BTRFS_I(inode)->outstanding_extents == 0 &&
-           BTRFS_I(inode)->delalloc_meta_reserved) {
+           test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                              &BTRFS_I(inode)->runtime_flags))
                drop_inode_space = 1;
-               BTRFS_I(inode)->delalloc_meta_reserved = 0;
-       }
 
        /*
         * If we have more or the same amount of outstanding extents than we have
@@ -4210,18 +4518,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        u64 csum_bytes;
        unsigned nr_extents = 0;
        int extra_reserve = 0;
-       int flush = 1;
+       enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret;
 
        /* Need to be holding the i_mutex here if we aren't free space cache */
-       if (btrfs_is_free_space_inode(root, inode))
-               flush = 0;
-       else
-               WARN_ON(!mutex_is_locked(&inode->i_mutex));
+       if (btrfs_is_free_space_inode(inode))
+               flush = BTRFS_RESERVE_NO_FLUSH;
 
-       if (flush && btrfs_transaction_in_commit(root->fs_info))
+       if (flush != BTRFS_RESERVE_NO_FLUSH &&
+           btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
 
+       mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
        num_bytes = ALIGN(num_bytes, root->sectorsize);
 
        spin_lock(&BTRFS_I(inode)->lock);
@@ -4236,7 +4544,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
         * Add an item to reserve for updating the inode when we complete the
         * delalloc io.
         */
-       if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+       if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                     &BTRFS_I(inode)->runtime_flags)) {
                nr_extents++;
                extra_reserve = 1;
        }
@@ -4246,6 +4555,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        csum_bytes = BTRFS_I(inode)->csum_bytes;
        spin_unlock(&BTRFS_I(inode)->lock);
 
+       if (root->fs_info->quota_enabled) {
+               ret = btrfs_qgroup_reserve(root, num_bytes +
+                                          nr_extents * root->leafsize);
+               if (ret) {
+                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+                       return ret;
+               }
+       }
+
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (ret) {
                u64 to_free = 0;
@@ -4269,19 +4587,30 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                if (dropped)
                        to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
-               if (to_free)
+               if (to_free) {
                        btrfs_block_rsv_release(root, block_rsv, to_free);
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "delalloc",
+                                                     btrfs_ino(inode),
+                                                     to_free, 0);
+               }
+               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                return ret;
        }
 
        spin_lock(&BTRFS_I(inode)->lock);
        if (extra_reserve) {
-               BTRFS_I(inode)->delalloc_meta_reserved = 1;
+               set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                       &BTRFS_I(inode)->runtime_flags);
                nr_extents--;
        }
        BTRFS_I(inode)->reserved_extents += nr_extents;
        spin_unlock(&BTRFS_I(inode)->lock);
+       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 
+       if (to_reserve)
+               trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                             btrfs_ino(inode), to_reserve, 1);
        block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
        return 0;
@@ -4311,6 +4640,13 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
+       trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                     btrfs_ino(inode), to_free, 0);
+       if (root->fs_info->quota_enabled) {
+               btrfs_qgroup_free(root, num_bytes +
+                                       dropped * root->leafsize);
+       }
+
        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                to_free);
 }
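
The two functions above pair up: btrfs_delalloc_reserve_metadata() reserves items for the new outstanding extents (plus one extra item for the inode update the first time, now tracked by the BTRFS_INODE_DELALLOC_META_RESERVED runtime flag), and btrfs_delalloc_release_metadata() gives them back once drop_outstanding_extent() says the extents have completed. A simplified user-space model of the reserve-side accounting only; struct inode_model and reserve() are illustrative, not the kernel structures, and the csum-bytes part is left out:

#include <stdbool.h>
#include <stdio.h>

struct inode_model {
	unsigned outstanding_extents;
	bool meta_reserved;	/* models BTRFS_INODE_DELALLOC_META_RESERVED */
};

/* Returns how many metadata items this reservation needs. */
static unsigned reserve(struct inode_model *ino, unsigned new_extents)
{
	unsigned items = new_extents;

	ino->outstanding_extents += new_extents;
	if (!ino->meta_reserved) {	/* extra item for the inode update */
		ino->meta_reserved = true;
		items++;
	}
	return items;
}

int main(void)
{
	struct inode_model ino = { 0, false };

	printf("first write reserves %u items\n", reserve(&ino, 1));	/* 2 */
	printf("second write reserves %u items\n", reserve(&ino, 1));	/* 1 */
	return 0;
}
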
@@ -4390,7 +4726,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
        while (total) {
                cache = btrfs_lookup_block_group(info, bytenr);
                if (!cache)
-                       return -1;
+                       return -ENOENT;
                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
                                    BTRFS_BLOCK_GROUP_RAID1 |
                                    BTRFS_BLOCK_GROUP_RAID10))
@@ -4493,7 +4829,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
        struct btrfs_block_group_cache *cache;
 
        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-       BUG_ON(!cache);
+       BUG_ON(!cache); /* Logic error */
 
        pin_down_extent(root, cache, bytenr, num_bytes, reserved);
 
@@ -4511,7 +4847,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_cache *cache;
 
        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-       BUG_ON(!cache);
+       BUG_ON(!cache); /* Logic error */
 
        /*
         * pull in the free space cache (if any) so that our pin
@@ -4556,6 +4892,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 {
        struct btrfs_space_info *space_info = cache->space_info;
        int ret = 0;
+
        spin_lock(&space_info->lock);
        spin_lock(&cache->lock);
        if (reserve != RESERVE_FREE) {
@@ -4565,7 +4902,9 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
                        cache->reserved += num_bytes;
                        space_info->bytes_reserved += num_bytes;
                        if (reserve == RESERVE_ALLOC) {
-                               BUG_ON(space_info->bytes_may_use < num_bytes);
+                               trace_btrfs_space_reservation(cache->fs_info,
+                                               "space_info", space_info->flags,
+                                               num_bytes, 0);
                                space_info->bytes_may_use -= num_bytes;
                        }
                }
@@ -4581,7 +4920,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
        return ret;
 }
 
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4611,22 +4950,25 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
        up_write(&fs_info->extent_commit_sem);
 
        update_global_block_rsv(fs_info);
-       return 0;
 }
 
 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_space_info *space_info;
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 len;
+       bool readonly;
 
        while (start <= end) {
+               readonly = false;
                if (!cache ||
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                                btrfs_put_block_group(cache);
                        cache = btrfs_lookup_block_group(fs_info, start);
-                       BUG_ON(!cache);
+                       BUG_ON(!cache); /* Logic error */
                }
 
                len = cache->key.objectid + cache->key.offset - start;
@@ -4638,15 +4980,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
                }
 
                start += len;
+               space_info = cache->space_info;
 
-               spin_lock(&cache->space_info->lock);
+               spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
-               cache->space_info->bytes_pinned -= len;
-               if (cache->ro)
-                       cache->space_info->bytes_readonly += len;
+               space_info->bytes_pinned -= len;
+               if (cache->ro) {
+                       space_info->bytes_readonly += len;
+                       readonly = true;
+               }
                spin_unlock(&cache->lock);
-               spin_unlock(&cache->space_info->lock);
+               if (!readonly && global_rsv->space_info == space_info) {
+                       spin_lock(&global_rsv->lock);
+                       if (!global_rsv->full) {
+                               len = min(len, global_rsv->size -
+                                         global_rsv->reserved);
+                               global_rsv->reserved += len;
+                               space_info->bytes_may_use += len;
+                               if (global_rsv->reserved >= global_rsv->size)
+                                       global_rsv->full = 1;
+                       }
+                       spin_unlock(&global_rsv->lock);
+               }
+               spin_unlock(&space_info->lock);
        }
 
        if (cache)
@@ -4663,6 +5020,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
        u64 end;
        int ret;
 
+       if (trans->aborted)
+               return 0;
+
        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
                unpin = &fs_info->freed_extents[1];
        else
@@ -4670,7 +5030,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
        while (1) {
                ret = find_first_extent_bit(unpin, 0, &start, &end,
-                                           EXTENT_DIRTY);
+                                           EXTENT_DIRTY, NULL);
                if (ret)
                        break;
 
@@ -4748,7 +5108,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    NULL, refs_to_drop,
                                                    is_data);
-                       BUG_ON(ret);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                        btrfs_release_path(path);
                        path->leave_spinning = 1;
 
@@ -4766,10 +5129,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                        btrfs_print_leaf(extent_root,
                                                         path->nodes[0]);
                        }
-                       BUG_ON(ret);
+                       if (ret < 0) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                        extent_slot = path->slots[0];
                }
-       } else {
+       } else if (ret == -ENOENT) {
                btrfs_print_leaf(extent_root, path->nodes[0]);
                WARN_ON(1);
                printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
@@ -4779,6 +5145,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                       (unsigned long long)root_objectid,
                       (unsigned long long)owner_objectid,
                       (unsigned long long)owner_offset);
+       } else {
+               btrfs_abort_transaction(trans, extent_root, ret);
+               goto out;
        }
 
        leaf = path->nodes[0];
@@ -4788,7 +5157,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                BUG_ON(found_extent || extent_slot != path->slots[0]);
                ret = convert_extent_item_v0(trans, extent_root, path,
                                             owner_objectid, 0);
-               BUG_ON(ret < 0);
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
 
                btrfs_release_path(path);
                path->leave_spinning = 1;
@@ -4805,7 +5177,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               (unsigned long long)bytenr);
                        btrfs_print_leaf(extent_root, path->nodes[0]);
                }
-               BUG_ON(ret);
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
+
                extent_slot = path->slots[0];
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -4842,7 +5218,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    iref, refs_to_drop,
                                                    is_data);
-                       BUG_ON(ret);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                }
        } else {
                if (found_extent) {
@@ -4859,21 +5238,27 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
-               BUG_ON(ret);
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
                btrfs_release_path(path);
 
                if (is_data) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
-                       BUG_ON(ret);
-               } else {
-                       invalidate_mapping_pages(info->btree_inode->i_mapping,
-                            bytenr >> PAGE_CACHE_SHIFT,
-                            (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, extent_root, ret);
+                               goto out;
+                       }
                }
 
                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
-               BUG_ON(ret);
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
        }
+out:
        btrfs_free_path(path);
        return ret;
 }
@@ -4964,11 +5349,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
        int ret;
 
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
-                                               parent, root->root_key.objectid,
-                                               btrfs_header_level(buf),
-                                               BTRFS_DROP_DELAYED_REF, NULL);
-               BUG_ON(ret);
+               ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+                                       buf->start, buf->len,
+                                       parent, root->root_key.objectid,
+                                       btrfs_header_level(buf),
+                                       BTRFS_DROP_DELAYED_REF, NULL, 0);
+               BUG_ON(ret); /* -ENOMEM */
        }
 
        if (!last_ref)
@@ -5002,12 +5388,13 @@ out:
        btrfs_put_block_group(cache);
 }
 
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
-                     struct btrfs_root *root,
-                     u64 bytenr, u64 num_bytes, u64 parent,
-                     u64 root_objectid, u64 owner, u64 offset)
+/* Can return -ENOMEM */
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                     u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+                     u64 owner, u64 offset, int for_cow)
 {
        int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
 
        /*
         * tree log blocks never actually go into the extent allocation
@@ -5019,15 +5406,16 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                btrfs_pin_extent(root, bytenr, num_bytes, 1);
                ret = 0;
        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_DROP_DELAYED_REF, NULL);
-               BUG_ON(ret);
+                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
        } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
-                                       parent, root_objectid, owner,
-                                       offset, BTRFS_DROP_DELAYED_REF, NULL);
-               BUG_ON(ret);
+               ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+                                               num_bytes,
+                                               parent, root_objectid, owner,
+                                               offset, BTRFS_DROP_DELAYED_REF,
+                                               NULL, for_cow);
        }
        return ret;
 }
@@ -5084,28 +5472,34 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
        return 0;
 }
 
-static int get_block_group_index(struct btrfs_block_group_cache *cache)
+static int __get_block_group_index(u64 flags)
 {
        int index;
-       if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+
+       if (flags & BTRFS_BLOCK_GROUP_RAID10)
                index = 0;
-       else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+       else if (flags & BTRFS_BLOCK_GROUP_RAID1)
                index = 1;
-       else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+       else if (flags & BTRFS_BLOCK_GROUP_DUP)
                index = 2;
-       else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+       else if (flags & BTRFS_BLOCK_GROUP_RAID0)
                index = 3;
        else
                index = 4;
+
        return index;
 }
 
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+       return __get_block_group_index(cache->flags);
+}
+
 enum btrfs_loop_type {
-       LOOP_FIND_IDEAL = 0,
-       LOOP_CACHING_NOWAIT = 1,
-       LOOP_CACHING_WAIT = 2,
-       LOOP_ALLOC_CHUNK = 3,
-       LOOP_NO_EMPTY_SIZE = 4,
+       LOOP_CACHING_NOWAIT = 0,
+       LOOP_CACHING_WAIT = 1,
+       LOOP_ALLOC_CHUNK = 2,
+       LOOP_NO_EMPTY_SIZE = 3,
 };
 
 /*
@@ -5119,7 +5513,6 @@ enum btrfs_loop_type {
 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *orig_root,
                                     u64 num_bytes, u64 empty_size,
-                                    u64 search_start, u64 search_end,
                                     u64 hint_byte, struct btrfs_key *ins,
                                     u64 data)
 {
@@ -5128,9 +5521,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        struct btrfs_free_cluster *last_ptr = NULL;
        struct btrfs_block_group_cache *block_group = NULL;
        struct btrfs_block_group_cache *used_block_group;
+       u64 search_start = 0;
        int empty_cluster = 2 * 1024 * 1024;
-       int allowed_chunk_alloc = 0;
-       int done_chunk_alloc = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = 0;
@@ -5141,14 +5533,14 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        bool failed_alloc = false;
        bool use_cluster = true;
        bool have_caching_bg = false;
-       u64 ideal_cache_percent = 0;
-       u64 ideal_cache_offset = 0;
 
        WARN_ON(num_bytes < root->sectorsize);
        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
        ins->objectid = 0;
        ins->offset = 0;
 
+       trace_find_free_extent(orig_root, num_bytes, empty_size, data);
+
        space_info = __find_space_info(root->fs_info, data);
        if (!space_info) {
                printk(KERN_ERR "No space info for %llu\n", data);
@@ -5162,9 +5554,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        if (btrfs_mixed_space_info(space_info))
                use_cluster = false;
 
-       if (orig_root->ref_cows || empty_size)
-               allowed_chunk_alloc = 1;
-
        if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
                last_ptr = &root->fs_info->meta_alloc_cluster;
                if (!btrfs_test_opt(root, SSD))
@@ -5190,7 +5579,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                empty_cluster = 0;
 
        if (search_start == hint_byte) {
-ideal_cache:
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
                used_block_group = block_group;
@@ -5202,8 +5590,7 @@ ideal_cache:
                 * picked out then we don't care that the block group is cached.
                 */
                if (block_group && block_group_bits(block_group, data) &&
-                   (block_group->cached != BTRFS_CACHE_NO ||
-                    search_start == ideal_cache_offset)) {
+                   block_group->cached != BTRFS_CACHE_NO) {
                        down_read(&space_info->groups_sem);
                        if (list_empty(&block_group->list) ||
                            block_group->ro) {
@@ -5257,56 +5644,16 @@ search:
 have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
-                       u64 free_percent;
-
                        found_uncached_bg = true;
                        ret = cache_block_group(block_group, trans,
-                                               orig_root, 1);
-                       if (block_group->cached == BTRFS_CACHE_FINISHED)
-                               goto alloc;
-
-                       free_percent = btrfs_block_group_used(&block_group->item);
-                       free_percent *= 100;
-                       free_percent = div64_u64(free_percent,
-                                                block_group->key.offset);
-                       free_percent = 100 - free_percent;
-                       if (free_percent > ideal_cache_percent &&
-                           likely(!block_group->ro)) {
-                               ideal_cache_offset = block_group->key.objectid;
-                               ideal_cache_percent = free_percent;
-                       }
-
-                       /*
-                        * The caching workers are limited to 2 threads, so we
-                        * can queue as much work as we care to.
-                        */
-                       if (loop > LOOP_FIND_IDEAL) {
-                               ret = cache_block_group(block_group, trans,
-                                                       orig_root, 0);
-                               BUG_ON(ret);
-                       }
-
-                       /*
-                        * If loop is set for cached only, try the next block
-                        * group.
-                        */
-                       if (loop == LOOP_FIND_IDEAL)
-                               goto loop;
+                                               orig_root, 0);
+                       BUG_ON(ret < 0);
+                       ret = 0;
                }
 
-alloc:
                if (unlikely(block_group->ro))
                        goto loop;
 
-               spin_lock(&block_group->free_space_ctl->tree_lock);
-               if (cached &&
-                   block_group->free_space_ctl->free_space <
-                   num_bytes + empty_cluster + empty_size) {
-                       spin_unlock(&block_group->free_space_ctl->tree_lock);
-                       goto loop;
-               }
-               spin_unlock(&block_group->free_space_ctl->tree_lock);
-
                /*
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
@@ -5334,6 +5681,8 @@ alloc:
                        if (offset) {
                                /* we have a block, we're done */
                                spin_unlock(&last_ptr->refill_lock);
+                               trace_btrfs_reserve_extent_cluster(root,
+                                       block_group, search_start, num_bytes);
                                goto checks;
                        }
 
@@ -5352,8 +5701,15 @@ refill_cluster:
                         * plenty of times and not have found
                         * anything, so we are likely way too
                         * fragmented for the clustering stuff to find
-                        * anything.  */
-                       if (loop >= LOOP_NO_EMPTY_SIZE) {
+                        * anything.
+                        *
+                        * However, if the cluster is taken from the
+                        * current block group, release the cluster
+                        * first, so that we stand a better chance of
+                        * succeeding in the unclustered
+                        * allocation.  */
+                       if (loop >= LOOP_NO_EMPTY_SIZE &&
+                           last_ptr->block_group != block_group) {
                                spin_unlock(&last_ptr->refill_lock);
                                goto unclustered_alloc;
                        }
@@ -5364,6 +5720,11 @@ refill_cluster:
                         */
                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
+                       if (loop >= LOOP_NO_EMPTY_SIZE) {
+                               spin_unlock(&last_ptr->refill_lock);
+                               goto unclustered_alloc;
+                       }
+
                        /* allocate a cluster in this block group */
                        ret = btrfs_find_space_cluster(trans, root,
                                               block_group, last_ptr,
@@ -5380,6 +5741,9 @@ refill_cluster:
                                if (offset) {
                                        /* we found one, proceed */
                                        spin_unlock(&last_ptr->refill_lock);
+                                       trace_btrfs_reserve_extent_cluster(root,
+                                               block_group, search_start,
+                                               num_bytes);
                                        goto checks;
                                }
                        } else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5404,6 +5768,15 @@ refill_cluster:
                }
 
 unclustered_alloc:
+               spin_lock(&block_group->free_space_ctl->tree_lock);
+               if (cached &&
+                   block_group->free_space_ctl->free_space <
+                   num_bytes + empty_cluster + empty_size) {
+                       spin_unlock(&block_group->free_space_ctl->tree_lock);
+                       goto loop;
+               }
+               spin_unlock(&block_group->free_space_ctl->tree_lock);
+
                offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                    num_bytes, empty_size);
                /*
@@ -5428,11 +5801,6 @@ unclustered_alloc:
                }
 checks:
                search_start = stripe_align(root, offset);
-               /* move on to the next group */
-               if (search_start + num_bytes >= search_end) {
-                       btrfs_add_free_space(used_block_group, offset, num_bytes);
-                       goto loop;
-               }
 
                /* move on to the next group */
                if (search_start + num_bytes >
@@ -5441,9 +5809,6 @@ checks:
                        goto loop;
                }
 
-               ins->objectid = search_start;
-               ins->offset = num_bytes;
-
                if (offset < search_start)
                        btrfs_add_free_space(used_block_group, offset,
                                             search_start - offset);
@@ -5460,10 +5825,8 @@ checks:
                ins->objectid = search_start;
                ins->offset = num_bytes;
 
-               if (offset < search_start)
-                       btrfs_add_free_space(used_block_group, offset,
-                                            search_start - offset);
-               BUG_ON(offset > search_start);
+               trace_btrfs_reserve_extent(orig_root, block_group,
+                                          search_start, num_bytes);
                if (used_block_group != block_group)
                        btrfs_put_block_group(used_block_group);
                btrfs_put_block_group(block_group);
@@ -5484,9 +5847,7 @@ loop:
        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
                goto search;
 
-       /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
-        *                      for them to make caching progress.  Also
-        *                      determine the best possible bg to cache
+       /*
         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
         *                      caching kthreads as we move along
         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
@@ -5496,65 +5857,19 @@ loop:
         */
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                index = 0;
-               if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
-                       found_uncached_bg = false;
-                       loop++;
-                       if (!ideal_cache_percent)
-                               goto search;
-
-                       /*
-                        * 1 of the following 2 things have happened so far
-                        *
-                        * 1) We found an ideal block group for caching that
-                        * is mostly full and will cache quickly, so we might
-                        * as well wait for it.
-                        *
-                        * 2) We searched for cached only and we didn't find
-                        * anything, and we didn't start any caching kthreads
-                        * either, so chances are we will loop through and
-                        * start a couple caching kthreads, and then come back
-                        * around and just wait for them.  This will be slower
-                        * because we will have 2 caching kthreads reading at
-                        * the same time when we could have just started one
-                        * and waited for it to get far enough to give us an
-                        * allocation, so go ahead and go to the wait caching
-                        * loop.
-                        */
-                       loop = LOOP_CACHING_WAIT;
-                       search_start = ideal_cache_offset;
-                       ideal_cache_percent = 0;
-                       goto ideal_cache;
-               } else if (loop == LOOP_FIND_IDEAL) {
-                       /*
-                        * Didn't find a uncached bg, wait on anything we find
-                        * next.
-                        */
-                       loop = LOOP_CACHING_WAIT;
-                       goto search;
-               }
-
                loop++;
-
                if (loop == LOOP_ALLOC_CHUNK) {
-                      if (allowed_chunk_alloc) {
-                               ret = do_chunk_alloc(trans, root, num_bytes +
-                                                    2 * 1024 * 1024, data,
-                                                    CHUNK_ALLOC_LIMITED);
-                               allowed_chunk_alloc = 0;
-                               if (ret == 1)
-                                       done_chunk_alloc = 1;
-                       } else if (!done_chunk_alloc &&
-                                  space_info->force_alloc ==
-                                  CHUNK_ALLOC_NO_FORCE) {
-                               space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+                       ret = do_chunk_alloc(trans, root, data,
+                                            CHUNK_ALLOC_FORCE);
+                       /*
+                        * Do not bail out on ENOSPC, since the later
+                        * loop states can still make progress.
+                        */
+                       if (ret < 0 && ret != -ENOSPC) {
+                               btrfs_abort_transaction(trans,
+                                                       root, ret);
+                               goto out;
                        }
-
-                      /*
-                       * We didn't allocate a chunk, go ahead and drop the
-                       * empty size and loop again.
-                       */
-                      if (!done_chunk_alloc)
-                              loop = LOOP_NO_EMPTY_SIZE;
                }
 
                if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5568,6 +5883,7 @@ loop:
        } else if (ins->objectid) {
                ret = 0;
        }
+out:
 
        return ret;
 }
@@ -5602,13 +5918,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 again:
        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
-               printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
-                      "%llu pinned %llu reserved\n",
+               printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
                       (unsigned long long)cache->key.objectid,
                       (unsigned long long)cache->key.offset,
                       (unsigned long long)btrfs_block_group_used(&cache->item),
                       (unsigned long long)cache->pinned,
-                      (unsigned long long)cache->reserved);
+                      (unsigned long long)cache->reserved,
+                      cache->ro ? "[readonly]" : "");
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
@@ -5621,44 +5937,35 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
-                        u64 search_end, struct btrfs_key *ins,
-                        u64 data)
+                        struct btrfs_key *ins, u64 data)
 {
+       bool final_tried = false;
        int ret;
-       u64 search_start = 0;
 
        data = btrfs_get_alloc_profile(root, data);
 again:
-       /*
-        * the only place that sets empty_size is btrfs_realloc_node, which
-        * is not called recursively on allocations
-        */
-       if (empty_size || root->ref_cows)
-               ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                    num_bytes + 2 * 1024 * 1024, data,
-                                    CHUNK_ALLOC_NO_FORCE);
-
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
-                              search_start, search_end, hint_byte,
-                              ins, data);
-
-       if (ret == -ENOSPC && num_bytes > min_alloc_size) {
-               num_bytes = num_bytes >> 1;
-               num_bytes = num_bytes & ~(root->sectorsize - 1);
-               num_bytes = max(num_bytes, min_alloc_size);
-               do_chunk_alloc(trans, root->fs_info->extent_root,
-                              num_bytes, data, CHUNK_ALLOC_FORCE);
-               goto again;
-       }
-       if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
-               struct btrfs_space_info *sinfo;
-
-               sinfo = __find_space_info(root->fs_info, data);
-               printk(KERN_ERR "btrfs allocation failed flags %llu, "
-                      "wanted %llu\n", (unsigned long long)data,
-                      (unsigned long long)num_bytes);
-               dump_space_info(sinfo, num_bytes, 1);
+                              hint_byte, ins, data);
+
+       if (ret == -ENOSPC) {
+               if (!final_tried) {
+                       num_bytes = num_bytes >> 1;
+                       num_bytes = num_bytes & ~(root->sectorsize - 1);
+                       num_bytes = max(num_bytes, min_alloc_size);
+                       if (num_bytes == min_alloc_size)
+                               final_tried = true;
+                       goto again;
+               } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+                       struct btrfs_space_info *sinfo;
+
+                       sinfo = __find_space_info(root->fs_info, data);
+                       printk(KERN_ERR "btrfs allocation failed flags %llu, "
+                              "wanted %llu\n", (unsigned long long)data,
+                              (unsigned long long)num_bytes);
+                       if (sinfo)
+                               dump_space_info(sinfo, num_bytes, 1);
+               }
        }
 
        trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
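
With the chunk-allocation calls gone from btrfs_reserve_extent(), the ENOSPC retry is reduced to shrinking the request: halve it, round down to the sector size, clamp to min_alloc_size, and remember when the clamp was hit so the next ENOSPC is final. A self-contained sketch of that shrink step; the `<=` clamp below is equivalent to the max() plus equality test in the hunk, and the names and the 4 KiB sector size are illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t shrink_request(uint64_t num_bytes, uint64_t sectorsize,
                               uint64_t min_alloc_size, bool *final_tried)
{
        num_bytes >>= 1;                        /* halve the request            */
        num_bytes &= ~(sectorsize - 1);         /* round down to sectorsize     */
        if (num_bytes <= min_alloc_size) {      /* clamp and mark the last try  */
                num_bytes = min_alloc_size;
                *final_tried = true;
        }
        return num_bytes;
}

int main(void)
{
        bool final_tried = false;
        uint64_t n = 1024 * 1024;               /* start with a 1 MiB request   */

        while (!final_tried) {
                n = shrink_request(n, 4096, 4096, &final_tried);
                printf("retrying with %llu bytes\n", (unsigned long long)n);
        }
        return 0;
}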
@@ -5736,7 +6043,10 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
                                      ins, size);
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_free_path(path);
+               return ret;
+       }
 
        leaf = path->nodes[0];
        extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5766,7 +6076,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_free_path(path);
 
        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
-       if (ret) {
+       if (ret) { /* -ENOENT, logic error */
                printk(KERN_ERR "btrfs update block group failed for %llu "
                       "%llu\n", (unsigned long long)ins->objectid,
                       (unsigned long long)ins->offset);
@@ -5797,7 +6107,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
                                      ins, size);
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_free_path(path);
+               return ret;
+       }
 
        leaf = path->nodes[0];
        extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5827,7 +6140,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        btrfs_free_path(path);
 
        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
-       if (ret) {
+       if (ret) { /* -ENOENT, logic error */
                printk(KERN_ERR "btrfs update block group failed for %llu "
                       "%llu\n", (unsigned long long)ins->objectid,
                       (unsigned long long)ins->offset);
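
Both hunks above replace a BUG_ON() after btrfs_insert_empty_item() with the same release-then-propagate pattern: free the locally allocated path and hand the error back to the caller. A generic sketch of that shape, with alloc_path()/insert_item() as invented placeholders rather than real btrfs helpers:

#include <errno.h>
#include <stdlib.h>

struct path { int dummy; };

static struct path *alloc_path(void) { return calloc(1, sizeof(struct path)); }
static int insert_item(struct path *p) { (void)p; return -ENOSPC; } /* stub failure */

static int insert_extent_item(void)
{
        struct path *path = alloc_path();
        int ret;

        if (!path)
                return -ENOMEM;

        ret = insert_item(path);
        if (ret) {
                free(path);             /* release what we own ...              */
                return ret;             /* ... then propagate instead of BUG()  */
        }

        /* ... fill in the item ... */
        free(path);
        return 0;
}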
@@ -5845,9 +6158,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 
        BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
-       ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
-                                        0, root_objectid, owner, offset,
-                                        BTRFS_ADD_DELAYED_EXTENT, NULL);
+       ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+                                        ins->offset, 0,
+                                        root_objectid, owner, offset,
+                                        BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
        return ret;
 }
 
@@ -5874,28 +6188,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        if (!caching_ctl) {
                BUG_ON(!block_group_cache_done(block_group));
                ret = btrfs_remove_free_space(block_group, start, num_bytes);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
        } else {
                mutex_lock(&caching_ctl->mutex);
 
                if (start >= caching_ctl->progress) {
                        ret = add_excluded_extent(root, start, num_bytes);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM */
                } else if (start + num_bytes <= caching_ctl->progress) {
                        ret = btrfs_remove_free_space(block_group,
                                                      start, num_bytes);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM */
                } else {
                        num_bytes = caching_ctl->progress - start;
                        ret = btrfs_remove_free_space(block_group,
                                                      start, num_bytes);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM */
 
                        start = caching_ctl->progress;
                        num_bytes = ins->objectid + ins->offset -
                                    caching_ctl->progress;
                        ret = add_excluded_extent(root, start, num_bytes);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM */
                }
 
                mutex_unlock(&caching_ctl->mutex);
@@ -5904,7 +6218,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 
        ret = btrfs_update_reserved_bytes(block_group, ins->offset,
                                          RESERVE_ALLOC_NO_ACCOUNT);
-       BUG_ON(ret);
+       BUG_ON(ret); /* logic error */
        btrfs_put_block_group(block_group);
        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
                                         0, owner, offset, ins, 1);
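
The branches above keep the free-space accounting consistent while a block group is still being cached: space the caching thread has already scanned must be removed from the free-space cache, space it has not reached yet must be marked excluded so it never gets added, and a range straddling the progress point is split at that point. A stand-alone sketch of that decision, with print-only stand-ins for btrfs_remove_free_space() and add_excluded_extent():

#include <stdint.h>
#include <stdio.h>

static void remove_free_space(uint64_t start, uint64_t len)
{
        printf("remove free space [%llu, +%llu)\n",
               (unsigned long long)start, (unsigned long long)len);
}

static void add_excluded(uint64_t start, uint64_t len)
{
        printf("exclude [%llu, +%llu)\n",
               (unsigned long long)start, (unsigned long long)len);
}

static void account_logged_extent(uint64_t start, uint64_t num_bytes,
                                  uint64_t progress, int caching_done)
{
        if (caching_done || start + num_bytes <= progress) {
                /* already cached: the range sits in the free-space cache */
                remove_free_space(start, num_bytes);
        } else if (start >= progress) {
                /* not cached yet: keep the caching thread from adding it */
                add_excluded(start, num_bytes);
        } else {
                /* straddles the progress point: split at 'progress' */
                remove_free_space(start, progress - start);
                add_excluded(progress, start + num_bytes - progress);
        }
}

int main(void)
{
        account_logged_extent(0, 4096, 8192, 0);        /* fully cached     */
        account_logged_extent(16384, 4096, 8192, 0);    /* not cached yet   */
        account_logged_extent(4096, 8192, 8192, 0);     /* straddles: split */
        return 0;
}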
@@ -5925,6 +6239,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
        clean_tree_block(trans, root, buf);
+       clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
        btrfs_set_lock_blocking(buf);
        btrfs_set_buffer_uptodate(buf);
@@ -5960,7 +6275,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        block_rsv = get_block_rsv(trans, root);
 
        if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                /*
                 * If we couldn't reserve metadata bytes try and use some from
                 * the global reserve.
@@ -5979,15 +6295,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        ret = block_rsv_use_bytes(block_rsv, blocksize);
        if (!ret)
                return block_rsv;
-       if (ret) {
+       if (ret && !block_rsv->failfast) {
                static DEFINE_RATELIMIT_STATE(_rs,
                                DEFAULT_RATELIMIT_INTERVAL,
                                /*DEFAULT_RATELIMIT_BURST*/ 2);
-               if (__ratelimit(&_rs)) {
-                       printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
-                       WARN_ON(1);
-               }
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               if (__ratelimit(&_rs))
+                       WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+                            ret);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                if (!ret) {
                        return block_rsv;
                } else if (ret && block_rsv != global_rsv) {
@@ -6000,10 +6316,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        return ERR_PTR(-ENOSPC);
 }
 
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+                           struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
        block_rsv_add_bytes(block_rsv, blocksize, 0);
-       block_rsv_release_bytes(block_rsv, NULL, 0);
+       block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
 }
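
use_block_rsv() now walks a short fallback chain: consume from the transaction's own reserve, and unless that reserve is marked failfast, retry the reservation without flushing and finally dip into the global reserve. A rough stand-alone sketch of that ordering, with stub helpers standing in for block_rsv_use_bytes() and reserve_metadata_bytes(..., BTRFS_RESERVE_NO_FLUSH); the stub return values are arbitrary:

#include <errno.h>

/* stubs: 0 = success, -ENOSPC = nothing available */
static int use_trans_rsv(void)    { return -ENOSPC; }
static int use_global_rsv(void)   { return 0; }
static int reserve_no_flush(void) { return -ENOSPC; }

static int pick_block_rsv(int failfast)
{
        if (use_trans_rsv() == 0)       /* 1) the per-transaction reserve          */
                return 0;
        if (failfast)                   /* callers preferring ENOSPC to stalling   */
                return -ENOSPC;
        if (reserve_no_flush() == 0)    /* 2) top it up without starting writeback */
                return 0;
        if (use_global_rsv() == 0)      /* 3) last resort: the global reserve      */
                return 0;
        return -ENOSPC;
}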
 
 /*
@@ -6031,15 +6348,15 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                return ERR_CAST(block_rsv);
 
        ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
-                                  empty_size, hint, (u64)-1, &ins, 0);
+                                  empty_size, hint, &ins, 0);
        if (ret) {
-               unuse_block_rsv(block_rsv, blocksize);
+               unuse_block_rsv(root->fs_info, block_rsv, blocksize);
                return ERR_PTR(ret);
        }
 
        buf = btrfs_init_new_buffer(trans, root, ins.objectid,
                                    blocksize, level);
-       BUG_ON(IS_ERR(buf));
+       BUG_ON(IS_ERR(buf)); /* -ENOMEM */
 
        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
                if (parent == 0)
@@ -6051,7 +6368,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
                struct btrfs_delayed_extent_op *extent_op;
                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-               BUG_ON(!extent_op);
+               BUG_ON(!extent_op); /* -ENOMEM */
                if (key)
                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
                else
@@ -6061,11 +6378,12 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                extent_op->update_flags = 1;
                extent_op->is_data = 0;
 
-               ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+               ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+                                       ins.objectid,
                                        ins.offset, parent, root_objectid,
                                        level, BTRFS_ADD_DELAYED_EXTENT,
-                                       extent_op);
-               BUG_ON(ret);
+                                       extent_op, 0);
+               BUG_ON(ret); /* -ENOMEM */
        }
        return buf;
 }
@@ -6081,6 +6399,7 @@ struct walk_control {
        int keep_locks;
        int reada_slot;
        int reada_count;
+       int for_reloc;
 };
 
 #define DROP_REFERENCE 1
@@ -6134,7 +6453,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
                /* We don't lock the tree block, it's OK to be racy here */
                ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
                                               &refs, &flags);
-               BUG_ON(ret);
+               /* We don't care about errors in readahead. */
+               if (ret < 0)
+                       continue;
                BUG_ON(refs == 0);
 
                if (wc->stage == DROP_REFERENCE) {
@@ -6201,7 +6522,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                                               eb->start, eb->len,
                                               &wc->refs[level],
                                               &wc->flags[level]);
-               BUG_ON(ret);
+               BUG_ON(ret == -ENOMEM);
+               if (ret)
+                       return ret;
                BUG_ON(wc->refs[level] == 0);
        }
 
@@ -6219,13 +6542,13 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
        /* wc->stage == UPDATE_BACKREF */
        if (!(wc->flags[level] & flag)) {
                BUG_ON(!path->locks[level]);
-               ret = btrfs_inc_ref(trans, root, eb, 1);
-               BUG_ON(ret);
-               ret = btrfs_dec_ref(trans, root, eb, 0);
-               BUG_ON(ret);
+               ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+               BUG_ON(ret); /* -ENOMEM */
+               ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+               BUG_ON(ret); /* -ENOMEM */
                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
                                                  eb->len, flag, 0);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
                wc->flags[level] |= flag;
        }
 
@@ -6297,7 +6620,11 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
                                       &wc->refs[level - 1],
                                       &wc->flags[level - 1]);
-       BUG_ON(ret);
+       if (ret < 0) {
+               btrfs_tree_unlock(next);
+               return ret;
+       }
+
        BUG_ON(wc->refs[level - 1] == 0);
        *lookup_info = 0;
 
@@ -6326,7 +6653,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                        goto skip;
        }
 
-       if (!btrfs_buffer_uptodate(next, generation)) {
+       if (!btrfs_buffer_uptodate(next, generation, 0)) {
                btrfs_tree_unlock(next);
                free_extent_buffer(next);
                next = NULL;
@@ -6365,8 +6692,8 @@ skip:
                }
 
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-                                       root->root_key.objectid, level - 1, 0);
-               BUG_ON(ret);
+                               root->root_key.objectid, level - 1, 0, 0);
+               BUG_ON(ret); /* -ENOMEM */
        }
        btrfs_tree_unlock(next);
        free_extent_buffer(next);
@@ -6424,7 +6751,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                                                       eb->start, eb->len,
                                                       &wc->refs[level],
                                                       &wc->flags[level]);
-                       BUG_ON(ret);
+                       if (ret < 0) {
+                               btrfs_tree_unlock_rw(eb, path->locks[level]);
+                               return ret;
+                       }
                        BUG_ON(wc->refs[level] == 0);
                        if (wc->refs[level] == 1) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
@@ -6439,10 +6769,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
        if (wc->refs[level] == 1) {
                if (level == 0) {
                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-                               ret = btrfs_dec_ref(trans, root, eb, 1);
+                               ret = btrfs_dec_ref(trans, root, eb, 1,
+                                                   wc->for_reloc);
                        else
-                               ret = btrfs_dec_ref(trans, root, eb, 0);
-                       BUG_ON(ret);
+                               ret = btrfs_dec_ref(trans, root, eb, 0,
+                                                   wc->for_reloc);
+                       BUG_ON(ret); /* -ENOMEM */
                }
                /* make block locked assertion in clean_tree_block happy */
                if (!path->locks[level] &&
@@ -6551,8 +6883,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
  * also make sure backrefs for the shared block and all lower level
  * blocks are properly updated.
  */
-void btrfs_drop_snapshot(struct btrfs_root *root,
-                        struct btrfs_block_rsv *block_rsv, int update_ref)
+int btrfs_drop_snapshot(struct btrfs_root *root,
+                        struct btrfs_block_rsv *block_rsv, int update_ref,
+                        int for_reloc)
 {
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
@@ -6578,7 +6911,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
        }
 
        trans = btrfs_start_transaction(tree_root, 0);
-       BUG_ON(IS_ERR(trans));
+       if (IS_ERR(trans)) {
+               err = PTR_ERR(trans);
+               goto out_free;
+       }
 
        if (block_rsv)
                trans->block_rsv = block_rsv;
@@ -6603,7 +6939,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
                path->lowest_level = 0;
                if (ret < 0) {
                        err = ret;
-                       goto out_free;
+                       goto out_end_trans;
                }
                WARN_ON(ret > 0);
 
@@ -6623,7 +6959,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
                                                path->nodes[level]->len,
                                                &wc->refs[level],
                                                &wc->flags[level]);
-                       BUG_ON(ret);
+                       if (ret < 0) {
+                               err = ret;
+                               goto out_end_trans;
+                       }
                        BUG_ON(wc->refs[level] == 0);
 
                        if (level == root_item->drop_level)
@@ -6640,6 +6979,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
        wc->stage = DROP_REFERENCE;
        wc->update_ref = update_ref;
        wc->keep_locks = 0;
+       wc->for_reloc = for_reloc;
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
@@ -6673,26 +7013,40 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
                        ret = btrfs_update_root(trans, tree_root,
                                                &root->root_key,
                                                root_item);
-                       BUG_ON(ret);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, tree_root, ret);
+                               err = ret;
+                               goto out_end_trans;
+                       }
 
                        btrfs_end_transaction_throttle(trans, tree_root);
                        trans = btrfs_start_transaction(tree_root, 0);
-                       BUG_ON(IS_ERR(trans));
+                       if (IS_ERR(trans)) {
+                               err = PTR_ERR(trans);
+                               goto out_free;
+                       }
                        if (block_rsv)
                                trans->block_rsv = block_rsv;
                }
        }
        btrfs_release_path(path);
-       BUG_ON(err);
+       if (err)
+               goto out_end_trans;
 
        ret = btrfs_del_root(trans, tree_root, &root->root_key);
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_abort_transaction(trans, tree_root, ret);
+               goto out_end_trans;
+       }
 
        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
                ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
                                           NULL, NULL);
-               BUG_ON(ret < 0);
-               if (ret > 0) {
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, tree_root, ret);
+                       err = ret;
+                       goto out_end_trans;
+               } else if (ret > 0) {
                        /* if we fail to delete the orphan item this time
                         * around, it'll get picked up the next time.
                         *
@@ -6710,20 +7064,22 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
                free_extent_buffer(root->commit_root);
                kfree(root);
        }
-out_free:
+out_end_trans:
        btrfs_end_transaction_throttle(trans, tree_root);
+out_free:
        kfree(wc);
        btrfs_free_path(path);
 out:
        if (err)
                btrfs_std_error(root->fs_info, err);
-       return;
+       return err;
 }
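
btrfs_drop_snapshot() now reports errors instead of BUG()ing and unwinds through staggered labels: out_end_trans ends the running transaction, out_free releases the walk_control and path, and out turns any error into btrfs_std_error() plus the return value. The shape of that unwinding as a minimal user-space sketch, with malloc/free standing in for the btrfs allocations and the transaction:

#include <errno.h>
#include <stdlib.h>

static int do_walk(void) { return 0; }   /* stand-in for the walk loop */

static int drop_something(void)
{
        void *path, *trans;
        int err = 0;

        path = malloc(64);               /* btrfs_alloc_path() stand-in        */
        if (!path)
                return -ENOMEM;

        trans = malloc(64);              /* btrfs_start_transaction() stand-in */
        if (!trans) {
                err = -ENOMEM;
                goto out_free;           /* no transaction to end yet          */
        }

        err = do_walk();                 /* failures fall through the cleanups */

        free(trans);                     /* "out_end_trans": end the transaction */
out_free:
        free(path);                      /* "out_free": free wc and the path     */
        return err;                      /* "out": report err to the caller      */
}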
 
 /*
  * drop subtree rooted at tree block 'node'.
  *
  * NOTE: this function will unlock and release tree block 'node'
+ * only used by the relocation code
  */
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
@@ -6768,6 +7124,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
        wc->stage = DROP_REFERENCE;
        wc->update_ref = 0;
        wc->keep_locks = 1;
+       wc->for_reloc = 1;
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
@@ -6792,8 +7149,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
        u64 num_devices;
-       u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
-               BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+       u64 stripped;
+
+       /*
+        * if restripe for this chunk_type is on, pick the target profile
+        * and return it; otherwise fall through to the usual conversion
+        */
+       stripped = get_restripe_target(root->fs_info, flags);
+       if (stripped)
+               return extended_to_chunk(stripped);
 
        /*
         * we add in the count of missing devices because we want
@@ -6803,6 +7167,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        num_devices = root->fs_info->fs_devices->rw_devices +
                root->fs_info->fs_devices->missing_devices;
 
+       stripped = BTRFS_BLOCK_GROUP_RAID0 |
+               BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
        if (num_devices == 1) {
                stripped |= BTRFS_BLOCK_GROUP_DUP;
                stripped = flags & ~stripped;
@@ -6815,7 +7182,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
                             BTRFS_BLOCK_GROUP_RAID10))
                        return stripped | BTRFS_BLOCK_GROUP_DUP;
-               return flags;
        } else {
                /* they already had raid on here, just return */
                if (flags & stripped)
@@ -6828,9 +7194,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
                if (flags & BTRFS_BLOCK_GROUP_DUP)
                        return stripped | BTRFS_BLOCK_GROUP_RAID1;
 
-               /* turn single device chunks into raid0 */
-               return stripped | BTRFS_BLOCK_GROUP_RAID0;
+               /* this is drive concat, leave it alone */
        }
+
        return flags;
 }
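
After this change a restripe target queued by balance takes precedence in update_block_group_flags(): if balance is converting this chunk type, the target profile is returned directly (via extended_to_chunk()), and only otherwise is the profile degraded based on the number of usable devices. A simplified sketch of that precedence only, not of the full conversion table; the flag values here are made up for illustration:

#include <stdint.h>

#define SK_RAID_BITS 0x7ULL        /* pretend RAID0|RAID1|RAID10 bits */
#define SK_DUP       0x8ULL        /* pretend DUP bit                 */

static uint64_t pick_profile(uint64_t restripe_target, uint64_t flags,
                             unsigned int num_devices)
{
        if (restripe_target)            /* balance is converting this chunk type: */
                return restripe_target; /* use its target, skip the guessing      */

        if (num_devices == 1)           /* single device: nothing redundant fits  */
                return flags & ~(SK_RAID_BITS | SK_DUP);

        return flags;                   /* enough devices: keep the profile as-is */
}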
 
@@ -6889,18 +7255,22 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        BUG_ON(cache->ro);
 
        trans = btrfs_join_transaction(root);
-       BUG_ON(IS_ERR(trans));
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
 
        alloc_flags = update_block_group_flags(root, cache->flags);
-       if (alloc_flags != cache->flags)
-               do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
-                              CHUNK_ALLOC_FORCE);
+       if (alloc_flags != cache->flags) {
+               ret = do_chunk_alloc(trans, root, alloc_flags,
+                                    CHUNK_ALLOC_FORCE);
+               if (ret < 0)
+                       goto out;
+       }
 
        ret = set_block_group_ro(cache, 0);
        if (!ret)
                goto out;
        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
-       ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+       ret = do_chunk_alloc(trans, root, alloc_flags,
                             CHUNK_ALLOC_FORCE);
        if (ret < 0)
                goto out;
@@ -6914,7 +7284,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 type)
 {
        u64 alloc_flags = get_alloc_profile(root, type);
-       return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+       return do_chunk_alloc(trans, root, alloc_flags,
                              CHUNK_ALLOC_FORCE);
 }
 
@@ -6974,7 +7344,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
        return free_bytes;
 }
 
-int btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_set_block_group_rw(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
@@ -6990,7 +7360,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root,
        cache->ro = 0;
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
-       return 0;
 }
 
 /*
@@ -7008,6 +7377,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
        u64 min_free;
        u64 dev_min = 1;
        u64 dev_nr = 0;
+       u64 target;
        int index;
        int full = 0;
        int ret = 0;
@@ -7048,13 +7418,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
        /*
         * ok we don't have enough space, but maybe we have free space on our
         * devices to allocate new chunks for relocation, so loop through our
-        * alloc devices and guess if we have enough space.  However, if we
-        * were marked as full, then we know there aren't enough chunks, and we
-        * can just return.
+        * alloc devices and guess if we have enough space.  if this block
+        * group is going to be restriped, run checks against the target
+        * profile instead of the current one.
         */
        ret = -1;
-       if (full)
-               goto out;
 
        /*
         * index:
@@ -7064,7 +7432,20 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
         *      3: raid0
         *      4: single
         */
-       index = get_block_group_index(block_group);
+       target = get_restripe_target(root->fs_info, block_group->flags);
+       if (target) {
+               index = __get_block_group_index(extended_to_chunk(target));
+       } else {
+               /*
+                * this is just a balance, so if we were marked as full
+                * we know there is no space for a new chunk
+                */
+               if (full)
+                       goto out;
+
+               index = get_block_group_index(block_group);
+       }
+
        if (index == 0) {
                dev_min = 4;
                /* Divide by 2 */
@@ -7088,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                 * space to fit our block group in.
                 */
                if (device->total_bytes > device->bytes_used + min_free) {
-                       ret = find_free_dev_extent(NULL, device, min_free,
+                       ret = find_free_dev_extent(device, min_free,
                                                   &dev_offset, NULL);
                        if (!ret)
                                dev_nr++;
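
btrfs_can_relocate() now sizes its feasibility check against the restripe target when one is queued, and only trusts the block group's "full" flag for a plain balance. The loop above then counts devices that could still hold a min_free stripe, presumably stopping once dev_min such devices have been found. A self-contained sketch of that per-device walk; struct dev_space and all names are illustrative:

#include <stdint.h>

struct dev_space { uint64_t total; uint64_t used; };

static int can_relocate(const struct dev_space *devs, int ndevs,
                        uint64_t min_free, int dev_min)
{
        int dev_nr = 0;

        for (int i = 0; i < ndevs; i++) {
                if (devs[i].total > devs[i].used + min_free)
                        dev_nr++;       /* this device could take a stripe   */
                if (dev_nr >= dev_min)
                        return 1;       /* enough devices: relocation can fit */
        }
        return 0;
}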
@@ -7312,8 +7693,21 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
 
-               if (need_clear)
+               if (need_clear) {
+                       /*
+                        * When we mount with old space cache, we need to
+                        * set BTRFS_DC_CLEAR and set dirty flag.
+                        *
+                        * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+                        *    truncate the old free space cache inode and
+                        *    set up a new one.
+                        * b) Setting 'dirty flag' makes sure that we flush
+                        *    the new space cache info onto disk.
+                        */
                        cache->disk_cache_state = BTRFS_DC_CLEAR;
+                       if (btrfs_test_opt(root, SPACE_CACHE))
+                               cache->dirty = 1;
+               }
 
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
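
The need_clear branch above is what makes mounting with space_cache over an old or stale cache safe: BTRFS_DC_CLEAR causes the old free-space-cache inode to be truncated and a new one set up, and marking the block group dirty forces the rebuilt cache to be written out. A tiny sketch of the two flags being set together, assuming a simplified block-group structure invented for the example:

enum cache_state { DC_WRITTEN = 0, DC_CLEAR, DC_SETUP };

struct bg {
        enum cache_state disk_cache_state;
        int dirty;
};

/* On mount over an old space cache: schedule the on-disk cache for
 * truncation and make sure the freshly built one gets flushed back out. */
static void invalidate_space_cache(struct bg *bg, int space_cache_enabled)
{
        bg->disk_cache_state = DC_CLEAR;   /* a) truncate + set up a new inode  */
        if (space_cache_enabled)
                bg->dirty = 1;             /* b) write the rebuilt cache later  */
}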
@@ -7358,7 +7752,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                ret = update_space_info(info, cache->flags, found_key.offset,
                                        btrfs_block_group_used(&cache->item),
                                        &space_info);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
                cache->space_info = space_info;
                spin_lock(&cache->space_info->lock);
                cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7367,7 +7761,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                __link_block_group(space_info, cache);
 
                ret = btrfs_add_block_group_cache(root->fs_info, cache);
-               BUG_ON(ret);
+               BUG_ON(ret); /* Logic error */
 
                set_avail_alloc_bits(root->fs_info, cache->flags);
                if (btrfs_chunk_readonly(root, cache->key.objectid))
@@ -7397,6 +7791,34 @@ error:
        return ret;
 }
 
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root)
+{
+       struct btrfs_block_group_cache *block_group, *tmp;
+       struct btrfs_root *extent_root = root->fs_info->extent_root;
+       struct btrfs_block_group_item item;
+       struct btrfs_key key;
+       int ret = 0;
+
+       list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+                                new_bg_list) {
+               list_del_init(&block_group->new_bg_list);
+
+               if (ret)
+                       continue;
+
+               spin_lock(&block_group->lock);
+               memcpy(&item, &block_group->item, sizeof(item));
+               memcpy(&key, &block_group->key, sizeof(key));
+               spin_unlock(&block_group->lock);
+
+               ret = btrfs_insert_item(trans, extent_root, &key, &item,
+                                       sizeof(item));
+               if (ret)
+                       btrfs_abort_transaction(trans, extent_root, ret);
+       }
+}
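
btrfs_create_pending_block_groups() is the flush side of a new two-step scheme: btrfs_make_block_group() below no longer inserts the block group item directly but queues the group on trans->new_bgs, and this helper later drains that list in one pass, inserting one item per group. A stand-alone sketch of the queue-then-drain pattern using a singly linked list; all names here are invented for the sketch:

#include <stdio.h>
#include <stdlib.h>

struct pending_bg {
        int id;                          /* stands in for the block group item */
        struct pending_bg *next;
};

static struct pending_bg *new_bgs;       /* stands in for trans->new_bgs */

static void queue_block_group(int id)    /* btrfs_make_block_group() side */
{
        struct pending_bg *bg = malloc(sizeof(*bg));

        if (!bg)
                return;
        bg->id = id;
        bg->next = new_bgs;
        new_bgs = bg;
}

static void create_pending_block_groups(void)  /* later drain pass */
{
        while (new_bgs) {
                struct pending_bg *bg = new_bgs;

                new_bgs = bg->next;
                printf("insert block group item %d\n", bg->id);
                free(bg);
        }
}

int main(void)
{
        queue_block_group(1);
        queue_block_group(2);
        create_pending_block_groups();
        return 0;
}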
+
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, u64 bytes_used,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7430,6 +7852,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        spin_lock_init(&cache->lock);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
+       INIT_LIST_HEAD(&cache->new_bg_list);
 
        btrfs_init_free_space_ctl(cache);
 
@@ -7449,7 +7872,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                &cache->space_info);
-       BUG_ON(ret);
+       BUG_ON(ret); /* -ENOMEM */
+       update_global_block_rsv(root->fs_info);
 
        spin_lock(&cache->space_info->lock);
        cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7458,17 +7882,28 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        __link_block_group(cache->space_info, cache);
 
        ret = btrfs_add_block_group_cache(root->fs_info, cache);
-       BUG_ON(ret);
+       BUG_ON(ret); /* Logic error */
 
-       ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
-                               sizeof(cache->item));
-       BUG_ON(ret);
+       list_add_tail(&cache->new_bg_list, &trans->new_bgs);
 
        set_avail_alloc_bits(extent_root->fs_info, type);
 
        return 0;
 }
 
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+       u64 extra_flags = chunk_to_extended(flags) &
+                               BTRFS_EXTENDED_PROFILE_MASK;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits &= ~extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start)
 {
@@ -7479,6 +7914,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
        struct inode *inode;
        int ret;
+       int index;
        int factor;
 
        root = root->fs_info->extent_root;
@@ -7494,6 +7930,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        free_excluded_extents(root, block_group);
 
        memcpy(&key, &block_group->key, sizeof(key));
+       index = get_block_group_index(block_group);
        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
                                  BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10))
@@ -7525,7 +7962,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        inode = lookup_free_space_inode(tree_root, block_group, path);
        if (!IS_ERR(inode)) {
                ret = btrfs_orphan_add(trans, inode);
-               BUG_ON(ret);
+               if (ret) {
+                       btrfs_add_delayed_iput(inode);
+                       goto out;
+               }
                clear_nlink(inode);
                /* One for the block groups ref */
                spin_lock(&block_group->lock);
@@ -7568,6 +8008,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
+       if (list_empty(&block_group->space_info->block_groups[index]))
+               clear_avail_alloc_bits(root->fs_info, block_group->flags);
        up_write(&block_group->space_info->groups_sem);
 
        if (block_group->cached == BTRFS_CACHE_STARTED)
@@ -7657,9 +8099,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
        u64 start;
        u64 end;
        u64 trimmed = 0;
+       u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        int ret = 0;
 
-       cache = btrfs_lookup_block_group(fs_info, range->start);
+       /*
+        * try to trim all FS space; the first block group may not start at zero.
+        */
+       if (range->len == total_bytes)
+               cache = btrfs_lookup_first_block_group(fs_info, range->start);
+       else
+               cache = btrfs_lookup_block_group(fs_info, range->start);
 
        while (cache) {
                if (cache->key.objectid >= (range->start + range->len)) {