Btrfs: process the delayed reference queue in clusters
Chris Mason [Fri, 13 Mar 2009 14:17:05 +0000 (10:17 -0400)]
The delayed reference queue maintains pending operations that need to
be done to the extent allocation tree.  These are processed by
finding records in the tree that are not currently being processed one at
a time.

This is slow because it uses lots of time searching through the rbtree
and because it creates lock contention on the extent allocation tree
when lots of different procs are running delayed refs at the same time.

This commit changes things to grab a cluster of refs for processing,
using a cursor into the rbtree as the starting point of the next search.
This way we walk smoothly through the rbtree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

fs/btrfs/ctree.h
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/transaction.c

index ced5fd8..08d9f8d 100644 (file)
@@ -720,6 +720,7 @@ struct btrfs_fs_info {
        struct mutex drop_mutex;
        struct mutex volume_mutex;
        struct mutex tree_reloc_mutex;
+
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
index 3e7eeaf..759fa24 100644 (file)
@@ -93,7 +93,8 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
  * ref if it was able to find one, or NULL if nothing was in that spot
  */
 static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
-                                                 u64 bytenr, u64 parent)
+                                 u64 bytenr, u64 parent,
+                                 struct btrfs_delayed_ref_node **last)
 {
        struct rb_node *n = root->rb_node;
        struct btrfs_delayed_ref_node *entry;
@@ -102,6 +103,8 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
        while (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                WARN_ON(!entry->in_tree);
+               if (last)
+                       *last = entry;
 
                cmp = comp_entry(entry, bytenr, parent);
                if (cmp < 0)
@@ -114,45 +117,99 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
        return NULL;
 }
 
-/*
- * Locking on delayed refs is done by taking a lock on the head node,
- * which has the (impossible) parent id of (u64)-1.  Once a lock is held
- * on the head node, you're allowed (and required) to process all the
- * delayed refs for a given byte number in the tree.
- *
- * This will walk forward in the rbtree until it finds a head node it
- * is able to lock.  It might not lock the delayed ref you asked for,
- * and so it will return the one it did lock in next_ret and return 0.
- *
- * If no locks are taken, next_ret is set to null and 1 is returned.  This
- * means there are no more unlocked head nodes in the rbtree.
- */
-int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
-                          struct btrfs_delayed_ref_node *ref,
-                          struct btrfs_delayed_ref_head **next_ret)
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+                          struct btrfs_delayed_ref_head *head)
 {
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       assert_spin_locked(&delayed_refs->lock);
+       if (mutex_trylock(&head->mutex))
+               return 0;
+
+       atomic_inc(&head->node.refs);
+       spin_unlock(&delayed_refs->lock);
+
+       mutex_lock(&head->mutex);
+       spin_lock(&delayed_refs->lock);
+       if (!head->node.in_tree) {
+               mutex_unlock(&head->mutex);
+               btrfs_put_delayed_ref(&head->node);
+               return -EAGAIN;
+       }
+       btrfs_put_delayed_ref(&head->node);
+       return 0;
+}
+
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+                          struct list_head *cluster, u64 start)
+{
+       int count = 0;
+       struct btrfs_delayed_ref_root *delayed_refs;
        struct rb_node *node;
+       struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *head;
-       int ret = 0;
 
-       while (1) {
+       delayed_refs = &trans->transaction->delayed_refs;
+       if (start == 0) {
+               node = rb_first(&delayed_refs->root);
+       } else {
+               ref = NULL;
+               tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+               if (ref) {
+                       struct btrfs_delayed_ref_node *tmp;
+
+                       node = rb_prev(&ref->rb_node);
+                       while (node) {
+                               tmp = rb_entry(node,
+                                              struct btrfs_delayed_ref_node,
+                                              rb_node);
+                               if (tmp->bytenr < start)
+                                       break;
+                               ref = tmp;
+                               node = rb_prev(&ref->rb_node);
+                       }
+                       node = &ref->rb_node;
+               } else
+                       node = rb_first(&delayed_refs->root);
+       }
+again:
+       while (node && count < 32) {
+               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
                if (btrfs_delayed_ref_is_head(ref)) {
                        head = btrfs_delayed_node_to_head(ref);
-                       if (mutex_trylock(&head->mutex)) {
-                               *next_ret = head;
-                               ret = 0;
+                       if (list_empty(&head->cluster)) {
+                               list_add_tail(&head->cluster, cluster);
+                               delayed_refs->run_delayed_start =
+                                       head->node.bytenr;
+                               count++;
+
+                               WARN_ON(delayed_refs->num_heads_ready == 0);
+                               delayed_refs->num_heads_ready--;
+                       } else if (count) {
+                               /* the goal of the clustering is to find extents
+                                * that are likely to end up in the same extent
+                                * leaf on disk.  So, we don't want them spread
+                                * all over the tree.  Stop now if we've hit
+                                * a head that was already in use
+                                */
                                break;
                        }
                }
-               node = rb_next(&ref->rb_node);
-               if (!node) {
-                       ret = 1;
-                       *next_ret = NULL;
-                       break;
-               }
-               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+               node = rb_next(node);
        }
-       return ret;
+       if (count) {
+               return 0;
+       } else if (start) {
+               /*
+                * we've gone to the end of the rbtree without finding any
+                * clusters.  start from the beginning and try again
+                */
+               start = 0;
+               node = rb_first(&delayed_refs->root);
+               goto again;
+       }
+       return 1;
 }
 
 /*
@@ -178,7 +235,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
 
-       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
        if (ref) {
                prev_node = rb_prev(&ref->rb_node);
                if (!prev_node)
@@ -240,7 +297,7 @@ again:
        }
 
        spin_lock(&delayed_refs->lock);
-       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
        if (ref) {
                head = btrfs_delayed_node_to_head(ref);
                if (mutex_trylock(&head->mutex)) {
@@ -384,7 +441,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_ref *full_ref;
-       struct btrfs_delayed_ref_head *head_ref;
+       struct btrfs_delayed_ref_head *head_ref = NULL;
        struct btrfs_delayed_ref_root *delayed_refs;
        int count_mod = 1;
        int must_insert_reserved = 0;
@@ -428,6 +485,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
        if (btrfs_delayed_ref_is_head(ref)) {
                head_ref = btrfs_delayed_node_to_head(ref);
                head_ref->must_insert_reserved = must_insert_reserved;
+               INIT_LIST_HEAD(&head_ref->cluster);
                mutex_init(&head_ref->mutex);
        } else {
                full_ref = btrfs_delayed_node_to_ref(ref);
@@ -453,6 +511,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
                 */
                kfree(ref);
        } else {
+               if (btrfs_delayed_ref_is_head(ref)) {
+                       delayed_refs->num_heads++;
+                       delayed_refs->num_heads_ready++;
+               }
                delayed_refs->num_entries++;
                trans->delayed_ref_updates++;
        }
@@ -522,7 +584,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
        struct btrfs_delayed_ref_root *delayed_refs;
 
        delayed_refs = &trans->transaction->delayed_refs;
-       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
        if (ref)
                return btrfs_delayed_node_to_head(ref);
        return NULL;
index c345fee..57153fc 100644 (file)
@@ -68,6 +68,8 @@ struct btrfs_delayed_ref_head {
         */
        struct mutex mutex;
 
+       struct list_head cluster;
+
        /*
         * when a new extent is allocated, it is just reserved in memory
         * The actual extent isn't inserted into the extent allocation tree
@@ -115,12 +117,20 @@ struct btrfs_delayed_ref_root {
         */
        unsigned long num_entries;
 
+       /* total number of head nodes in tree */
+       unsigned long num_heads;
+
+       /* total number of head nodes ready for processing */
+       unsigned long num_heads_ready;
+
        /*
         * set when the tree is flushing before a transaction commit,
         * used by the throttling code to decide if new updates need
         * to be run right away
         */
        int flushing;
+
+       u64 run_delayed_start;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -140,9 +150,6 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
-                          struct btrfs_delayed_ref_node *ref,
-                          struct btrfs_delayed_ref_head **next_ret);
 int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 bytenr,
                            u64 num_bytes, u32 *refs);
@@ -151,6 +158,10 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
                          u64 parent, u64 orig_ref_root, u64 ref_root,
                          u64 orig_ref_generation, u64 ref_generation,
                          u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+                          struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+                          struct list_head *cluster, u64 search_start);
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
index 4f43e22..1f1d89b 100644 (file)
@@ -1458,7 +1458,6 @@ static int transaction_kthread(void *arg)
        struct btrfs_root *root = arg;
        struct btrfs_trans_handle *trans;
        struct btrfs_transaction *cur;
-       struct btrfs_fs_info *info = root->fs_info;
        unsigned long now;
        unsigned long delay;
        int ret;
@@ -1481,24 +1480,8 @@ static int transaction_kthread(void *arg)
 
                now = get_seconds();
                if (now < cur->start_time || now - cur->start_time < 30) {
-                       unsigned long num_delayed;
-                       num_delayed = cur->delayed_refs.num_entries;
                        mutex_unlock(&root->fs_info->trans_mutex);
                        delay = HZ * 5;
-
-                       /*
-                        * we may have been woken up early to start
-                        * processing the delayed extent ref updates
-                        * If so, run some of them and then loop around again
-                        * to see if we need to force a commit
-                        */
-                       if (num_delayed > 64) {
-                               mutex_unlock(&info->transaction_kthread_mutex);
-                               trans = btrfs_start_transaction(root, 1);
-                               btrfs_run_delayed_refs(trans, root, 256);
-                               btrfs_end_transaction(trans, root);
-                               continue;
-                       }
                        goto sleep;
                }
                mutex_unlock(&root->fs_info->trans_mutex);
index 8471c79..3b8b6c2 100644 (file)
@@ -926,52 +926,41 @@ again:
        return NULL;
 }
 
-/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far.  count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
- */
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, unsigned long count)
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct list_head *cluster)
 {
-       struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        int ret;
+       int count = 0;
        int must_insert_reserved = 0;
-       int run_all = count == (unsigned long)-1;
-
-       if (root == root->fs_info->extent_root)
-               root = root->fs_info->tree_root;
 
        delayed_refs = &trans->transaction->delayed_refs;
-again:
-       spin_lock(&delayed_refs->lock);
-       if (count == 0)
-               count = delayed_refs->num_entries;
        while (1) {
                if (!locked_ref) {
-                       /*
-                        * no locked ref, go find something we can
-                        * process in the rbtree.  We start at
-                        * the beginning of the tree, there may be less
-                        * lock contention if we do something smarter here.
-                        */
-                       node = rb_first(&delayed_refs->root);
-                       if (!node) {
-                               spin_unlock(&delayed_refs->lock);
+                       /* pick a new head ref from the cluster list */
+                       if (list_empty(cluster))
                                break;
-                       }
 
-                       ref = rb_entry(node, struct btrfs_delayed_ref_node,
-                                      rb_node);
-                       ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref);
-                       if (ret) {
-                               spin_unlock(&delayed_refs->lock);
-                               break;
+                       locked_ref = list_entry(cluster->next,
+                                    struct btrfs_delayed_ref_head, cluster);
+
+                       /* grab the lock that says we are going to process
+                        * all the refs for this head */
+                       ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+                       /*
+                        * we may have dropped the spin lock to get the head
+                        * mutex lock, and that might have given someone else
+                        * time to free the head.  If that's true, it has been
+                        * removed from our list and we can move on.
+                        */
+                       if (ret == -EAGAIN) {
+                               locked_ref = NULL;
+                               count++;
+                               continue;
                        }
                }
 
@@ -986,7 +975,6 @@ again:
                 * locked_ref is the head node, so we have to go one
                 * node back for any delayed ref updates
                 */
-
                ref = select_delayed_ref(locked_ref);
                if (!ref) {
                        /* All delayed refs have been processed, Go ahead
@@ -994,6 +982,7 @@ again:
                         * so that any accounting fixes can happen
                         */
                        ref = &locked_ref->node;
+                       list_del_init(&locked_ref->cluster);
                        locked_ref = NULL;
                }
 
@@ -1007,30 +996,72 @@ again:
                BUG_ON(ret);
                btrfs_put_delayed_ref(ref);
 
-               /* once we lock the head ref, we have to process all the
-                * entries for it.  So, we might end up doing more entries
-                * that count was asking us to do.
-                */
-               if (count > 0)
-                       count--;
+               count++;
+               cond_resched();
+               spin_lock(&delayed_refs->lock);
+       }
+       return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, unsigned long count)
+{
+       struct rb_node *node;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct list_head cluster;
+       int ret;
+       int run_all = count == (unsigned long)-1;
+       int run_most = 0;
+
+       if (root == root->fs_info->extent_root)
+               root = root->fs_info->tree_root;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       INIT_LIST_HEAD(&cluster);
+again:
+       spin_lock(&delayed_refs->lock);
+       if (count == 0) {
+               count = delayed_refs->num_entries * 2;
+               run_most = 1;
+       }
+       while (1) {
+               if (!(run_all || run_most) &&
+                   delayed_refs->num_heads_ready < 64)
+                       break;
 
                /*
-                * we set locked_ref to null above if we're all done
-                * with this bytenr
+                * go find something we can process in the rbtree.  We start at
+                * the beginning of the tree, and then build a cluster
+                * of refs to process starting at the first one we are able to
+                * lock
                 */
-               if (!locked_ref && count == 0)
+               ret = btrfs_find_ref_cluster(trans, &cluster,
+                                            delayed_refs->run_delayed_start);
+               if (ret)
                        break;
 
-               cond_resched();
-               spin_lock(&delayed_refs->lock);
+               ret = run_clustered_refs(trans, root, &cluster);
+               BUG_ON(ret < 0);
+
+               count -= min_t(unsigned long, ret, count);
+
+               if (count == 0)
+                       break;
        }
+
        if (run_all) {
-               spin_lock(&delayed_refs->lock);
                node = rb_first(&delayed_refs->root);
-               if (!node) {
-                       spin_unlock(&delayed_refs->lock);
+               if (!node)
                        goto out;
-               }
+               count = (unsigned long)-1;
 
                while (node) {
                        ref = rb_entry(node, struct btrfs_delayed_ref_node,
@@ -1052,11 +1083,11 @@ again:
                        node = rb_next(node);
                }
                spin_unlock(&delayed_refs->lock);
-               count = (unsigned long)-1;
                schedule_timeout(1);
                goto again;
        }
 out:
+       spin_unlock(&delayed_refs->lock);
        return 0;
 }
 
@@ -2407,12 +2438,18 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
         */
        head->node.in_tree = 0;
        rb_erase(&head->node.rb_node, &delayed_refs->root);
+
        delayed_refs->num_entries--;
 
        /*
         * we don't take a ref on the node because we're removing it from the
         * tree, so we just steal the ref the tree was holding.
         */
+       delayed_refs->num_heads--;
+       if (list_empty(&head->cluster))
+               delayed_refs->num_heads_ready--;
+
+       list_del_init(&head->cluster);
        spin_unlock(&delayed_refs->lock);
 
        ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
@@ -3705,6 +3742,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
        struct btrfs_path *path;
        int i;
        int orig_level;
+       int update_count;
        struct btrfs_root_item *root_item = &root->root_item;
 
        WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -3746,6 +3784,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                }
        }
        while (1) {
+               unsigned long update;
                wret = walk_down_tree(trans, root, path, &level);
                if (wret > 0)
                        break;
@@ -3764,6 +3803,14 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                }
                atomic_inc(&root->fs_info->throttle_gen);
                wake_up(&root->fs_info->transaction_throttle);
+               for (update_count = 0; update_count < 16; update_count++) {
+                       update = trans->delayed_ref_updates;
+                       trans->delayed_ref_updates = 0;
+                       if (update)
+                               btrfs_run_delayed_refs(trans, root, update);
+                       else
+                               break;
+               }
        }
        for (i = 0; i <= orig_level; i++) {
                if (path->nodes[i]) {
index f94c2ad..903edab 100644 (file)
@@ -68,7 +68,10 @@ static noinline int join_transaction(struct btrfs_root *root)
 
                cur_trans->delayed_refs.root.rb_node = NULL;
                cur_trans->delayed_refs.num_entries = 0;
+               cur_trans->delayed_refs.num_heads_ready = 0;
+               cur_trans->delayed_refs.num_heads = 0;
                cur_trans->delayed_refs.flushing = 0;
+               cur_trans->delayed_refs.run_delayed_start = 0;
                spin_lock_init(&cur_trans->delayed_refs.lock);
 
                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
@@ -287,13 +290,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
        struct btrfs_transaction *cur_trans;
        struct btrfs_fs_info *info = root->fs_info;
-
-       if (trans->delayed_ref_updates &&
-           (trans->transaction->delayed_refs.flushing ||
-           trans->transaction->delayed_refs.num_entries > 16384)) {
-               btrfs_run_delayed_refs(trans, root, trans->delayed_ref_updates);
-       } else if (trans->transaction->delayed_refs.num_entries > 64) {
-               wake_up_process(root->fs_info->transaction_kthread);
+       int count = 0;
+
+       while (count < 4) {
+               unsigned long cur = trans->delayed_ref_updates;
+               trans->delayed_ref_updates = 0;
+               if (cur &&
+                   trans->transaction->delayed_refs.num_heads_ready > 64) {
+                       trans->delayed_ref_updates = 0;
+                       btrfs_run_delayed_refs(trans, root, cur);
+               } else {
+                       break;
+               }
+               count++;
        }
 
        mutex_lock(&info->trans_mutex);
@@ -929,7 +938,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         */
        trans->transaction->delayed_refs.flushing = 1;
 
-       ret = btrfs_run_delayed_refs(trans, root, (u64)-1);
+       ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);
 
        INIT_LIST_HEAD(&dirty_fs_roots);