Btrfs: Add run time btree defrag, and an ioctl to force btree defrag
Chris Mason [Tue, 7 Aug 2007 20:15:09 +0000 (16:15 -0400)]
This adds two types of btree defrag, a run time form that tries to
defrag recently allocated blocks in the btree when they are still in ram,
and an ioctl that forces defrag of all btree blocks.

File data blocks are not defragged yet, but this can make a huge difference
in sequential btree reads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

fs/btrfs/Makefile
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ioctl.h
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-defrag.c [new file with mode: 0644]

index a4e2df6..9321438 100644 (file)
@@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),)
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           hash.o file-item.o inode-item.o inode-map.o disk-io.o \
-          transaction.o bit-radix.o inode.o file.o
+          transaction.o bit-radix.o inode.o file.o tree-defrag.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #        root-tree.o dir-item.o hash.o file-item.o inode-item.o \
index 7a08491..c7e47e7 100644 (file)
@@ -65,44 +65,44 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
        memset(p, 0, sizeof(*p));
 }
 
-static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
                           *root, struct buffer_head *buf, struct buffer_head
                           *parent, int parent_slot, struct buffer_head
-                          **cow_ret)
+                          **cow_ret, u64 search_start, u64 empty_size)
 {
        struct buffer_head *cow;
        struct btrfs_node *cow_node;
-       int ret;
+       int ret = 0;
+       int different_trans = 0;
 
+       WARN_ON(root->ref_cows && trans->transid != root->last_trans);
        WARN_ON(!buffer_uptodate(buf));
-       if (trans->transaction != root->fs_info->running_transaction) {
-               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-                      root->fs_info->running_transaction->transid);
-               WARN_ON(1);
-       }
-       if (trans->transid != root->fs_info->generation) {
-               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-                      root->fs_info->generation);
-               WARN_ON(1);
-       }
-       if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
-                                   trans->transid) {
-               *cow_ret = buf;
-               return 0;
-       }
-       cow = btrfs_alloc_free_block(trans, root, buf->b_blocknr);
+       cow = btrfs_alloc_free_block(trans, root, search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
+
        cow_node = btrfs_buffer_node(cow);
        if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
                WARN_ON(1);
+
        memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
        btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
        btrfs_set_header_generation(&cow_node->header, trans->transid);
        btrfs_set_header_owner(&cow_node->header, root->root_key.objectid);
-       ret = btrfs_inc_ref(trans, root, buf);
-       if (ret)
-               return ret;
+
+       WARN_ON(btrfs_header_generation(btrfs_buffer_header(buf)) >
+               trans->transid);
+       if (btrfs_header_generation(btrfs_buffer_header(buf)) !=
+                                   trans->transid) {
+               different_trans = 1;
+               ret = btrfs_inc_ref(trans, root, buf);
+               if (ret)
+                       return ret;
+       } else {
+               WARN_ON(!root->ref_cows);
+               clean_tree_block(trans, root, buf);
+       }
+
        if (buf == root->node) {
                root->node = cow;
                get_bh(cow);
@@ -114,6 +114,8 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
                btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
                                        bh_blocknr(cow));
                btrfs_mark_buffer_dirty(parent);
+               WARN_ON(btrfs_header_generation(btrfs_buffer_header(parent)) !=
+                                   trans->transid);
                btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
        }
        btrfs_block_release(root, buf);
@@ -122,6 +124,115 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
        return 0;
 }
 
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+                          *root, struct buffer_head *buf, struct buffer_head
+                          *parent, int parent_slot, struct buffer_head
+                          **cow_ret)
+{
+       u64 search_start;
+       if (trans->transaction != root->fs_info->running_transaction) {
+               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+                      root->fs_info->running_transaction->transid);
+               WARN_ON(1);
+       }
+       if (trans->transid != root->fs_info->generation) {
+               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+                      root->fs_info->generation);
+               WARN_ON(1);
+       }
+       if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
+                                   trans->transid) {
+               *cow_ret = buf;
+               return 0;
+       }
+
+       search_start = bh_blocknr(buf) & ~((u64)65535);
+       return __btrfs_cow_block(trans, root, buf, parent,
+                                parent_slot, cow_ret, search_start, 0);
+}
+
+static int close_blocks(u64 blocknr, u64 other)
+{
+       if (blocknr < other && other - blocknr < 8)
+               return 1;
+       if (blocknr > other && blocknr - other < 8)
+               return 1;
+       return 0;
+}
+
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, struct buffer_head *parent,
+                      int cache_only)
+{
+       struct btrfs_node *parent_node;
+       struct buffer_head *cur_bh;
+       struct buffer_head *tmp_bh;
+       u64 blocknr;
+       u64 search_start = 0;
+       u64 other;
+       u32 parent_nritems;
+       int start_slot;
+       int end_slot;
+       int i;
+       int err = 0;
+
+       if (trans->transaction != root->fs_info->running_transaction) {
+               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+                      root->fs_info->running_transaction->transid);
+               WARN_ON(1);
+       }
+       if (trans->transid != root->fs_info->generation) {
+               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+                      root->fs_info->generation);
+               WARN_ON(1);
+       }
+       parent_node = btrfs_buffer_node(parent);
+       parent_nritems = btrfs_header_nritems(&parent_node->header);
+
+       start_slot = 0;
+       end_slot = parent_nritems;
+
+       if (parent_nritems == 1)
+               return 0;
+
+       for (i = start_slot; i < end_slot; i++) {
+               int close = 1;
+               blocknr = btrfs_node_blockptr(parent_node, i);
+               if (i > 0) {
+                       other = btrfs_node_blockptr(parent_node, i - 1);
+                       close = close_blocks(blocknr, other);
+               }
+               if (close && i < end_slot - 1) {
+                       other = btrfs_node_blockptr(parent_node, i + 1);
+                       close = close_blocks(blocknr, other);
+               }
+               if (close)
+                       continue;
+
+               cur_bh = btrfs_find_tree_block(root, blocknr);
+               if (!cur_bh || !buffer_uptodate(cur_bh) ||
+                   buffer_locked(cur_bh)) {
+                       if (cache_only) {
+                               brelse(cur_bh);
+                               continue;
+                       }
+                       brelse(cur_bh);
+                       cur_bh = read_tree_block(root, blocknr);
+               }
+               if (search_start == 0) {
+                       search_start = bh_blocknr(cur_bh) & ~((u64)65535);
+               }
+               err = __btrfs_cow_block(trans, root, cur_bh, parent, i,
+                                       &tmp_bh, search_start,
+                                       min(8, end_slot - i));
+               if (err)
+                       break;
+               search_start = bh_blocknr(tmp_bh);
+               brelse(tmp_bh);
+       }
+       return err;
+}
+
 /*
  * The leaf data grows from end-to-front in the node.
  * this returns the address of the start of the last item,
@@ -221,6 +332,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 
                parent_slot = path->slots[level + 1];
                parent_key = &parent->ptrs[parent_slot].key;
+
                BUG_ON(memcmp(parent_key, &leaf->items[0].key,
                       sizeof(struct btrfs_disk_key)));
                BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
@@ -643,7 +755,7 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  * readahead one full node of leaves
  */
 static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
-                            int slot)
+                            int level, int slot)
 {
        struct btrfs_node *node;
        int i;
@@ -659,10 +771,13 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
        unsigned long gang[8];
        struct buffer_head *bh;
 
-       if (!path->nodes[1])
+       if (level == 0)
+               return;
+
+       if (!path->nodes[level])
                return;
 
-       node = btrfs_buffer_node(path->nodes[1]);
+       node = btrfs_buffer_node(path->nodes[level]);
        search = btrfs_node_blockptr(node, slot);
        bh = btrfs_find_tree_block(root, search);
        if (bh) {
@@ -690,7 +805,7 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
                for (i = 0; i < ret; i++) {
                        blocknr = gang[i];
                        clear_radix_bit(&found, blocknr);
-                       if (nread > 64)
+                       if (nread > 32)
                                continue;
                        if (direction > 0 && cluster_start <= blocknr &&
                            cluster_start + 8 > blocknr) {
@@ -726,7 +841,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        struct buffer_head *b;
        struct buffer_head *cow_buf;
        struct btrfs_node *c;
-       struct btrfs_root_item *root_item = &root->root_item;
        u64 blocknr;
        int slot;
        int ret;
@@ -734,11 +848,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        int should_reada = p->reada;
        u8 lowest_level = 0;
 
-       if (btrfs_root_refs(root_item) == 0 && root->ref_cows) {
-               lowest_level = root_item->drop_level;
-               WARN_ON(ins_len || cow);
-       }
-
+       lowest_level = p->lowest_level;
+       WARN_ON(lowest_level && ins_len);
        WARN_ON(p->nodes[0] != NULL);
        WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
 again:
@@ -798,8 +909,8 @@ again:
                        if (level == lowest_level)
                                break;
                        blocknr = btrfs_node_blockptr(c, slot);
-                       if (level == 1 && should_reada)
-                               reada_for_search(root, p, slot);
+                       if (should_reada)
+                               reada_for_search(root, p, level, slot);
                        b = read_tree_block(root, btrfs_node_blockptr(c, slot));
 
                } else {
@@ -960,7 +1071,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
        BUG_ON(path->nodes[level]);
        BUG_ON(path->nodes[level-1] != root->node);
 
-       t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr);
+       t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr, 0);
        if (IS_ERR(t))
                return PTR_ERR(t);
        c = btrfs_buffer_node(t);
@@ -1070,7 +1181,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
        }
 
        c_nritems = btrfs_header_nritems(&c->header);
-       split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr);
+       split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr, 0);
        if (IS_ERR(split_buffer))
                return PTR_ERR(split_buffer);
 
@@ -1461,7 +1572,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
        nritems = btrfs_header_nritems(&l->header);
        mid = (nritems + 1)/ 2;
 
-       right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+       right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
        if (IS_ERR(right_buffer))
                return PTR_ERR(right_buffer);
 
@@ -1560,7 +1671,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
        if (!double_split)
                return ret;
-       right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+       right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
        if (IS_ERR(right_buffer))
                return PTR_ERR(right_buffer);
 
@@ -1988,8 +2099,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
                blocknr = btrfs_node_blockptr(c_node, slot);
                if (next)
                        btrfs_block_release(root, next);
-               if (level == 1 && path->reada)
-                       reada_for_search(root, path, slot);
+               if (path->reada)
+                       reada_for_search(root, path, level, slot);
                next = read_tree_block(root, blocknr);
                break;
        }
@@ -2002,8 +2113,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
                path->slots[level] = 0;
                if (!level)
                        break;
-               if (level == 1 && path->reada)
-                       reada_for_search(root, path, slot);
+               if (path->reada)
+                       reada_for_search(root, path, level, slot);
                next = read_tree_block(root,
                       btrfs_node_blockptr(btrfs_buffer_node(next), 0));
        }
index c5a18d5..42aa203 100644 (file)
@@ -178,6 +178,7 @@ struct btrfs_path {
        struct buffer_head *nodes[BTRFS_MAX_LEVEL];
        int slots[BTRFS_MAX_LEVEL];
        int reada;
+       int lowest_level;
 };
 
 /*
@@ -338,6 +339,9 @@ struct btrfs_root {
        u64 highest_inode;
        u64 last_inode_alloc;
        int ref_cows;
+       struct btrfs_key defrag_progress;
+       int defrag_running;
+       int defrag_level;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -1031,10 +1035,11 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-                                           struct btrfs_root *root, u64 hint);
+                                           struct btrfs_root *root, u64 hint,
+                                           u64 empty_size);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, u64 owner,
-                      u64 num_blocks, u64 search_start,
+                      u64 num_blocks, u64 empty_size, u64 search_start,
                       u64 search_end, struct btrfs_key *ins, int data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct buffer_head *buf);
@@ -1051,6 +1056,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 /* ctree.c */
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+                          *root, struct buffer_head *buf, struct buffer_head
+                          *parent, int parent_slot, struct buffer_head
+                          **cow_ret);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1060,6 +1069,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_key *key, struct btrfs_path *p, int
                      ins_len, int cow);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, struct buffer_head *parent,
+                      int cache_only);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
@@ -1171,4 +1183,7 @@ extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct inode *inode,
                       u64 start, u64 end, u64 *hint_block);
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, int cache_only);
 #endif
index 60db85b..c948416 100644 (file)
@@ -273,7 +273,9 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                     struct buffer_head *buf)
 {
        WARN_ON(atomic_read(&buf->b_count) == 0);
+       lock_buffer(buf);
        clear_buffer_dirty(buf);
+       unlock_buffer(buf);
        return 0;
 }
 
@@ -294,6 +296,9 @@ static int __setup_root(int blocksize,
        root->last_inode_alloc = 0;
        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
+       memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+       root->defrag_running = 0;
+       root->defrag_level = 0;
        root->root_key.objectid = objectid;
        return 0;
 }
@@ -585,6 +590,7 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        btrfs_transaction_flush_work(root);
        mutex_lock(&fs_info->fs_mutex);
+       btrfs_defrag_dirty_roots(root->fs_info);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
        /* run commit again to  drop the original snapshot */
@@ -616,7 +622,9 @@ void btrfs_mark_buffer_dirty(struct buffer_head *bh)
 {
        struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
        u64 transid = btrfs_header_generation(btrfs_buffer_header(bh));
+
        WARN_ON(!atomic_read(&bh->b_count));
+
        if (transid != root->fs_info->generation) {
                printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
                        (unsigned long long)bh->b_blocknr,
index 5d4d5d8..26b8d34 100644 (file)
@@ -23,7 +23,8 @@
 #include "transaction.h"
 
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-                           *orig_root, u64 num_blocks, u64 search_start,
+                           *orig_root, u64 num_blocks, u64 empty_size,
+                           u64 search_start,
                            u64 search_end, u64 hint_block,
                            struct btrfs_key *ins, u64 exclude_start,
                            u64 exclude_nr, int data);
@@ -379,7 +380,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-       ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0,
+       ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0, 0,
                               (u64)-1, 0, &ins, 0, 0, 0);
        if (ret) {
                btrfs_free_path(path);
@@ -533,7 +534,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_item *bi;
        struct btrfs_key ins;
 
-       ret = find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins,
+       ret = find_free_extent(trans, extent_root, 0, 0, 0, (u64)-1, 0, &ins,
                               0, 0, 0);
        /* FIXME, set bit to recalc cache groups on next mount */
        if (ret)
@@ -708,6 +709,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 static int try_remove_page(struct address_space *mapping, unsigned long index)
 {
        int ret;
+       return 0;
        ret = invalidate_mapping_pages(mapping, index, index);
        return ret;
 }
@@ -866,7 +868,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
        if (!path)
                return -ENOMEM;
 
-       ret = find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
+       ret = find_free_extent(trans, root, 0, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
        if (ret) {
                btrfs_free_path(path);
                return ret;
@@ -983,8 +985,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
  * Any available blocks before search_start are skipped.
  */
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-                           *orig_root, u64 num_blocks, u64 search_start, u64
-                           search_end, u64 hint_block,
+                           *orig_root, u64 num_blocks, u64 empty_size,
+                           u64 search_start, u64 search_end, u64 hint_block,
                            struct btrfs_key *ins, u64 exclude_start,
                            u64 exclude_nr, int data)
 {
@@ -1042,6 +1044,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
                                                     data, 1);
        }
 
+       total_needed += empty_size;
        path = btrfs_alloc_path();
 
 check_failed:
@@ -1157,9 +1160,11 @@ check_pending:
                        goto error;
                }
                search_start = orig_search_start;
-               if (wrapped)
+               if (wrapped) {
+                       if (!full_scan)
+                               total_needed -= empty_size;
                        full_scan = 1;
-               else
+               } else
                        wrapped = 1;
                goto new_group;
        }
@@ -1238,9 +1243,11 @@ new_group:
                        ret = -ENOSPC;
                        goto error;
                }
-               if (wrapped)
+               if (wrapped) {
+                       if (!full_scan)
+                               total_needed -= empty_size;
                        full_scan = 1;
-               else
+               } else
                        wrapped = 1;
        }
        block_group = btrfs_lookup_block_group(info, search_start);
@@ -1264,7 +1271,7 @@ error:
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, u64 owner,
-                      u64 num_blocks, u64 hint_block,
+                      u64 num_blocks, u64 empty_size, u64 hint_block,
                       u64 search_end, struct btrfs_key *ins, int data)
 {
        int ret;
@@ -1303,7 +1310,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
         * in the correct block group.
         */
        if (data) {
-               ret = find_free_extent(trans, root, 0, 0,
+               ret = find_free_extent(trans, root, 0, 0, 0,
                                       search_end, 0, &prealloc_key, 0, 0, 0);
                BUG_ON(ret);
                if (ret)
@@ -1313,8 +1320,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
        }
 
        /* do the real allocation */
-       ret = find_free_extent(trans, root, num_blocks, search_start,
-                              search_end, hint_block, ins,
+       ret = find_free_extent(trans, root, num_blocks, empty_size,
+                              search_start, search_end, hint_block, ins,
                               exclude_start, exclude_nr, data);
        BUG_ON(ret);
        if (ret)
@@ -1333,7 +1340,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                exclude_start = ins->objectid;
                exclude_nr = ins->offset;
                hint_block = exclude_start + exclude_nr;
-               ret = find_free_extent(trans, root, 0, search_start,
+               ret = find_free_extent(trans, root, 0, 0, search_start,
                                       search_end, hint_block,
                                       &prealloc_key, exclude_start,
                                       exclude_nr, 0);
@@ -1368,14 +1375,16 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
  * returns the tree buffer or NULL.
  */
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-                                          struct btrfs_root *root, u64 hint)
+                                          struct btrfs_root *root, u64 hint,
+                                          u64 empty_size)
 {
        struct btrfs_key ins;
        int ret;
        struct buffer_head *buf;
 
        ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-                                1, hint, (unsigned long)-1, &ins, 0);
+                                1, empty_size, hint,
+                                (unsigned long)-1, &ins, 0);
        if (ret) {
                BUG_ON(ret > 0);
                return ERR_PTR(ret);
@@ -1385,6 +1394,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                btrfs_free_extent(trans, root, ins.objectid, 1, 0);
                return ERR_PTR(-ENOMEM);
        }
+       WARN_ON(buffer_dirty(buf));
        set_buffer_uptodate(buf);
        set_buffer_checked(buf);
        set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
@@ -1591,13 +1601,15 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                struct btrfs_key key;
                struct btrfs_disk_key *found_key;
                struct btrfs_node *node;
+
                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+               level = root_item->drop_level;
+               path->lowest_level = level;
                wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-               if (ret < 0) {
+               if (wret < 0) {
                        ret = wret;
                        goto out;
                }
-               level = root_item->drop_level;
                node = btrfs_buffer_node(path->nodes[level]);
                found_key = &node->ptrs[path->slots[level]].key;
                WARN_ON(memcmp(found_key, &root_item->drop_progress,
@@ -1617,8 +1629,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                        ret = wret;
                num_walks++;
                if (num_walks > 10) {
-                       struct btrfs_key key;
-                       btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
                        ret = -EAGAIN;
                        get_bh(root->node);
                        break;
@@ -1627,6 +1637,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
        for (i = 0; i <= orig_level; i++) {
                if (path->nodes[i]) {
                        btrfs_block_release(root, path->nodes[i]);
+                       path->nodes[i] = 0;
                }
        }
 out:
index 1fe38fe..00b118a 100644 (file)
@@ -512,7 +512,7 @@ static int prepare_pages(struct btrfs_root *root,
        if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
            pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
                err = btrfs_alloc_extent(trans, root, inode->i_ino,
-                                        num_blocks, hint_block, (u64)-1,
+                                        num_blocks, 0, hint_block, (u64)-1,
                                         &ins, 1);
                if (err)
                        goto failed_truncate;
index 3889032..12aa043 100644 (file)
@@ -554,7 +554,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
                                 &alloc_hint);
        if (ret)
                goto out;
-       ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
+       ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
                                 alloc_hint, (u64)-1, &ins, 1);
        if (ret)
                goto out;
@@ -1360,7 +1360,7 @@ not_found:
        if (create & BTRFS_GET_BLOCK_CREATE) {
                struct btrfs_key ins;
                ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-                                        1, alloc_hint, (u64)-1,
+                                        1, 0, alloc_hint, (u64)-1,
                                         &ins, 1);
                if (ret) {
                        err = ret;
@@ -1998,7 +1998,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
        trans = btrfs_start_transaction(root, 1);
        BUG_ON(!trans);
 
-       subvol = btrfs_alloc_free_block(trans, root, 0);
+       subvol = btrfs_alloc_free_block(trans, root, 0, 0);
        if (IS_ERR(subvol))
                return PTR_ERR(subvol);
        leaf = btrfs_buffer_leaf(subvol);
@@ -2159,7 +2159,9 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_vol_args vol_args;
+       struct btrfs_trans_handle *trans;
        int ret = 0;
+       int err;
        struct btrfs_dir_item *di;
        int namelen;
        struct btrfs_path *path;
@@ -2196,6 +2198,31 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
                else
                        ret = create_snapshot(root, vol_args.name, namelen);
                break;
+
+       case BTRFS_IOC_DEFRAG:
+               mutex_lock(&root->fs_info->fs_mutex);
+               trans = btrfs_start_transaction(root, 1);
+               memset(&root->defrag_progress, 0,
+                      sizeof(root->defrag_progress));
+               while (1) {
+                       root->defrag_running = 1;
+                       err = btrfs_defrag_leaves(trans, root, 0);
+
+                       btrfs_end_transaction(trans, root);
+                       mutex_unlock(&root->fs_info->fs_mutex);
+
+                       btrfs_btree_balance_dirty(root);
+
+                       mutex_lock(&root->fs_info->fs_mutex);
+                       trans = btrfs_start_transaction(root, 1);
+                       if (err != -EAGAIN)
+                               break;
+               }
+               root->defrag_running = 0;
+               btrfs_end_transaction(trans, root);
+               mutex_unlock(&root->fs_info->fs_mutex);
+               ret = 0;
+               break;
        default:
                return -ENOTTY;
        }
index 23bed48..8bc47de 100644 (file)
@@ -28,6 +28,6 @@ struct btrfs_ioctl_vol_args {
 
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
                                   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_ADD_DISK _IOW(BTRFS_IOCTL_MAGIC, 2, \
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
                                   struct btrfs_ioctl_vol_args)
 #endif
index 4986264..338a719 100644 (file)
@@ -29,6 +29,7 @@ extern struct kmem_cache *btrfs_transaction_cachep;
 static struct workqueue_struct *trans_wq;
 
 #define BTRFS_ROOT_TRANS_TAG 0
+#define BTRFS_ROOT_DEFRAG_TAG 1
 
 static void put_transaction(struct btrfs_transaction *transaction)
 {
@@ -69,35 +70,41 @@ static int join_transaction(struct btrfs_root *root)
        return 0;
 }
 
+static int record_root_in_trans(struct btrfs_root *root)
+{
+       u64 running_trans_id = root->fs_info->running_transaction->transid;
+       if (root->ref_cows && root->last_trans < running_trans_id) {
+               WARN_ON(root == root->fs_info->extent_root);
+               if (root->root_item.refs != 0) {
+                       radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+                                  (unsigned long)root->root_key.objectid,
+                                  BTRFS_ROOT_TRANS_TAG);
+                       radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+                                  (unsigned long)root->root_key.objectid,
+                                  BTRFS_ROOT_DEFRAG_TAG);
+                       root->commit_root = root->node;
+                       get_bh(root->node);
+               } else {
+                       WARN_ON(1);
+               }
+               root->last_trans = running_trans_id;
+       }
+       return 0;
+}
+
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_blocks)
 {
        struct btrfs_trans_handle *h =
                kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        int ret;
-       u64 running_trans_id;
 
        mutex_lock(&root->fs_info->trans_mutex);
        ret = join_transaction(root);
        BUG_ON(ret);
-       running_trans_id = root->fs_info->running_transaction->transid;
 
-       if (root != root->fs_info->tree_root && root->last_trans <
-           running_trans_id) {
-               WARN_ON(root == root->fs_info->extent_root);
-               WARN_ON(root->ref_cows != 1);
-               if (root->root_item.refs != 0) {
-                       radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-                                          (unsigned long)root->root_key.objectid,
-                                          BTRFS_ROOT_TRANS_TAG);
-                       root->commit_root = root->node;
-                       get_bh(root->node);
-               } else {
-                       WARN_ON(1);
-               }
-       }
-       root->last_trans = running_trans_id;
-       h->transid = running_trans_id;
+       record_root_in_trans(root);
+       h->transid = root->fs_info->running_transaction->transid;
        h->transaction = root->fs_info->running_transaction;
        h->blocks_reserved = num_blocks;
        h->blocks_used = 0;
@@ -155,6 +162,15 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                              gang[i]);
                        if (!page)
                                continue;
+                       if (PageWriteback(page)) {
+                               if (PageDirty(page))
+                                       wait_on_page_writeback(page);
+                               else {
+                                       unlock_page(page);
+                                       page_cache_release(page);
+                                       continue;
+                               }
+                       }
                        err = write_one_page(page, 0);
                        if (err)
                                werr = err;
@@ -299,6 +315,58 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
        return err;
 }
 
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
+{
+       struct btrfs_root *gang[1];
+       struct btrfs_root *root;
+       struct btrfs_root *tree_root = info->tree_root;
+       struct btrfs_trans_handle *trans;
+       int i;
+       int ret;
+       int err = 0;
+       u64 last = 0;
+
+       trans = btrfs_start_transaction(tree_root, 1);
+       while(1) {
+               ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
+                                                (void **)gang, last,
+                                                ARRAY_SIZE(gang),
+                                                BTRFS_ROOT_DEFRAG_TAG);
+               if (ret == 0)
+                       break;
+               for (i = 0; i < ret; i++) {
+                       root = gang[i];
+                       last = root->root_key.objectid + 1;
+                       radix_tree_tag_clear(&info->fs_roots_radix,
+                                    (unsigned long)root->root_key.objectid,
+                                    BTRFS_ROOT_DEFRAG_TAG);
+                       if (root->defrag_running)
+                               continue;
+
+                       while (1) {
+                               mutex_lock(&root->fs_info->trans_mutex);
+                               record_root_in_trans(root);
+                               mutex_unlock(&root->fs_info->trans_mutex);
+
+                               root->defrag_running = 1;
+                               err = btrfs_defrag_leaves(trans, root, 1);
+                               btrfs_end_transaction(trans, tree_root);
+                               mutex_unlock(&info->fs_mutex);
+
+                               btrfs_btree_balance_dirty(root);
+
+                               mutex_lock(&info->fs_mutex);
+                               trans = btrfs_start_transaction(tree_root, 1);
+                               if (err != -EAGAIN)
+                                       break;
+                       }
+                       root->defrag_running = 0;
+               }
+       }
+       btrfs_end_transaction(trans, tree_root);
+       return err;
+}
+
 static int drop_dirty_roots(struct btrfs_root *tree_root,
                            struct list_head *list)
 {
@@ -475,6 +543,7 @@ void btrfs_transaction_cleaner(struct work_struct *work)
                goto out;
        }
        mutex_unlock(&root->fs_info->trans_mutex);
+       btrfs_defrag_dirty_roots(root->fs_info);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
 out:
index ebf44f3..8b2714e 100644 (file)
@@ -70,5 +70,6 @@ void btrfs_transaction_queue_work(struct btrfs_root *root, int delay);
 void btrfs_init_transaction_sys(void);
 void btrfs_exit_transaction_sys(void);
 int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list);
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
 
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644 (file)
index 0000000..15d0a48
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+
+static void reada_defrag(struct btrfs_root *root,
+                        struct btrfs_node *node)
+{
+       int i;
+       u32 nritems;
+       u64 blocknr;
+       int ret;
+
+       nritems = btrfs_header_nritems(&node->header);
+       for (i = 0; i < nritems; i++) {
+               blocknr = btrfs_node_blockptr(node, i);
+               ret = readahead_tree_block(root, blocknr);
+               if (ret)
+                       break;
+       }
+}
+
+static int defrag_walk_down(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           struct btrfs_path *path, int *level,
+                           int cache_only)
+{
+       struct buffer_head *next;
+       struct buffer_head *cur;
+       u64 blocknr;
+       int ret = 0;
+
+       WARN_ON(*level < 0);
+       WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+       while(*level > 0) {
+               WARN_ON(*level < 0);
+               WARN_ON(*level >= BTRFS_MAX_LEVEL);
+               cur = path->nodes[*level];
+
+               if (!cache_only && *level > 1 && path->slots[*level] == 0)
+                       reada_defrag(root, btrfs_buffer_node(cur));
+
+               if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
+                       WARN_ON(1);
+
+               if (path->slots[*level] >=
+                   btrfs_header_nritems(btrfs_buffer_header(cur)))
+                       break;
+
+               if (*level == 1) {
+                       ret = btrfs_realloc_node(trans, root,
+                                                path->nodes[*level],
+                                                cache_only);
+                       break;
+               }
+               blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
+                                             path->slots[*level]);
+
+               if (cache_only) {
+                       next = btrfs_find_tree_block(root, blocknr);
+                       if (!next || !buffer_uptodate(next) ||
+                          buffer_locked(next)) {
+                               brelse(next);
+                               path->slots[*level]++;
+                               continue;
+                       }
+               } else {
+                       next = read_tree_block(root, blocknr);
+               }
+               ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
+                                     path->slots[*level], &next);
+               BUG_ON(ret);
+               ret = btrfs_realloc_node(trans, root, next, cache_only);
+               BUG_ON(ret);
+               WARN_ON(*level <= 0);
+               if (path->nodes[*level-1])
+                       btrfs_block_release(root, path->nodes[*level-1]);
+               path->nodes[*level-1] = next;
+               *level = btrfs_header_level(btrfs_buffer_header(next));
+               path->slots[*level] = 0;
+       }
+       WARN_ON(*level < 0);
+       WARN_ON(*level >= BTRFS_MAX_LEVEL);
+       btrfs_block_release(root, path->nodes[*level]);
+       path->nodes[*level] = NULL;
+       *level += 1;
+       WARN_ON(ret);
+       return 0;
+}
+
+static int defrag_walk_up(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_path *path, int *level,
+                         int cache_only)
+{
+       int i;
+       int slot;
+       struct btrfs_node *node;
+
+       for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+               slot = path->slots[i];
+               if (slot < btrfs_header_nritems(
+                   btrfs_buffer_header(path->nodes[i])) - 1) {
+                       path->slots[i]++;
+                       *level = i;
+                       node = btrfs_buffer_node(path->nodes[i]);
+                       WARN_ON(i == 0);
+                       btrfs_disk_key_to_cpu(&root->defrag_progress,
+                                             &node->ptrs[path->slots[i]].key);
+                       root->defrag_level = i;
+                       return 0;
+               } else {
+                       btrfs_block_release(root, path->nodes[*level]);
+                       path->nodes[*level] = NULL;
+                       *level = i + 1;
+               }
+       }
+       return 1;
+}
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, int cache_only)
+{
+       struct btrfs_path *path = NULL;
+       struct buffer_head *tmp;
+       int ret = 0;
+       int wret;
+       int level;
+       int orig_level;
+       int i;
+       int num_runs = 0;
+
+       if (root->ref_cows == 0) {
+               goto out;
+       }
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       level = btrfs_header_level(btrfs_buffer_header(root->node));
+       orig_level = level;
+       if (level == 0) {
+               goto out;
+       }
+       if (root->defrag_progress.objectid == 0) {
+               get_bh(root->node);
+               ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
+               BUG_ON(ret);
+               ret = btrfs_realloc_node(trans, root, root->node, cache_only);
+               BUG_ON(ret);
+               path->nodes[level] = root->node;
+               path->slots[level] = 0;
+       } else {
+               level = root->defrag_level;
+               path->lowest_level = level;
+               wret = btrfs_search_slot(trans, root, &root->defrag_progress,
+                                        path, 0, 1);
+
+               if (wret < 0) {
+                       ret = wret;
+                       goto out;
+               }
+               while(level > 0 && !path->nodes[level])
+                       level--;
+               if (!path->nodes[level]) {
+                       ret = 0;
+                       goto out;
+               }
+       }
+
+       while(1) {
+               wret = defrag_walk_down(trans, root, path, &level, cache_only);
+               if (wret > 0)
+                       break;
+               if (wret < 0)
+                       ret = wret;
+
+               wret = defrag_walk_up(trans, root, path, &level, cache_only);
+               if (wret > 0)
+                       break;
+               if (wret < 0)
+                       ret = wret;
+               if (num_runs++ > 8) {
+                       ret = -EAGAIN;
+                       break;
+               }
+       }
+       for (i = 0; i <= orig_level; i++) {
+               if (path->nodes[i]) {
+                       btrfs_block_release(root, path->nodes[i]);
+                       path->nodes[i] = 0;
+               }
+       }
+out:
+       if (path)
+               btrfs_free_path(path);
+       if (ret != -EAGAIN) {
+               memset(&root->defrag_progress, 0,
+                      sizeof(root->defrag_progress));
+       }
+       return ret;
+}