Btrfs: create special free space cache inode
Josef Bacik [Mon, 21 Jun 2010 18:48:16 +0000 (14:48 -0400)]
In order to save free space cache, we need an inode to hold the data, and we
need a special item to point at the right inode for the right block group.  So
first, create a special item that will point to the right inode, and the number
of extent entries we will have and the number of bitmaps we will have.  We
truncate and pre-allocate space everytime to make sure it's uptodate.

This feature will be turned on as soon as you mount with -o space_cache, however
it is safe to boot into old kernels, they will just generate the cache the old
fashion way.  When you boot back into a newer kernel we will notice that we
modified and not the cache and automatically discard the cache.

Signed-off-by: Josef Bacik <josef@redhat.com>

fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-cache.h
fs/btrfs/inode.c
fs/btrfs/relocation.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h

index eaf286a..46f52e1 100644 (file)
@@ -99,6 +99,9 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
 
+/* For storing free space cache */
+#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
 
@@ -265,6 +268,22 @@ struct btrfs_chunk {
        /* additional stripes go here */
 } __attribute__ ((__packed__));
 
+#define BTRFS_FREE_SPACE_EXTENT        1
+#define BTRFS_FREE_SPACE_BITMAP        2
+
+struct btrfs_free_space_entry {
+       __le64 offset;
+       __le64 bytes;
+       u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_free_space_header {
+       struct btrfs_disk_key location;
+       __le64 generation;
+       __le64 num_entries;
+       __le64 num_bitmaps;
+} __attribute__ ((__packed__));
+
 static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 {
        BUG_ON(num_stripes == 0);
@@ -365,8 +384,10 @@ struct btrfs_super_block {
 
        char label[BTRFS_LABEL_SIZE];
 
+       __le64 cache_generation;
+
        /* future expansion */
-       __le64 reserved[32];
+       __le64 reserved[31];
        u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
@@ -375,12 +396,12 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF   (1ULL << 0)
-#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL  (2ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL  (1ULL << 1)
 
 #define BTRFS_FEATURE_COMPAT_SUPP              0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP           0ULL
-#define BTRFS_FEATURE_INCOMPAT_SUPP            \
-       (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+#define BTRFS_FEATURE_INCOMPAT_SUPP                    \
+       (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |         \
         BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
 
 /*
@@ -750,6 +771,14 @@ enum btrfs_caching_type {
        BTRFS_CACHE_FINISHED    = 2,
 };
 
+enum btrfs_disk_cache_state {
+       BTRFS_DC_WRITTEN        = 0,
+       BTRFS_DC_ERROR          = 1,
+       BTRFS_DC_CLEAR          = 2,
+       BTRFS_DC_SETUP          = 3,
+       BTRFS_DC_NEED_WRITE     = 4,
+};
+
 struct btrfs_caching_control {
        struct list_head list;
        struct mutex mutex;
@@ -763,6 +792,7 @@ struct btrfs_block_group_cache {
        struct btrfs_key key;
        struct btrfs_block_group_item item;
        struct btrfs_fs_info *fs_info;
+       struct inode *inode;
        spinlock_t lock;
        u64 pinned;
        u64 reserved;
@@ -773,8 +803,11 @@ struct btrfs_block_group_cache {
        int extents_thresh;
        int free_extents;
        int total_bitmaps;
-       int ro;
-       int dirty;
+       int ro:1;
+       int dirty:1;
+       int iref:1;
+
+       int disk_cache_state;
 
        /* cache tracking stuff */
        int cached;
@@ -1192,6 +1225,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOSSD              (1 << 9)
 #define BTRFS_MOUNT_DISCARD            (1 << 10)
 #define BTRFS_MOUNT_FORCE_COMPRESS      (1 << 11)
+#define BTRFS_MOUNT_SPACE_CACHE                (1 << 12)
 
 #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
@@ -1665,6 +1699,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
        write_eb_member(eb, item, struct btrfs_dir_item, location, key);
 }
 
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+                  num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+                  num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+                  generation, 64);
+
+static inline void btrfs_free_space_key(struct extent_buffer *eb,
+                                       struct btrfs_free_space_header *h,
+                                       struct btrfs_disk_key *key)
+{
+       read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+                                           struct btrfs_free_space_header *h,
+                                           struct btrfs_disk_key *key)
+{
+       write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
 /* struct btrfs_disk_key */
 BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
                         objectid, 64);
@@ -1876,6 +1931,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
                         incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
                         csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+                        cache_generation, 64);
 
 static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
 {
@@ -2115,6 +2172,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -2426,6 +2484,10 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
                              u64 start, u64 num_bytes, u64 min_size,
                              loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+                                   struct btrfs_trans_handle *trans, int mode,
+                                   u64 start, u64 num_bytes, u64 min_size,
+                                   loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
index 64f1008..45cf64f 100644 (file)
@@ -1685,7 +1685,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        __setup_root(4096, 4096, 4096, 4096, tree_root,
                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
-
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
        if (!bh)
                goto fail_iput;
@@ -1993,6 +1992,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        if (!(sb->s_flags & MS_RDONLY)) {
                down_read(&fs_info->cleanup_work_sem);
                btrfs_orphan_cleanup(fs_info->fs_root);
+               btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
        }
 
@@ -2421,6 +2421,7 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        smp_mb();
 
+       btrfs_put_block_group_cache(fs_info);
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                ret =  btrfs_commit_super(root);
                if (ret)
index 32d0940..aab40fb 100644 (file)
@@ -2688,6 +2688,109 @@ next_block_group(struct btrfs_root *root,
        return cache;
 }
 
+static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+                           struct btrfs_trans_handle *trans,
+                           struct btrfs_path *path)
+{
+       struct btrfs_root *root = block_group->fs_info->tree_root;
+       struct inode *inode = NULL;
+       u64 alloc_hint = 0;
+       int num_pages = 0;
+       int retries = 0;
+       int ret = 0;
+
+       /*
+        * If this block group is smaller than 100 megs don't bother caching the
+        * block group.
+        */
+       if (block_group->key.offset < (100 * 1024 * 1024)) {
+               spin_lock(&block_group->lock);
+               block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+               spin_unlock(&block_group->lock);
+               return 0;
+       }
+
+again:
+       inode = lookup_free_space_inode(root, block_group, path);
+       if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+               ret = PTR_ERR(inode);
+               btrfs_release_path(root, path);
+               goto out;
+       }
+
+       if (IS_ERR(inode)) {
+               BUG_ON(retries);
+               retries++;
+
+               if (block_group->ro)
+                       goto out_free;
+
+               ret = create_free_space_inode(root, trans, block_group, path);
+               if (ret)
+                       goto out_free;
+               goto again;
+       }
+
+       /*
+        * We want to set the generation to 0, that way if anything goes wrong
+        * from here on out we know not to trust this cache when we load up next
+        * time.
+        */
+       BTRFS_I(inode)->generation = 0;
+       ret = btrfs_update_inode(trans, root, inode);
+       WARN_ON(ret);
+
+       if (i_size_read(inode) > 0) {
+               ret = btrfs_truncate_free_space_cache(root, trans, path,
+                                                     inode);
+               if (ret)
+                       goto out_put;
+       }
+
+       spin_lock(&block_group->lock);
+       if (block_group->cached != BTRFS_CACHE_FINISHED) {
+               spin_unlock(&block_group->lock);
+               goto out_put;
+       }
+       spin_unlock(&block_group->lock);
+
+       num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+       if (!num_pages)
+               num_pages = 1;
+
+       /*
+        * Just to make absolutely sure we have enough space, we're going to
+        * preallocate 12 pages worth of space for each block group.  In
+        * practice we ought to use at most 8, but we need extra space so we can
+        * add our header and have a terminator between the extents and the
+        * bitmaps.
+        */
+       num_pages *= 16;
+       num_pages *= PAGE_CACHE_SIZE;
+
+       ret = btrfs_check_data_free_space(inode, num_pages);
+       if (ret)
+               goto out_put;
+
+       ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+                                             num_pages, num_pages,
+                                             &alloc_hint);
+       btrfs_free_reserved_data_space(inode, num_pages);
+out_put:
+       iput(inode);
+out_free:
+       btrfs_release_path(root, path);
+out:
+       spin_lock(&block_group->lock);
+       if (ret)
+               block_group->disk_cache_state = BTRFS_DC_ERROR;
+       else
+               block_group->disk_cache_state = BTRFS_DC_SETUP;
+       spin_unlock(&block_group->lock);
+
+       return ret;
+}
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
@@ -2700,6 +2803,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
+again:
+       while (1) {
+               cache = btrfs_lookup_first_block_group(root->fs_info, last);
+               while (cache) {
+                       if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                               break;
+                       cache = next_block_group(root, cache);
+               }
+               if (!cache) {
+                       if (last == 0)
+                               break;
+                       last = 0;
+                       continue;
+               }
+               err = cache_save_setup(cache, trans, path);
+               last = cache->key.objectid + cache->key.offset;
+               btrfs_put_block_group(cache);
+       }
+
        while (1) {
                if (last == 0) {
                        err = btrfs_run_delayed_refs(trans, root,
@@ -2709,6 +2831,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
                while (cache) {
+                       if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
+                               btrfs_put_block_group(cache);
+                               goto again;
+                       }
+
                        if (cache->dirty)
                                break;
                        cache = next_block_group(root, cache);
@@ -2883,11 +3010,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 used;
-       int ret = 0, committed = 0;
+       int ret = 0, committed = 0, alloc_chunk = 1;
 
        /* make sure bytes are sectorsize aligned */
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
+       if (root == root->fs_info->tree_root) {
+               alloc_chunk = 0;
+               committed = 1;
+       }
+
        data_sinfo = BTRFS_I(inode)->space_info;
        if (!data_sinfo)
                goto alloc;
@@ -2906,7 +3038,7 @@ again:
                 * if we don't have enough free bytes in this space then we need
                 * to alloc a new chunk.
                 */
-               if (!data_sinfo->full) {
+               if (!data_sinfo->full && alloc_chunk) {
                        u64 alloc_target;
 
                        data_sinfo->force_alloc = 1;
@@ -3777,12 +3909,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc)
 {
-       struct btrfs_block_group_cache *cache;
+       struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_fs_info *info = root->fs_info;
-       int factor;
        u64 total = num_bytes;
        u64 old_val;
        u64 byte_in_group;
+       int factor;
 
        /* block accounting for super block */
        spin_lock(&info->delalloc_lock);
@@ -3804,11 +3936,17 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        factor = 2;
                else
                        factor = 1;
+
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
 
                spin_lock(&cache->space_info->lock);
                spin_lock(&cache->lock);
+
+               if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+                   cache->disk_cache_state < BTRFS_DC_CLEAR)
+                       cache->disk_cache_state = BTRFS_DC_CLEAR;
+
                cache->dirty = 1;
                old_val = btrfs_block_group_used(&cache->item);
                num_bytes = min(total, cache->key.offset - byte_in_group);
@@ -7814,6 +7952,40 @@ out:
        return ret;
 }
 
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+       struct btrfs_block_group_cache *block_group;
+       u64 last = 0;
+
+       while (1) {
+               struct inode *inode;
+
+               block_group = btrfs_lookup_first_block_group(info, last);
+               while (block_group) {
+                       spin_lock(&block_group->lock);
+                       if (block_group->iref)
+                               break;
+                       spin_unlock(&block_group->lock);
+                       block_group = next_block_group(info->tree_root,
+                                                      block_group);
+               }
+               if (!block_group) {
+                       if (last == 0)
+                               break;
+                       last = 0;
+                       continue;
+               }
+
+               inode = block_group->inode;
+               block_group->iref = 0;
+               block_group->inode = NULL;
+               spin_unlock(&block_group->lock);
+               iput(inode);
+               last = block_group->key.objectid + block_group->key.offset;
+               btrfs_put_block_group(block_group);
+       }
+}
+
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
        struct btrfs_block_group_cache *block_group;
@@ -7897,6 +8069,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf;
+       int need_clear = 0;
+       u64 cache_gen;
 
        root = info->extent_root;
        key.objectid = 0;
@@ -7906,6 +8080,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
        if (!path)
                return -ENOMEM;
 
+       cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+       if (cache_gen != 0 &&
+           btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+               need_clear = 1;
+
        while (1) {
                ret = find_first_block_group(root, path, &key);
                if (ret > 0)
@@ -7928,6 +8107,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
 
+               if (need_clear)
+                       cache->disk_cache_state = BTRFS_DC_CLEAR;
+
                /*
                 * we only want to have 32k of ram per block group for keeping
                 * track of free space, and if we pass 1/2 of that we want to
@@ -8032,6 +8214,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->key.offset = size;
        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        cache->sectorsize = root->sectorsize;
+       cache->fs_info = root->fs_info;
 
        /*
         * we only want to have 32k of ram per block group for keeping track
@@ -8088,7 +8271,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        struct btrfs_path *path;
        struct btrfs_block_group_cache *block_group;
        struct btrfs_free_cluster *cluster;
+       struct btrfs_root *tree_root = root->fs_info->tree_root;
        struct btrfs_key key;
+       struct inode *inode;
        int ret;
 
        root = root->fs_info->extent_root;
@@ -8097,8 +8282,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        BUG_ON(!block_group);
        BUG_ON(!block_group->ro);
 
-       memcpy(&key, &block_group->key, sizeof(key));
-
        /* make sure this block group isn't part of an allocation cluster */
        cluster = &root->fs_info->data_alloc_cluster;
        spin_lock(&cluster->refill_lock);
@@ -8117,6 +8300,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
+       inode = lookup_free_space_inode(root, block_group, path);
+       if (!IS_ERR(inode)) {
+               btrfs_orphan_add(trans, inode);
+               clear_nlink(inode);
+               /* One for the block groups ref */
+               spin_lock(&block_group->lock);
+               if (block_group->iref) {
+                       block_group->iref = 0;
+                       block_group->inode = NULL;
+                       spin_unlock(&block_group->lock);
+                       iput(inode);
+               } else {
+                       spin_unlock(&block_group->lock);
+               }
+               /* One for our lookup ref */
+               iput(inode);
+       }
+
+       key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+       key.offset = block_group->key.objectid;
+       key.type = 0;
+
+       ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+       if (ret < 0)
+               goto out;
+       if (ret > 0)
+               btrfs_release_path(tree_root, path);
+       if (ret == 0) {
+               ret = btrfs_del_item(trans, tree_root, path);
+               if (ret)
+                       goto out;
+               btrfs_release_path(tree_root, path);
+       }
+
        spin_lock(&root->fs_info->block_group_cache_lock);
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
@@ -8140,6 +8357,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        spin_unlock(&block_group->space_info->lock);
 
+       memcpy(&key, &block_group->key, sizeof(key));
+
        btrfs_clear_space_info_full(root->fs_info);
 
        btrfs_put_block_group(block_group);
index f488fac..05efcc7 100644 (file)
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
+#include "disk-io.h"
 
 #define BITS_PER_BITMAP                (PAGE_CACHE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG        (32 * 1024)
 
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+                                     struct btrfs_block_group_cache
+                                     *block_group, struct btrfs_path *path)
+{
+       struct btrfs_key key;
+       struct btrfs_key location;
+       struct btrfs_disk_key disk_key;
+       struct btrfs_free_space_header *header;
+       struct extent_buffer *leaf;
+       struct inode *inode = NULL;
+       int ret;
+
+       spin_lock(&block_group->lock);
+       if (block_group->inode)
+               inode = igrab(block_group->inode);
+       spin_unlock(&block_group->lock);
+       if (inode)
+               return inode;
+
+       key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+       key.offset = block_group->key.objectid;
+       key.type = 0;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               return ERR_PTR(ret);
+       if (ret > 0) {
+               btrfs_release_path(root, path);
+               return ERR_PTR(-ENOENT);
+       }
+
+       leaf = path->nodes[0];
+       header = btrfs_item_ptr(leaf, path->slots[0],
+                               struct btrfs_free_space_header);
+       btrfs_free_space_key(leaf, header, &disk_key);
+       btrfs_disk_key_to_cpu(&location, &disk_key);
+       btrfs_release_path(root, path);
+
+       inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
+       if (!inode)
+               return ERR_PTR(-ENOENT);
+       if (IS_ERR(inode))
+               return inode;
+       if (is_bad_inode(inode)) {
+               iput(inode);
+               return ERR_PTR(-ENOENT);
+       }
+
+       spin_lock(&block_group->lock);
+       if (!root->fs_info->closing) {
+               block_group->inode = igrab(inode);
+               block_group->iref = 1;
+       }
+       spin_unlock(&block_group->lock);
+
+       return inode;
+}
+
+int create_free_space_inode(struct btrfs_root *root,
+                           struct btrfs_trans_handle *trans,
+                           struct btrfs_block_group_cache *block_group,
+                           struct btrfs_path *path)
+{
+       struct btrfs_key key;
+       struct btrfs_disk_key disk_key;
+       struct btrfs_free_space_header *header;
+       struct btrfs_inode_item *inode_item;
+       struct extent_buffer *leaf;
+       u64 objectid;
+       int ret;
+
+       ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
+       if (ret < 0)
+               return ret;
+
+       ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+       if (ret)
+               return ret;
+
+       leaf = path->nodes[0];
+       inode_item = btrfs_item_ptr(leaf, path->slots[0],
+                                   struct btrfs_inode_item);
+       btrfs_item_key(leaf, &disk_key, path->slots[0]);
+       memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
+                            sizeof(*inode_item));
+       btrfs_set_inode_generation(leaf, inode_item, trans->transid);
+       btrfs_set_inode_size(leaf, inode_item, 0);
+       btrfs_set_inode_nbytes(leaf, inode_item, 0);
+       btrfs_set_inode_uid(leaf, inode_item, 0);
+       btrfs_set_inode_gid(leaf, inode_item, 0);
+       btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
+       btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
+                             BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+       btrfs_set_inode_nlink(leaf, inode_item, 1);
+       btrfs_set_inode_transid(leaf, inode_item, trans->transid);
+       btrfs_set_inode_block_group(leaf, inode_item,
+                                   block_group->key.objectid);
+       btrfs_mark_buffer_dirty(leaf);
+       btrfs_release_path(root, path);
+
+       key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+       key.offset = block_group->key.objectid;
+       key.type = 0;
+
+       ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                     sizeof(struct btrfs_free_space_header));
+       if (ret < 0) {
+               btrfs_release_path(root, path);
+               return ret;
+       }
+       leaf = path->nodes[0];
+       header = btrfs_item_ptr(leaf, path->slots[0],
+                               struct btrfs_free_space_header);
+       memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
+       btrfs_set_free_space_key(leaf, header, &disk_key);
+       btrfs_mark_buffer_dirty(leaf);
+       btrfs_release_path(root, path);
+
+       return 0;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+                                   struct btrfs_trans_handle *trans,
+                                   struct btrfs_path *path,
+                                   struct inode *inode)
+{
+       loff_t oldsize;
+       int ret = 0;
+
+       trans->block_rsv = root->orphan_block_rsv;
+       ret = btrfs_block_rsv_check(trans, root,
+                                   root->orphan_block_rsv,
+                                   0, 5);
+       if (ret)
+               return ret;
+
+       oldsize = i_size_read(inode);
+       btrfs_i_size_write(inode, 0);
+       truncate_pagecache(inode, oldsize, 0);
+
+       /*
+        * We don't need an orphan item because truncating the free space cache
+        * will never be split across transactions.
+        */
+       ret = btrfs_truncate_inode_items(trans, root, inode,
+                                        0, BTRFS_EXTENT_DATA_KEY);
+       if (ret) {
+               WARN_ON(1);
+               return ret;
+       }
+
+       return btrfs_update_inode(trans, root, inode);
+}
+
 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
                                          u64 offset)
 {
index 890a8e7..45be29e 100644 (file)
@@ -27,6 +27,17 @@ struct btrfs_free_space {
        struct list_head list;
 };
 
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+                                     struct btrfs_block_group_cache
+                                     *block_group, struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_root *root,
+                           struct btrfs_trans_handle *trans,
+                           struct btrfs_block_group_cache *block_group,
+                           struct btrfs_path *path);
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+                                   struct btrfs_trans_handle *trans,
+                                   struct btrfs_path *path,
+                                   struct inode *inode);
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
                         u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
index c038644..1af1ea8 100644 (file)
@@ -1700,6 +1700,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                                ordered_extent->len);
                BUG_ON(ret);
        } else {
+               BUG_ON(root == root->fs_info->tree_root);
                ret = insert_reserved_file_extent(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->start,
@@ -3196,7 +3197,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
-       if (root->ref_cows)
+       if (root->ref_cows || root == root->fs_info->tree_root)
                btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
 
        path = btrfs_alloc_path();
@@ -3344,7 +3345,8 @@ delete:
                } else {
                        break;
                }
-               if (found_extent && root->ref_cows) {
+               if (found_extent && (root->ref_cows ||
+                                    root == root->fs_info->tree_root)) {
                        btrfs_set_path_blocking(path);
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
@@ -3675,7 +3677,8 @@ void btrfs_evict_inode(struct inode *inode)
        int ret;
 
        truncate_inode_pages(&inode->i_data, 0);
-       if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
+       if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
+                              root == root->fs_info->tree_root))
                goto no_delete;
 
        if (is_bad_inode(inode)) {
@@ -3888,7 +3891,14 @@ static void inode_tree_del(struct inode *inode)
        }
        spin_unlock(&root->inode_lock);
 
-       if (empty && btrfs_root_refs(&root->root_item) == 0) {
+       /*
+        * Free space cache has inodes in the tree root, but the tree root has a
+        * root_refs of 0, so this could end up dropping the tree root as a
+        * snapshot, so we need the extra !root->fs_info->tree_root check to
+        * make sure we don't drop it.
+        */
+       if (empty && btrfs_root_refs(&root->root_item) == 0 &&
+           root != root->fs_info->tree_root) {
                synchronize_srcu(&root->fs_info->subvol_srcu);
                spin_lock(&root->inode_lock);
                empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -4282,14 +4292,24 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        int ret = 0;
+       bool nolock = false;
 
        if (BTRFS_I(inode)->dummy_inode)
                return 0;
 
+       smp_mb();
+       nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
+
        if (wbc->sync_mode == WB_SYNC_ALL) {
-               trans = btrfs_join_transaction(root, 1);
+               if (nolock)
+                       trans = btrfs_join_transaction_nolock(root, 1);
+               else
+                       trans = btrfs_join_transaction(root, 1);
                btrfs_set_trans_block_group(trans, inode);
-               ret = btrfs_commit_transaction(trans, root);
+               if (nolock)
+                       ret = btrfs_end_transaction_nolock(trans, root);
+               else
+                       ret = btrfs_commit_transaction(trans, root);
        }
        return ret;
 }
@@ -6308,6 +6328,21 @@ void btrfs_destroy_inode(struct inode *inode)
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
 
+       if (root == root->fs_info->tree_root) {
+               struct btrfs_block_group_cache *block_group;
+
+               block_group = btrfs_lookup_block_group(root->fs_info,
+                                               BTRFS_I(inode)->block_group);
+               if (block_group && block_group->inode == inode) {
+                       spin_lock(&block_group->lock);
+                       block_group->inode = NULL;
+                       spin_unlock(&block_group->lock);
+                       btrfs_put_block_group(block_group);
+               } else if (block_group) {
+                       btrfs_put_block_group(block_group);
+               }
+       }
+
        spin_lock(&root->orphan_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
@@ -6340,7 +6375,8 @@ int btrfs_drop_inode(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
 
-       if (btrfs_root_refs(&root->root_item) == 0)
+       if (btrfs_root_refs(&root->root_item) == 0 &&
+           root != root->fs_info->tree_root)
                return 1;
        else
                return generic_drop_inode(inode);
@@ -6757,27 +6793,33 @@ out_unlock:
        return err;
 }
 
-int btrfs_prealloc_file_range(struct inode *inode, int mode,
-                             u64 start, u64 num_bytes, u64 min_size,
-                             loff_t actual_len, u64 *alloc_hint)
+static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+                                      u64 start, u64 num_bytes, u64 min_size,
+                                      loff_t actual_len, u64 *alloc_hint,
+                                      struct btrfs_trans_handle *trans)
 {
-       struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
        int ret = 0;
+       bool own_trans = true;
 
+       if (trans)
+               own_trans = false;
        while (num_bytes > 0) {
-               trans = btrfs_start_transaction(root, 3);
-               if (IS_ERR(trans)) {
-                       ret = PTR_ERR(trans);
-                       break;
+               if (own_trans) {
+                       trans = btrfs_start_transaction(root, 3);
+                       if (IS_ERR(trans)) {
+                               ret = PTR_ERR(trans);
+                               break;
+                       }
                }
 
                ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
                                           0, *alloc_hint, (u64)-1, &ins, 1);
                if (ret) {
-                       btrfs_end_transaction(trans, root);
+                       if (own_trans)
+                               btrfs_end_transaction(trans, root);
                        break;
                }
 
@@ -6810,11 +6852,30 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode,
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
 
-               btrfs_end_transaction(trans, root);
+               if (own_trans)
+                       btrfs_end_transaction(trans, root);
        }
        return ret;
 }
 
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+                             u64 start, u64 num_bytes, u64 min_size,
+                             loff_t actual_len, u64 *alloc_hint)
+{
+       return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+                                          min_size, actual_len, alloc_hint,
+                                          NULL);
+}
+
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+                                   struct btrfs_trans_handle *trans, int mode,
+                                   u64 start, u64 num_bytes, u64 min_size,
+                                   loff_t actual_len, u64 *alloc_hint)
+{
+       return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+                                          min_size, actual_len, alloc_hint, trans);
+}
+
 static long btrfs_fallocate(struct inode *inode, int mode,
                            loff_t offset, loff_t len)
 {
index b37d723..af339ee 100644 (file)
@@ -29,6 +29,7 @@
 #include "locking.h"
 #include "btrfs_inode.h"
 #include "async-thread.h"
+#include "free-space-cache.h"
 
 /*
  * backref_node, mapping_node and tree_block start with this
@@ -3191,6 +3192,54 @@ static int block_use_full_backref(struct reloc_control *rc,
        return ret;
 }
 
+static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
+                                   struct inode *inode, u64 ino)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       struct btrfs_root *root = fs_info->tree_root;
+       struct btrfs_trans_handle *trans;
+       unsigned long nr;
+       int ret = 0;
+
+       if (inode)
+               goto truncate;
+
+       key.objectid = ino;
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = 0;
+
+       inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+       if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
+               if (inode && !IS_ERR(inode))
+                       iput(inode);
+               return -ENOENT;
+       }
+
+truncate:
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       trans = btrfs_join_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               goto out;
+       }
+
+       ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+
+       btrfs_free_path(path);
+       nr = trans->blocks_used;
+       btrfs_end_transaction(trans, root);
+       btrfs_btree_balance_dirty(root, nr);
+out:
+       iput(inode);
+       return ret;
+}
+
 /*
  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
  * this function scans fs tree to find blocks reference the data extent
@@ -3217,15 +3266,27 @@ static int find_data_references(struct reloc_control *rc,
        int counted;
        int ret;
 
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
        ref_root = btrfs_extent_data_ref_root(leaf, ref);
        ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
        ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
        ref_count = btrfs_extent_data_ref_count(leaf, ref);
 
+       /*
+        * This is an extent belonging to the free space cache, lets just delete
+        * it and redo the search.
+        */
+       if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
+               ret = delete_block_group_cache(rc->extent_root->fs_info,
+                                              NULL, ref_objectid);
+               if (ret != -ENOENT)
+                       return ret;
+               ret = 0;
+       }
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
        root = read_fs_root(rc->extent_root->fs_info, ref_root);
        if (IS_ERR(root)) {
                err = PTR_ERR(root);
@@ -3860,6 +3921,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 {
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
        struct reloc_control *rc;
+       struct inode *inode;
+       struct btrfs_path *path;
        int ret;
        int rw = 0;
        int err = 0;
@@ -3882,6 +3945,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
                rw = 1;
        }
 
+       path = btrfs_alloc_path();
+       if (!path) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
+                                       path);
+       btrfs_free_path(path);
+
+       if (!IS_ERR(inode))
+               ret = delete_block_group_cache(fs_info, inode, 0);
+       else
+               ret = PTR_ERR(inode);
+
+       if (ret && ret != -ENOENT) {
+               err = ret;
+               goto out;
+       }
+
        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
        if (IS_ERR(rc->data_inode)) {
                err = PTR_ERR(rc->data_inode);
index 1776dbd..5c23eb8 100644 (file)
@@ -68,7 +68,7 @@ enum {
        Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
        Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
-       Opt_discard, Opt_err,
+       Opt_discard, Opt_space_cache, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -92,6 +92,7 @@ static match_table_t tokens = {
        {Opt_flushoncommit, "flushoncommit"},
        {Opt_ratio, "metadata_ratio=%d"},
        {Opt_discard, "discard"},
+       {Opt_space_cache, "space_cache"},
        {Opt_err, NULL},
 };
 
@@ -235,6 +236,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_discard:
                        btrfs_set_opt(info->mount_opt, DISCARD);
                        break;
+               case Opt_space_cache:
+                       printk(KERN_INFO "btrfs: enabling disk space caching\n");
+                       btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+                       break;
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
index 66e4c66..e7144c4 100644 (file)
@@ -163,6 +163,7 @@ enum btrfs_trans_type {
        TRANS_START,
        TRANS_JOIN,
        TRANS_USERSPACE,
+       TRANS_JOIN_NOLOCK,
 };
 
 static int may_wait_transaction(struct btrfs_root *root, int type)
@@ -186,7 +187,8 @@ again:
        if (!h)
                return ERR_PTR(-ENOMEM);
 
-       mutex_lock(&root->fs_info->trans_mutex);
+       if (type != TRANS_JOIN_NOLOCK)
+               mutex_lock(&root->fs_info->trans_mutex);
        if (may_wait_transaction(root, type))
                wait_current_trans(root);
 
@@ -195,7 +197,8 @@ again:
 
        cur_trans = root->fs_info->running_transaction;
        cur_trans->use_count++;
-       mutex_unlock(&root->fs_info->trans_mutex);
+       if (type != TRANS_JOIN_NOLOCK)
+               mutex_unlock(&root->fs_info->trans_mutex);
 
        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
@@ -224,9 +227,11 @@ again:
                }
        }
 
-       mutex_lock(&root->fs_info->trans_mutex);
+       if (type != TRANS_JOIN_NOLOCK)
+               mutex_lock(&root->fs_info->trans_mutex);
        record_root_in_trans(h, root);
-       mutex_unlock(&root->fs_info->trans_mutex);
+       if (type != TRANS_JOIN_NOLOCK)
+               mutex_unlock(&root->fs_info->trans_mutex);
 
        if (!current->journal_info && type != TRANS_USERSPACE)
                current->journal_info = h;
@@ -244,6 +249,12 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
        return start_transaction(root, 0, TRANS_JOIN);
 }
 
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+                                                         int num_blocks)
+{
+       return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+}
+
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
                                                         int num_blocks)
 {
@@ -348,7 +359,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root, int throttle)
+                         struct btrfs_root *root, int throttle, int lock)
 {
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *info = root->fs_info;
@@ -376,18 +387,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_trans_release_metadata(trans, root);
 
-       if (!root->fs_info->open_ioctl_trans &&
+       if (lock && !root->fs_info->open_ioctl_trans &&
            should_end_transaction(trans, root))
                trans->transaction->blocked = 1;
 
-       if (cur_trans->blocked && !cur_trans->in_commit) {
+       if (lock && cur_trans->blocked && !cur_trans->in_commit) {
                if (throttle)
                        return btrfs_commit_transaction(trans, root);
                else
                        wake_up_process(info->transaction_kthread);
        }
 
-       mutex_lock(&info->trans_mutex);
+       if (lock)
+               mutex_lock(&info->trans_mutex);
        WARN_ON(cur_trans != info->running_transaction);
        WARN_ON(cur_trans->num_writers < 1);
        cur_trans->num_writers--;
@@ -395,7 +407,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
        put_transaction(cur_trans);
-       mutex_unlock(&info->trans_mutex);
+       if (lock)
+               mutex_unlock(&info->trans_mutex);
 
        if (current->journal_info == trans)
                current->journal_info = NULL;
@@ -411,13 +424,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
 {
-       return __btrfs_end_transaction(trans, root, 0);
+       return __btrfs_end_transaction(trans, root, 0, 1);
 }
 
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
-       return __btrfs_end_transaction(trans, root, 1);
+       return __btrfs_end_transaction(trans, root, 1, 1);
+}
+
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root)
+{
+       return __btrfs_end_transaction(trans, root, 0, 0);
 }
 
 /*
@@ -966,6 +985,8 @@ static void update_super_roots(struct btrfs_root *root)
        super->root = root_item->bytenr;
        super->generation = root_item->generation;
        super->root_level = root_item->level;
+       if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
+               super->cache_generation = root_item->generation;
 }
 
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
index e104986..15f83e1 100644 (file)
@@ -87,10 +87,14 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root);
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
                                                  int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+                                                         int num_blocks);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
                                                         int num_blocks);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,