f2fs: refactor flush_nat_entries to remove costly reorganizing ops
Jaegeuk Kim [Mon, 22 Sep 2014 18:40:48 +0000 (11:40 -0700)]
Previously, f2fs tries to reorganize the dirty nat entries into multiple sets
according to its nid ranges. This can improve the flushing nat pages, however,
if there are a lot of cached nat entries, it becomes a bottleneck.

This patch introduces a new set management flow by removing dirty nat list and
adding a series of set operations when the nat entry becomes dirty.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>

fs/f2fs/f2fs.h
fs/f2fs/node.c
fs/f2fs/node.h

index 64bd5fc..46c1d94 100644 (file)
@@ -167,6 +167,9 @@ struct fsync_inode_entry {
 #define sit_in_journal(sum, i)         (sum->sit_j.entries[i].se)
 #define segno_in_journal(sum, i)       (sum->sit_j.entries[i].segno)
 
+#define MAX_NAT_JENTRIES(sum)  (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
+#define MAX_SIT_JENTRIES(sum)  (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+
 static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
 {
        int before = nats_in_cursum(rs);
@@ -185,9 +188,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
                                                                int type)
 {
        if (type == NAT_JOURNAL)
-               return nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES;
-
-       return sits_in_cursum(sum) + size <= SIT_JOURNAL_ENTRIES;
+               return size <= MAX_NAT_JENTRIES(sum);
+       return size <= MAX_SIT_JENTRIES(sum);
 }
 
 /*
@@ -297,11 +299,10 @@ struct f2fs_nm_info {
 
        /* NAT cache management */
        struct radix_tree_root nat_root;/* root of the nat entry cache */
+       struct radix_tree_root nat_set_root;/* root of the nat set cache */
        rwlock_t nat_tree_lock;         /* protect nat_tree_lock */
-       unsigned int nat_cnt;           /* the # of cached nat entries */
        struct list_head nat_entries;   /* cached nat entry list (clean) */
-       struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
-       struct list_head nat_entry_set; /* nat entry set list */
+       unsigned int nat_cnt;           /* the # of cached nat entries */
        unsigned int dirty_nat_cnt;     /* total num of nat entries in set */
 
        /* free node ids management */
index f639474..19bad24 100644 (file)
@@ -123,6 +123,57 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
        kmem_cache_free(nat_entry_slab, e);
 }
 
+static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+                                               struct nat_entry *ne)
+{
+       nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
+       struct nat_entry_set *head;
+
+       if (get_nat_flag(ne, IS_DIRTY))
+               return;
+retry:
+       head = radix_tree_lookup(&nm_i->nat_set_root, set);
+       if (!head) {
+               head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+
+               INIT_LIST_HEAD(&head->entry_list);
+               INIT_LIST_HEAD(&head->set_list);
+               head->set = set;
+               head->entry_cnt = 0;
+
+               if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
+                       cond_resched();
+                       goto retry;
+               }
+       }
+       list_move_tail(&ne->list, &head->entry_list);
+       nm_i->dirty_nat_cnt++;
+       head->entry_cnt++;
+       set_nat_flag(ne, IS_DIRTY, true);
+}
+
+static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+                                               struct nat_entry *ne)
+{
+       nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
+       struct nat_entry_set *head;
+
+       head = radix_tree_lookup(&nm_i->nat_set_root, set);
+       if (head) {
+               list_move_tail(&ne->list, &nm_i->nat_entries);
+               set_nat_flag(ne, IS_DIRTY, false);
+               head->entry_cnt--;
+               nm_i->dirty_nat_cnt--;
+       }
+}
+
+static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
+               nid_t start, unsigned int nr, struct nat_entry_set **ep)
+{
+       return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
+                                                       start, nr);
+}
+
 bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1760,79 +1811,6 @@ skip:
        return err;
 }
 
-static struct nat_entry_set *grab_nat_entry_set(void)
-{
-       struct nat_entry_set *nes =
-                       f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
-
-       nes->entry_cnt = 0;
-       INIT_LIST_HEAD(&nes->set_list);
-       INIT_LIST_HEAD(&nes->entry_list);
-       return nes;
-}
-
-static void release_nat_entry_set(struct nat_entry_set *nes,
-                                               struct f2fs_nm_info *nm_i)
-{
-       nm_i->dirty_nat_cnt -= nes->entry_cnt;
-       list_del(&nes->set_list);
-       kmem_cache_free(nat_entry_set_slab, nes);
-}
-
-static void adjust_nat_entry_set(struct nat_entry_set *nes,
-                                               struct list_head *head)
-{
-       struct nat_entry_set *next = nes;
-
-       if (list_is_last(&nes->set_list, head))
-               return;
-
-       list_for_each_entry_continue(next, head, set_list)
-               if (nes->entry_cnt <= next->entry_cnt)
-                       break;
-
-       list_move_tail(&nes->set_list, &next->set_list);
-}
-
-static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
-{
-       struct nat_entry_set *nes;
-       nid_t start_nid = START_NID(ne->ni.nid);
-
-       list_for_each_entry(nes, head, set_list) {
-               if (nes->start_nid == start_nid) {
-                       list_move_tail(&ne->list, &nes->entry_list);
-                       nes->entry_cnt++;
-                       adjust_nat_entry_set(nes, head);
-                       return;
-               }
-       }
-
-       nes = grab_nat_entry_set();
-
-       nes->start_nid = start_nid;
-       list_move_tail(&ne->list, &nes->entry_list);
-       nes->entry_cnt++;
-       list_add(&nes->set_list, head);
-}
-
-static void merge_nats_in_set(struct f2fs_sb_info *sbi)
-{
-       struct f2fs_nm_info *nm_i = NM_I(sbi);
-       struct list_head *dirty_list = &nm_i->dirty_nat_entries;
-       struct list_head *set_list = &nm_i->nat_entry_set;
-       struct nat_entry *ne, *tmp;
-
-       write_lock(&nm_i->nat_tree_lock);
-       list_for_each_entry_safe(ne, tmp, dirty_list, list) {
-               if (nat_get_blkaddr(ne) == NEW_ADDR)
-                       continue;
-               add_nat_entry(ne, set_list);
-               nm_i->dirty_nat_cnt++;
-       }
-       write_unlock(&nm_i->nat_tree_lock);
-}
-
 static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1867,101 +1845,129 @@ found:
        mutex_unlock(&curseg->curseg_mutex);
 }
 
-/*
- * This function is called during the checkpointing process.
- */
-void flush_nat_entries(struct f2fs_sb_info *sbi)
+static void __adjust_nat_entry_set(struct nat_entry_set *nes,
+                                               struct list_head *head, int max)
 {
-       struct f2fs_nm_info *nm_i = NM_I(sbi);
-       struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
-       struct f2fs_summary_block *sum = curseg->sum_blk;
-       struct nat_entry_set *nes, *tmp;
-       struct list_head *head = &nm_i->nat_entry_set;
-       bool to_journal = true;
+       struct nat_entry_set *cur;
+       nid_t dirty_cnt = 0;
 
-       /* merge nat entries of dirty list to nat entry set temporarily */
-       merge_nats_in_set(sbi);
+       if (nes->entry_cnt >= max)
+               goto add_out;
 
-       /*
-        * if there are no enough space in journal to store dirty nat
-        * entries, remove all entries from journal and merge them
-        * into nat entry set.
-        */
-       if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) {
-               remove_nats_in_journal(sbi);
-
-               /*
-                * merge nat entries of dirty list to nat entry set temporarily
-                */
-               merge_nats_in_set(sbi);
+       list_for_each_entry(cur, head, set_list) {
+               dirty_cnt += cur->entry_cnt;
+               if (dirty_cnt > max)
+                       break;
+               if (cur->entry_cnt >= nes->entry_cnt) {
+                       list_add(&nes->set_list, cur->set_list.prev);
+                       return;
+               }
        }
+add_out:
+       list_add_tail(&nes->set_list, head);
+}
 
-       if (!nm_i->dirty_nat_cnt)
-               return;
+static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
+                                       struct nat_entry_set *set)
+{
+       struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+       struct f2fs_summary_block *sum = curseg->sum_blk;
+       nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
+       bool to_journal = true;
+       struct f2fs_nat_block *nat_blk;
+       struct nat_entry *ne, *cur;
+       struct page *page = NULL;
 
        /*
         * there are two steps to flush nat entries:
         * #1, flush nat entries to journal in current hot data summary block.
         * #2, flush nat entries to nat page.
         */
-       list_for_each_entry_safe(nes, tmp, head, set_list) {
-               struct f2fs_nat_block *nat_blk;
-               struct nat_entry *ne, *cur;
-               struct page *page;
-               nid_t start_nid = nes->start_nid;
+       if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
+               to_journal = false;
 
-               if (to_journal &&
-                       !__has_cursum_space(sum, nes->entry_cnt, NAT_JOURNAL))
-                       to_journal = false;
+       if (to_journal) {
+               mutex_lock(&curseg->curseg_mutex);
+       } else {
+               page = get_next_nat_page(sbi, start_nid);
+               nat_blk = page_address(page);
+               f2fs_bug_on(sbi, !nat_blk);
+       }
+
+       /* flush dirty nats in nat entry set */
+       list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
+               struct f2fs_nat_entry *raw_ne;
+               nid_t nid = nat_get_nid(ne);
+               int offset;
 
                if (to_journal) {
-                       mutex_lock(&curseg->curseg_mutex);
+                       offset = lookup_journal_in_cursum(sum,
+                                                       NAT_JOURNAL, nid, 1);
+                       f2fs_bug_on(sbi, offset < 0);
+                       raw_ne = &nat_in_journal(sum, offset);
+                       nid_in_journal(sum, offset) = cpu_to_le32(nid);
                } else {
-                       page = get_next_nat_page(sbi, start_nid);
-                       nat_blk = page_address(page);
-                       f2fs_bug_on(sbi, !nat_blk);
+                       raw_ne = &nat_blk->entries[nid - start_nid];
                }
+               raw_nat_from_node_info(raw_ne, &ne->ni);
 
-               /* flush dirty nats in nat entry set */
-               list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
-                       struct f2fs_nat_entry *raw_ne;
-                       nid_t nid = nat_get_nid(ne);
-                       int offset;
+               write_lock(&NM_I(sbi)->nat_tree_lock);
+               nat_reset_flag(ne);
+               __clear_nat_cache_dirty(NM_I(sbi), ne);
+               write_unlock(&NM_I(sbi)->nat_tree_lock);
 
-                       if (to_journal) {
-                               offset = lookup_journal_in_cursum(sum,
-                                                       NAT_JOURNAL, nid, 1);
-                               f2fs_bug_on(sbi, offset < 0);
-                               raw_ne = &nat_in_journal(sum, offset);
-                               nid_in_journal(sum, offset) = cpu_to_le32(nid);
-                       } else {
-                               raw_ne = &nat_blk->entries[nid - start_nid];
-                       }
-                       raw_nat_from_node_info(raw_ne, &ne->ni);
+               if (nat_get_blkaddr(ne) == NULL_ADDR)
+                       add_free_nid(sbi, nid, false);
+       }
 
-                       if (nat_get_blkaddr(ne) == NULL_ADDR &&
-                               add_free_nid(sbi, nid, false) <= 0) {
-                               write_lock(&nm_i->nat_tree_lock);
-                               __del_from_nat_cache(nm_i, ne);
-                               write_unlock(&nm_i->nat_tree_lock);
-                       } else {
-                               write_lock(&nm_i->nat_tree_lock);
-                               nat_reset_flag(ne);
-                               __clear_nat_cache_dirty(nm_i, ne);
-                               write_unlock(&nm_i->nat_tree_lock);
-                       }
-               }
+       if (to_journal)
+               mutex_unlock(&curseg->curseg_mutex);
+       else
+               f2fs_put_page(page, 1);
 
-               if (to_journal)
-                       mutex_unlock(&curseg->curseg_mutex);
-               else
-                       f2fs_put_page(page, 1);
+       f2fs_bug_on(sbi, set->entry_cnt);
+       radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+       kmem_cache_free(nat_entry_set_slab, set);
+}
 
-               f2fs_bug_on(sbi, !list_empty(&nes->entry_list));
-               release_nat_entry_set(nes, nm_i);
+/*
+ * This function is called during the checkpointing process.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+       struct f2fs_nm_info *nm_i = NM_I(sbi);
+       struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+       struct f2fs_summary_block *sum = curseg->sum_blk;
+       struct nat_entry_set *setvec[NATVEC_SIZE];
+       struct nat_entry_set *set, *tmp;
+       unsigned int found;
+       nid_t set_idx = 0;
+       LIST_HEAD(sets);
+
+       /*
+        * if there are no enough space in journal to store dirty nat
+        * entries, remove all entries from journal and merge them
+        * into nat entry set.
+        */
+       if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+               remove_nats_in_journal(sbi);
+
+       if (!nm_i->dirty_nat_cnt)
+               return;
+
+       while ((found = __gang_lookup_nat_set(nm_i,
+                                       set_idx, NATVEC_SIZE, setvec))) {
+               unsigned idx;
+               set_idx = setvec[found - 1]->set + 1;
+               for (idx = 0; idx < found; idx++)
+                       __adjust_nat_entry_set(setvec[idx], &sets,
+                                                       MAX_NAT_JENTRIES(sum));
        }
 
-       f2fs_bug_on(sbi, !list_empty(head));
+       /* flush dirty nats in nat entry set */
+       list_for_each_entry_safe(set, tmp, &sets, set_list)
+               __flush_nat_entry_set(sbi, set);
+
        f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
 }
 
@@ -1989,9 +1995,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
        INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
        INIT_LIST_HEAD(&nm_i->free_nid_list);
        INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
+       INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
        INIT_LIST_HEAD(&nm_i->nat_entries);
-       INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
-       INIT_LIST_HEAD(&nm_i->nat_entry_set);
 
        mutex_init(&nm_i->build_lock);
        spin_lock_init(&nm_i->free_nid_list_lock);
index b8ba63c..bd826d9 100644 (file)
@@ -43,6 +43,7 @@ enum {
        IS_CHECKPOINTED,        /* is it checkpointed before? */
        HAS_FSYNCED_INODE,      /* is the inode fsynced before? */
        HAS_LAST_FSYNC,         /* has the latest node fsync mark? */
+       IS_DIRTY,               /* this nat entry is dirty? */
 };
 
 struct nat_entry {
@@ -60,10 +61,6 @@ struct nat_entry {
 #define nat_get_version(nat)           (nat->ni.version)
 #define nat_set_version(nat, v)                (nat->ni.version = v)
 
-#define __set_nat_cache_dirty(nm_i, ne)                                        \
-               list_move_tail(&ne->list, &nm_i->dirty_nat_entries);
-#define __clear_nat_cache_dirty(nm_i, ne)                              \
-               list_move_tail(&ne->list, &nm_i->nat_entries);
 #define inc_node_version(version)      (++version)
 
 static inline void set_nat_flag(struct nat_entry *ne,
@@ -113,9 +110,9 @@ enum mem_type {
 };
 
 struct nat_entry_set {
-       struct list_head set_list;      /* link with all nat sets */
+       struct list_head set_list;      /* link with other nat sets */
        struct list_head entry_list;    /* link with dirty nat entries */
-       nid_t start_nid;                /* start nid of nats in set */
+       nid_t set;                      /* set number*/
        unsigned int entry_cnt;         /* the # of nat entries in set */
 };