Commit 309cc2b6 authored by Jaegeuk Kim's avatar Jaegeuk Kim

f2fs: refactor flush_nat_entries to remove costly reorganizing ops

Previously, f2fs tries to reorganize the dirty nat entries into multiple sets
according to its nid ranges. This can improve the flushing nat pages, however,
if there are a lot of cached nat entries, it becomes a bottleneck.

This patch introduces a new set management flow by removing dirty nat list and
adding a series of set operations when the nat entry becomes dirty.
Signed-off-by: default avatarJaegeuk Kim <jaegeuk@kernel.org>
parent 4b2fecc8
...@@ -164,6 +164,9 @@ struct fsync_inode_entry { ...@@ -164,6 +164,9 @@ struct fsync_inode_entry {
#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) #define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) #define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
{ {
int before = nats_in_cursum(rs); int before = nats_in_cursum(rs);
...@@ -182,9 +185,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, ...@@ -182,9 +185,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
int type) int type)
{ {
if (type == NAT_JOURNAL) if (type == NAT_JOURNAL)
return nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES; return size <= MAX_NAT_JENTRIES(sum);
return size <= MAX_SIT_JENTRIES(sum);
return sits_in_cursum(sum) + size <= SIT_JOURNAL_ENTRIES;
} }
/* /*
...@@ -292,11 +294,10 @@ struct f2fs_nm_info { ...@@ -292,11 +294,10 @@ struct f2fs_nm_info {
/* NAT cache management */ /* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
rwlock_t nat_tree_lock; /* protect nat_tree_lock */ rwlock_t nat_tree_lock; /* protect nat_tree_lock */
unsigned int nat_cnt; /* the # of cached nat entries */
struct list_head nat_entries; /* cached nat entry list (clean) */ struct list_head nat_entries; /* cached nat entry list (clean) */
struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ unsigned int nat_cnt; /* the # of cached nat entries */
struct list_head nat_entry_set; /* nat entry set list */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */
/* free node ids management */ /* free node ids management */
......
...@@ -123,6 +123,57 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) ...@@ -123,6 +123,57 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
kmem_cache_free(nat_entry_slab, e); kmem_cache_free(nat_entry_slab, e);
} }
static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
struct nat_entry *ne)
{
nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
struct nat_entry_set *head;
if (get_nat_flag(ne, IS_DIRTY))
return;
retry:
head = radix_tree_lookup(&nm_i->nat_set_root, set);
if (!head) {
head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
INIT_LIST_HEAD(&head->entry_list);
INIT_LIST_HEAD(&head->set_list);
head->set = set;
head->entry_cnt = 0;
if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
cond_resched();
goto retry;
}
}
list_move_tail(&ne->list, &head->entry_list);
nm_i->dirty_nat_cnt++;
head->entry_cnt++;
set_nat_flag(ne, IS_DIRTY, true);
}
static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
struct nat_entry *ne)
{
nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
struct nat_entry_set *head;
head = radix_tree_lookup(&nm_i->nat_set_root, set);
if (head) {
list_move_tail(&ne->list, &nm_i->nat_entries);
set_nat_flag(ne, IS_DIRTY, false);
head->entry_cnt--;
nm_i->dirty_nat_cnt--;
}
}
static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
nid_t start, unsigned int nr, struct nat_entry_set **ep)
{
return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
start, nr);
}
bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
{ {
struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi);
...@@ -1739,79 +1790,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, ...@@ -1739,79 +1790,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
return err; return err;
} }
static struct nat_entry_set *grab_nat_entry_set(void)
{
struct nat_entry_set *nes =
f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
nes->entry_cnt = 0;
INIT_LIST_HEAD(&nes->set_list);
INIT_LIST_HEAD(&nes->entry_list);
return nes;
}
static void release_nat_entry_set(struct nat_entry_set *nes,
struct f2fs_nm_info *nm_i)
{
nm_i->dirty_nat_cnt -= nes->entry_cnt;
list_del(&nes->set_list);
kmem_cache_free(nat_entry_set_slab, nes);
}
static void adjust_nat_entry_set(struct nat_entry_set *nes,
struct list_head *head)
{
struct nat_entry_set *next = nes;
if (list_is_last(&nes->set_list, head))
return;
list_for_each_entry_continue(next, head, set_list)
if (nes->entry_cnt <= next->entry_cnt)
break;
list_move_tail(&nes->set_list, &next->set_list);
}
static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
{
struct nat_entry_set *nes;
nid_t start_nid = START_NID(ne->ni.nid);
list_for_each_entry(nes, head, set_list) {
if (nes->start_nid == start_nid) {
list_move_tail(&ne->list, &nes->entry_list);
nes->entry_cnt++;
adjust_nat_entry_set(nes, head);
return;
}
}
nes = grab_nat_entry_set();
nes->start_nid = start_nid;
list_move_tail(&ne->list, &nes->entry_list);
nes->entry_cnt++;
list_add(&nes->set_list, head);
}
static void merge_nats_in_set(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct list_head *dirty_list = &nm_i->dirty_nat_entries;
struct list_head *set_list = &nm_i->nat_entry_set;
struct nat_entry *ne, *tmp;
write_lock(&nm_i->nat_tree_lock);
list_for_each_entry_safe(ne, tmp, dirty_list, list) {
if (nat_get_blkaddr(ne) == NEW_ADDR)
continue;
add_nat_entry(ne, set_list);
nm_i->dirty_nat_cnt++;
}
write_unlock(&nm_i->nat_tree_lock);
}
static void remove_nats_in_journal(struct f2fs_sb_info *sbi) static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{ {
struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi);
...@@ -1846,101 +1824,129 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ...@@ -1846,101 +1824,129 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
mutex_unlock(&curseg->curseg_mutex); mutex_unlock(&curseg->curseg_mutex);
} }
/* static void __adjust_nat_entry_set(struct nat_entry_set *nes,
* This function is called during the checkpointing process. struct list_head *head, int max)
*/
void flush_nat_entries(struct f2fs_sb_info *sbi)
{ {
struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry_set *cur;
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
struct nat_entry_set *nes, *tmp;
struct list_head *head = &nm_i->nat_entry_set;
bool to_journal = true;
/* merge nat entries of dirty list to nat entry set temporarily */ if (nes->entry_cnt >= max)
merge_nats_in_set(sbi); goto add_out;
/* list_for_each_entry(cur, head, set_list) {
* if there are no enough space in journal to store dirty nat if (cur->entry_cnt >= nes->entry_cnt) {
* entries, remove all entries from journal and merge them list_add(&nes->set_list, cur->set_list.prev);
* into nat entry set. return;
*/ }
if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) {
remove_nats_in_journal(sbi);
/*
* merge nat entries of dirty list to nat entry set temporarily
*/
merge_nats_in_set(sbi);
} }
add_out:
list_add_tail(&nes->set_list, head);
}
if (!nm_i->dirty_nat_cnt) static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
return; struct nat_entry_set *set)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
bool to_journal = true;
struct f2fs_nat_block *nat_blk;
struct nat_entry *ne, *cur;
struct page *page = NULL;
/* /*
* there are two steps to flush nat entries: * there are two steps to flush nat entries:
* #1, flush nat entries to journal in current hot data summary block. * #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page. * #2, flush nat entries to nat page.
*/ */
list_for_each_entry_safe(nes, tmp, head, set_list) { if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
struct f2fs_nat_block *nat_blk; to_journal = false;
struct nat_entry *ne, *cur;
struct page *page; if (to_journal) {
nid_t start_nid = nes->start_nid; mutex_lock(&curseg->curseg_mutex);
} else {
page = get_next_nat_page(sbi, start_nid);
nat_blk = page_address(page);
f2fs_bug_on(sbi, !nat_blk);
}
if (to_journal && /* flush dirty nats in nat entry set */
!__has_cursum_space(sum, nes->entry_cnt, NAT_JOURNAL)) list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
to_journal = false; struct f2fs_nat_entry *raw_ne;
nid_t nid = nat_get_nid(ne);
int offset;
if (nat_get_blkaddr(ne) == NEW_ADDR)
continue;
if (to_journal) { if (to_journal) {
mutex_lock(&curseg->curseg_mutex); offset = lookup_journal_in_cursum(sum,
NAT_JOURNAL, nid, 1);
f2fs_bug_on(sbi, offset < 0);
raw_ne = &nat_in_journal(sum, offset);
nid_in_journal(sum, offset) = cpu_to_le32(nid);
} else { } else {
page = get_next_nat_page(sbi, start_nid); raw_ne = &nat_blk->entries[nid - start_nid];
nat_blk = page_address(page);
f2fs_bug_on(sbi, !nat_blk);
} }
raw_nat_from_node_info(raw_ne, &ne->ni);
/* flush dirty nats in nat entry set */ write_lock(&NM_I(sbi)->nat_tree_lock);
list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { nat_reset_flag(ne);
struct f2fs_nat_entry *raw_ne; __clear_nat_cache_dirty(NM_I(sbi), ne);
nid_t nid = nat_get_nid(ne); write_unlock(&NM_I(sbi)->nat_tree_lock);
int offset;
if (to_journal) { if (nat_get_blkaddr(ne) == NULL_ADDR)
offset = lookup_journal_in_cursum(sum, add_free_nid(sbi, nid, false);
NAT_JOURNAL, nid, 1); }
f2fs_bug_on(sbi, offset < 0);
raw_ne = &nat_in_journal(sum, offset);
nid_in_journal(sum, offset) = cpu_to_le32(nid);
} else {
raw_ne = &nat_blk->entries[nid - start_nid];
}
raw_nat_from_node_info(raw_ne, &ne->ni);
if (nat_get_blkaddr(ne) == NULL_ADDR && if (to_journal)
add_free_nid(sbi, nid, false) <= 0) { mutex_unlock(&curseg->curseg_mutex);
write_lock(&nm_i->nat_tree_lock); else
__del_from_nat_cache(nm_i, ne); f2fs_put_page(page, 1);
write_unlock(&nm_i->nat_tree_lock);
} else {
write_lock(&nm_i->nat_tree_lock);
nat_reset_flag(ne);
__clear_nat_cache_dirty(nm_i, ne);
write_unlock(&nm_i->nat_tree_lock);
}
}
if (to_journal) if (!set->entry_cnt) {
mutex_unlock(&curseg->curseg_mutex); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
else kmem_cache_free(nat_entry_set_slab, set);
f2fs_put_page(page, 1); }
}
/*
* This function is called during the checkpointing process.
*/
void flush_nat_entries(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
struct nat_entry_set *setvec[NATVEC_SIZE];
struct nat_entry_set *set, *tmp;
unsigned int found;
nid_t set_idx = 0;
LIST_HEAD(sets);
f2fs_bug_on(sbi, !list_empty(&nes->entry_list)); /*
release_nat_entry_set(nes, nm_i); * if there are no enough space in journal to store dirty nat
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
remove_nats_in_journal(sbi);
if (!nm_i->dirty_nat_cnt)
return;
while ((found = __gang_lookup_nat_set(nm_i,
set_idx, NATVEC_SIZE, setvec))) {
unsigned idx;
set_idx = setvec[found - 1]->set + 1;
for (idx = 0; idx < found; idx++)
__adjust_nat_entry_set(setvec[idx], &sets,
MAX_NAT_JENTRIES(sum));
} }
f2fs_bug_on(sbi, !list_empty(head)); /* flush dirty nats in nat entry set */
list_for_each_entry_safe(set, tmp, &sets, set_list)
__flush_nat_entry_set(sbi, set);
f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
} }
...@@ -1968,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) ...@@ -1968,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_LIST_HEAD(&nm_i->free_nid_list);
INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->nat_entries); INIT_LIST_HEAD(&nm_i->nat_entries);
INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
INIT_LIST_HEAD(&nm_i->nat_entry_set);
mutex_init(&nm_i->build_lock); mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock); spin_lock_init(&nm_i->free_nid_list_lock);
......
...@@ -43,6 +43,7 @@ enum { ...@@ -43,6 +43,7 @@ enum {
IS_CHECKPOINTED, /* is it checkpointed before? */ IS_CHECKPOINTED, /* is it checkpointed before? */
HAS_FSYNCED_INODE, /* is the inode fsynced before? */ HAS_FSYNCED_INODE, /* is the inode fsynced before? */
HAS_LAST_FSYNC, /* has the latest node fsync mark? */ HAS_LAST_FSYNC, /* has the latest node fsync mark? */
IS_DIRTY, /* this nat entry is dirty? */
}; };
struct nat_entry { struct nat_entry {
...@@ -60,10 +61,6 @@ struct nat_entry { ...@@ -60,10 +61,6 @@ struct nat_entry {
#define nat_get_version(nat) (nat->ni.version) #define nat_get_version(nat) (nat->ni.version)
#define nat_set_version(nat, v) (nat->ni.version = v) #define nat_set_version(nat, v) (nat->ni.version = v)
#define __set_nat_cache_dirty(nm_i, ne) \
list_move_tail(&ne->list, &nm_i->dirty_nat_entries);
#define __clear_nat_cache_dirty(nm_i, ne) \
list_move_tail(&ne->list, &nm_i->nat_entries);
#define inc_node_version(version) (++version) #define inc_node_version(version) (++version)
static inline void set_nat_flag(struct nat_entry *ne, static inline void set_nat_flag(struct nat_entry *ne,
...@@ -113,9 +110,9 @@ enum mem_type { ...@@ -113,9 +110,9 @@ enum mem_type {
}; };
struct nat_entry_set { struct nat_entry_set {
struct list_head set_list; /* link with all nat sets */ struct list_head set_list; /* link with other nat sets */
struct list_head entry_list; /* link with dirty nat entries */ struct list_head entry_list; /* link with dirty nat entries */
nid_t start_nid; /* start nid of nats in set */ nid_t set; /* set number*/
unsigned int entry_cnt; /* the # of nat entries in set */ unsigned int entry_cnt; /* the # of nat entries in set */
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment