Commit b5e8a699 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Improved inode create optimization

This shards new inodes into different btree nodes by using the processor
ID for the high bits of the new inode number. It is much faster than the
previous inode create optimization, and it also helps with sharding in
the other btrees that index by inode number.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent df082b3a
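
The sharding arithmetic that the new bch2_inode_create() (in the diff below) performs can be illustrated with a small standalone sketch. This is plain userspace C, not the kernel code; the BLOCKDEV_INODE_MAX value is only a placeholder:

/*
 * Standalone sketch of the per-CPU inode number range used below: the CPU id
 * becomes the high bits of the inode number, and the remaining low bits are
 * that shard's search space.  Illustrative only.
 */
#include <stdint.h>

#define BLOCKDEV_INODE_MAX	4096ULL		/* placeholder, not the real value */

struct shard_range {
	uint64_t min;	/* first inode number owned by this CPU */
	uint64_t max;	/* last inode number owned by this CPU */
};

static struct shard_range inode_shard(unsigned cpu, unsigned inode_shard_bits,
				      int inodes_32bit)
{
	/* low bits left over after reserving the top bits for the shard id: */
	unsigned bits = (inodes_32bit ? 31 : 63) - inode_shard_bits;
	struct shard_range r = {
		.min = (uint64_t) cpu << bits,
		.max = ((uint64_t) cpu << bits) | ~(~0ULL << bits),
	};

	/* shard 0 must skip the reserved block-device inode numbers: */
	if (r.min < BLOCKDEV_INODE_MAX)
		r.min = BLOCKDEV_INODE_MAX;
	return r;
}

Because the CPU id occupies the top bits, inodes created concurrently on different CPUs land in disjoint key ranges of the inodes btree, so the old global inode_create_lock and shared free-inode cache are no longer needed.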
@@ -814,11 +814,8 @@ mempool_t bio_bounce_pages;
 	struct mutex		verify_lock;
 #endif
 
-	struct mutex		inode_create_lock;
-	unsigned		unused_inodes_nr;
-	u64			unused_inodes[64];
-	u32			unused_inodes_gens[64];
-	u64			unused_inode_hint;
+	u64			*unused_inode_hints;
+	unsigned		inode_shard_bits;
 
 	/*
 	 * A btree node on disk could have too many bsets for an iterator to fit

@@ -361,55 +361,6 @@ static inline u32 bkey_generation(struct bkey_s_c k)
 	}
 }
 
-static int scan_free_inums(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter *iter = NULL;
-	struct bkey_s_c k;
-	u64 min = BLOCKDEV_INODE_MAX;
-	u64 max = c->opts.inodes_32bit
-		? S32_MAX : S64_MAX;
-	u64 start = max(min, READ_ONCE(c->unused_inode_hint));
-	int ret = 0;
-
-	iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, start),
-				   BTREE_ITER_SLOTS);
-	if (IS_ERR(iter))
-		return PTR_ERR(iter);
-again:
-	for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
-		if (bkey_cmp(iter->pos, POS(0, max)) > 0)
-			break;
-
-		/*
-		 * This doesn't check the btree key cache, but we don't care:
-		 * we have to recheck with an intent lock held on the slot we're
-		 * inserting to anyways:
-		 */
-		if (k.k->type != KEY_TYPE_inode) {
-			if (c->unused_inodes_nr < ARRAY_SIZE(c->unused_inodes)) {
-				c->unused_inodes[c->unused_inodes_nr] = k.k->p.offset;
-				c->unused_inodes_gens[c->unused_inodes_nr] = bkey_generation(k);
-				c->unused_inodes_nr++;
-			}
-
-			if (c->unused_inodes_nr == ARRAY_SIZE(c->unused_inodes))
-				goto out;
-		}
-	}
-
-	if (!ret && start != min) {
-		max = start;
-		start = min;
-		bch2_btree_iter_set_pos(iter, POS(0, start));
-		goto again;
-	}
-out:
-	c->unused_inode_hint = iter->pos.offset;
-	bch2_trans_iter_put(trans, iter);
-	return ret;
-}
-
 int bch2_inode_create(struct btree_trans *trans,
 		      struct bch_inode_unpacked *inode_u)
 {

@@ -417,64 +368,68 @@ int bch2_inode_create(struct btree_trans *trans,
 	struct bkey_inode_buf *inode_p;
 	struct btree_iter *iter = NULL;
 	struct bkey_s_c k;
-	u64 inum;
-	u32 generation;
-	int ret = 0;
+	u64 min, max, start, *hint;
+	int ret;
+	unsigned cpu = raw_smp_processor_id();
+	unsigned bits = (c->opts.inodes_32bit
+		? 31 : 63) - c->inode_shard_bits;
+
+	min = (cpu << bits);
+	max = (cpu << bits) | ~(ULLONG_MAX << bits);
+
+	min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+	hint = c->unused_inode_hints + cpu;
+
+	start = READ_ONCE(*hint);
+
+	if (start >= max || start < min)
+		start = min;
 
 	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
 	if (IS_ERR(inode_p))
 		return PTR_ERR(inode_p);
+again:
+	for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start),
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+		if (bkey_cmp(iter->pos, POS(0, max)) > 0)
+			break;
 
-	iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS_MIN,
-				   BTREE_ITER_CACHED|
-				   BTREE_ITER_INTENT);
-	if (IS_ERR(iter))
-		return PTR_ERR(iter);
-retry:
-	if (!mutex_trylock(&c->inode_create_lock)) {
-		bch2_trans_unlock(trans);
-		mutex_lock(&c->inode_create_lock);
-		if (!bch2_trans_relock(trans)) {
-			mutex_unlock(&c->inode_create_lock);
-			ret = -EINTR;
-			goto err;
-		}
-	}
-
-	if (!c->unused_inodes_nr)
-		ret = scan_free_inums(trans);
-
-	if (!ret && !c->unused_inodes_nr)
-		ret = -ENOSPC;
-
-	if (!ret) {
-		--c->unused_inodes_nr;
-		inum		= c->unused_inodes[c->unused_inodes_nr];
-		generation	= c->unused_inodes_gens[c->unused_inodes_nr];
+		/*
+		 * There's a potential cache coherency issue with the btree key
+		 * cache code here - we're iterating over the btree, skipping
+		 * that cache. We should never see an empty slot that isn't
+		 * actually empty due to a pending update in the key cache
+		 * because the update that creates the inode isn't done with a
+		 * cached iterator, but - better safe than sorry, check the
+		 * cache before using a slot:
+		 */
+		if (k.k->type != KEY_TYPE_inode &&
+		    !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos))
+			goto found_slot;
 	}
 
-	mutex_unlock(&c->inode_create_lock);
+	bch2_trans_iter_put(trans, iter);
 
 	if (ret)
-		goto err;
-
-	bch2_btree_iter_set_pos(iter, POS(0, inum));
-
-	/* Recheck that the slot is free with an intent lock held: */
-	k = bch2_btree_iter_peek_cached(iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
+		return ret;
 
-	if (k.k->type == KEY_TYPE_inode)
-		goto retry;
+	if (start != min) {
+		/* Retry from start */
+		start = min;
+		goto again;
+	}
 
-	inode_u->bi_inum	= inum;
-	inode_u->bi_generation	= generation;
+	return -ENOSPC;
+found_slot:
+	*hint			= k.k->p.offset;
+	inode_u->bi_inum	= k.k->p.offset;
+	inode_u->bi_generation	= bkey_generation(k);
 
 	bch2_inode_pack(inode_p, inode_u);
 	bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
-err:
 	bch2_trans_iter_put(trans, iter);
-	return ret;
+	return 0;
 }
 
 int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
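
Stripped of the btree iterator and key cache details, the allocation loop above is a hint-based scan with a single wraparound. A minimal model of just that control flow, using a hypothetical is_free() check in place of the btree and key cache lookups:

/*
 * Minimal model of the hint-based scan above: search from the per-shard hint
 * to the end of the shard, and if nothing was found, wrap around once and
 * rescan from the start of the shard before giving up with -ENOSPC.
 * is_free() is a hypothetical stand-in for the btree + key cache checks.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

static bool is_free(uint64_t inum)
{
	return inum % 3 != 0;	/* arbitrary toy occupancy pattern */
}

static int64_t alloc_from_shard(uint64_t min, uint64_t max, uint64_t *hint)
{
	uint64_t start = (*hint >= min && *hint < max) ? *hint : min;
	uint64_t i = start;
again:
	for (; i <= max; i++)
		if (is_free(i)) {
			*hint = i;	/* next allocation resumes here */
			return (int64_t) i;
		}

	if (start != min) {
		/* nothing free past the hint: retry once from the start */
		start = min;
		i = min;
		goto again;
	}

	return -ENOSPC;
}

int main(void)
{
	uint64_t hint = 0;
	/* e.g. a shard covering inode numbers 4096..8191: */
	return alloc_from_shard(4096, 8191, &hint) < 0;
}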

@@ -493,6 +493,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 	kfree(c->replicas_gc.entries);
 	kfree(rcu_dereference_protected(c->disk_groups, 1));
 	kfree(c->journal_seq_blacklist_table);
+	kfree(c->unused_inode_hints);
 	free_heap(&c->copygc_heap);
 
 	if (c->journal_reclaim_wq)

@@ -703,8 +704,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	seqcount_init(&c->usage_lock);
 
-	mutex_init(&c->inode_create_lock);
-
 	c->copy_gc_enabled		= 1;
 	c->rebalance.enabled		= 1;
 	c->promote_whole_extents	= true;

@@ -746,6 +745,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 		(btree_blocks(c) + 1) * 2 *
 		sizeof(struct sort_iter_set);
 
+	c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
 	if (!(c->wq = alloc_workqueue("bcachefs",
 				WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
 	    !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",

@@ -764,6 +765,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 					btree_bytes(c)) ||
 	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+	    !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+					      sizeof(u64), GFP_KERNEL)) ||
 	    bch2_io_clock_init(&c->io_clock[READ]) ||
 	    bch2_io_clock_init(&c->io_clock[WRITE]) ||
 	    bch2_fs_journal_init(&c->journal) ||
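
As a sanity check on the shard sizing in bch2_fs_alloc() above: rounding the possible-CPU count up to a power of two guarantees every CPU id fits in inode_shard_bits high bits, and the hint array gets one u64 slot per shard. A plain-C stand-in for roundup_pow_of_two()/ilog2(), with an assumed CPU count:

/*
 * Shard sizing from bch2_fs_alloc(), done by hand: round the possible-CPU
 * count up to a power of two, take log2 of it for inode_shard_bits, and
 * allocate one u64 hint per shard.  The CPU count is an assumed example.
 */
#include <stdio.h>

int main(void)
{
	unsigned num_possible_cpus = 6;		/* assumed example value */
	unsigned shards = 1, inode_shard_bits = 0;

	/* roundup_pow_of_two() followed by ilog2(): */
	while (shards < num_possible_cpus) {
		shards <<= 1;
		inode_shard_bits++;
	}

	/* 6 CPUs -> 8 shards -> 3 shard bits -> 8 hint slots (kcalloc'd) */
	printf("shards=%u inode_shard_bits=%u hint slots=%u\n",
	       shards, inode_shard_bits, 1U << inode_shard_bits);
	return 0;
}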