Commit 7a51608d authored by Kent Overstreet

bcachefs: Rework btree node pinning

In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers

Checking references generates random lookups, so we want to pin that
btree in memory (or only a range of it, if it doesn't fit in RAM).

Previously, this was done with a simple check in the shrinker - "if the
btree node is in the range being pinned, don't free it" - but this
generated OOMs, as our shrinker wasn't well behaved when there was less
memory available than expected.

Instead, we now have two different shrinkers and LRU lists; the second
shrinker is for pinned nodes, with seeks set much higher than normal -
so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 91ddd715
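
A note on the mechanism: each of the two live lists introduced below gets its
own shrinker, and the pinned list's shrinker is registered with a much higher
"seeks" value, so the VM applies far less pressure to it. The registration
itself is in the collapsed diff further down; the sketch below only
illustrates that idea - the function name, the seeks value of 8, and the
error handling are assumptions. shrinker_alloc()/shrinker_register() are the
current kernel shrinker API, and bch2_btree_cache_count()/_scan() are the
existing callbacks, assumed here to take their list via ->private_data.

/* Illustrative sketch only: one shrinker per btree_cache_list. */
static int bch2_fs_btree_cache_init_shrinkers_sketch(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		struct shrinker *shrink =
			shrinker_alloc(0, "%s-btree_cache[%u]", c->name, i);
		if (!shrink)
			return -ENOMEM;	/* real code would use a bch2 errcode */

		bc->live[i].idx = i;
		bc->live[i].shrink = shrink;
		shrink->count_objects = bch2_btree_cache_count;
		shrink->scan_objects = bch2_btree_cache_scan;
		/* assumed value: the pinned list gets much higher seeks */
		shrink->seeks = i ? 8 : DEFAULT_SEEKS;
		shrink->private_data = &bc->live[i];
		shrinker_register(shrink);
	}
	return 0;
}
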
@@ -752,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
 	s64 mem_may_pin = mem_may_pin_bytes(c);
 	int ret = 0;
 
+	bch2_btree_cache_unpin(c);
+
 	btree_interior_mask |= btree_leaf_mask;
 
-	c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
-	c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
+	c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
+	c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
 	c->btree_cache.pinned_nodes_start = start;
 	c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
 
@@ -777,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
 					BBPOS(btree, b->key.k.p);
 				break;
 			}
+			bch2_node_pin(c, b);
 			0;
 		}));
 	}
@@ -936,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
 	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&s.last_flushed, c);
 
-	c->btree_cache.pinned_nodes_leaf_mask = 0;
-	c->btree_cache.pinned_nodes_interior_mask = 0;
+	bch2_btree_cache_unpin(c);
 
 	bch_err_fn(c, ret);
 	return ret;
@@ -1053,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
 	}
 	bch2_trans_put(trans);
 
-	c->btree_cache.pinned_nodes_leaf_mask = 0;
-	c->btree_cache.pinned_nodes_interior_mask = 0;
+	bch2_btree_cache_unpin(c);
 
 	bch_err_fn(c, ret);
 	return ret;
...
This diff is collapsed.
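
The collapsed file above contains the bulk of this commit (the pin helpers
declared in the header hunk below, plus the second shrinker). A minimal
sketch of what those helpers plausibly do, assuming nodes simply move between
the two live lists using the per-node "pinned" flag added in btree_types.h
further down - this is an illustration, not the collapsed code:

/* Sketch: move a node onto the pinned LRU so its shrinker rarely frees it. */
void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	if (!btree_node_pinned(b)) {
		set_btree_node_pinned(b);
		list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}

/* Sketch: clear the pin masks and put every pinned node back on the normal LRU. */
void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *n;

	mutex_lock(&bc->lock);
	bc->pinned_nodes_mask[0] = 0;
	bc->pinned_nodes_mask[1] = 0;

	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
		clear_btree_node_pinned(b);
		list_move(&b->list, &bc->live[0].list);
		bc->live[0].nr++;
		bc->live[1].nr--;
	}
	mutex_unlock(&bc->lock);
}
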
@@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
 int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
 				unsigned, enum btree_id);
 
+void bch2_node_pin(struct bch_fs *, struct btree *);
+void bch2_btree_cache_unpin(struct bch_fs *);
+
 void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
 				      struct bkey_s_c, struct bkey_i *);
...
@@ -147,8 +147,7 @@ struct btree {
 	x(noevict) \
 	x(write_blocked) \
 	x(will_make_reachable) \
-	x(access_bit) \
-	x(pinned) \
+	x(access_bit)
 
 enum bch_btree_cache_not_freed_reasons {
 #define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
@@ -157,6 +156,13 @@ enum bch_btree_cache_not_freed_reasons {
 	BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
 };
 
+struct btree_cache_list {
+	unsigned		idx;
+	struct shrinker		*shrink;
+	struct list_head	list;
+	size_t			nr;
+};
+
 struct btree_cache {
 	struct rhashtable	table;
 	bool			table_init_done;
@@ -174,12 +180,11 @@ struct btree_cache {
 	 * should never grow past ~2-3 nodes in practice.
 	 */
 	struct mutex		lock;
-	struct list_head	live;
 	struct list_head	freeable;
 	struct list_head	freed_pcpu;
 	struct list_head	freed_nonpcpu;
+	struct btree_cache_list	live[2];
 
-	size_t			nr_live;
 	size_t			nr_freeable;
 	size_t			nr_reserve;
 	size_t			nr_by_btree[BTREE_ID_NR];
@@ -188,7 +193,6 @@ struct btree_cache {
 	/* shrinker stats */
 	size_t			nr_freed;
 	u64			not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
-	struct shrinker		*shrink;
 
 	/*
 	 * If we need to allocate memory for a new btree node and that
@@ -201,8 +205,8 @@ struct btree_cache {
 	struct bbpos		pinned_nodes_start;
 	struct bbpos		pinned_nodes_end;
 
-	u64			pinned_nodes_leaf_mask;
-	u64			pinned_nodes_interior_mask;
+	/* btree id mask: 0 for leaves, 1 for interior */
+	u64			pinned_nodes_mask[2];
 };
 
 struct btree_node_iter {
@@ -594,7 +598,8 @@ enum btree_write_type {
 	x(dying) \
 	x(fake) \
 	x(need_rewrite) \
-	x(never_write)
+	x(never_write) \
+	x(pinned)
 
 enum btree_flags {
 	/* First bits for btree node write type */
...
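
Two notes on the btree_types.h changes above. First, the new x(pinned) btree
flag presumably generates btree_node_pinned()/set_btree_node_pinned()/
clear_btree_node_pinned() through the existing x() flag-helper macros, which
is what the list_add_tail() change in the next hunk relies on. Second,
pinned_nodes_mask[2] is indexed by leaf vs interior per the new comment; a
sketch of how the cache might test whether a node falls in the range being
pinned (helper name assumed, logic inferred from the fields):

/* Sketch: does this node fall inside the range currently being pinned? */
static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	/* mask[0] covers leaf nodes, mask[1] interior nodes */
	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return (mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) <= 0 &&
		bbpos_cmp(pos, bc->pinned_nodes_end) <= 0;
}
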
@@ -1904,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
 	six_unlock_intent(&n->c.lock);
 
 	mutex_lock(&c->btree_cache.lock);
-	list_add_tail(&b->list, &c->btree_cache.live);
+	list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
 	mutex_unlock(&c->btree_cache.lock);
 
 	bch2_trans_verify_locks(trans);
...
@@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
 static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct btree_cache *bc = &c->btree_cache;
 	bool kthread = (current->flags & PF_KTHREAD) != 0;
 	u64 seq_to_flush;
 	size_t min_nr, min_key_cache, nr_flushed;
@@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
 		if (j->watermark != BCH_WATERMARK_stripe)
 			min_nr = 1;
 
-		if (atomic_long_read(&c->btree_cache.nr_dirty) * 2 > c->btree_cache.nr_live)
+		size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
+		if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
 			min_nr = 1;
 
 		min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
 		trace_and_count(c, journal_reclaim_start, c,
 				direct, kicked,
 				min_nr, min_key_cache,
-				atomic_long_read(&c->btree_cache.nr_dirty),
-				c->btree_cache.nr_live,
+				atomic_long_read(&bc->nr_dirty), btree_cache_live,
 				atomic_long_read(&c->btree_key_cache.nr_dirty),
 				atomic_long_read(&c->btree_key_cache.nr_keys));
 
...
@@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = {
 static size_t bch2_btree_cache_size(struct bch_fs *c)
 {
+	struct btree_cache *bc = &c->btree_cache;
 	size_t ret = 0;
 	struct btree *b;
 
-	mutex_lock(&c->btree_cache.lock);
-	list_for_each_entry(b, &c->btree_cache.live, list)
+	mutex_lock(&bc->lock);
+	list_for_each_entry(b, &bc->live[0].list, list)
+		ret += btree_buf_bytes(b);
+	list_for_each_entry(b, &bc->live[1].list, list)
 		ret += btree_buf_bytes(b);
-
-	mutex_unlock(&c->btree_cache.lock);
+	list_for_each_entry(b, &bc->freeable, list)
+		ret += btree_buf_bytes(b);
+	mutex_unlock(&bc->lock);
 	return ret;
 }
@@ -444,11 +448,12 @@ STORE(bch2_fs)
 		return -EROFS;
 
 	if (attr == &sysfs_trigger_btree_cache_shrink) {
+		struct btree_cache *bc = &c->btree_cache;
 		struct shrink_control sc;
 
 		sc.gfp_mask = GFP_KERNEL;
 		sc.nr_to_scan = strtoul_or_return(buf);
-		c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+		bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
 	}
 
 	if (attr == &sysfs_trigger_btree_key_cache_shrink) {
...
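
With the shrinker moved from struct btree_cache into each struct
btree_cache_list, the sysfs trigger above now only scans the unpinned list
(live[0]). A count callback consistent with that split might look like the
following sketch; the real callback is in the collapsed diff, and the
->private_data wiring matches the registration sketch near the top:

/* Sketch: each list's shrinker reports only its own node count. */
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	return list->nr;
}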