Commit a0b73c1c authored by Kent Overstreet's avatar Kent Overstreet Committed by Kent Overstreet

bcachefs: Add (partial) support for fixing btree topology

When we walk the btrees during recovery, part of that is checking that
btree topology is correct: for every interior btree node, its child
nodes should exactly span the range the parent node covers.

Previously, we had checks for this, but not repair code. Now that we
have the ability to do btree updates during initial GC, this patch adds
that repair code.
Signed-off-by: default avatarKent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent 5b593ee1
......@@ -509,7 +509,8 @@ enum {
BCH_FS_ERRORS_FIXED,
/* misc: */
BCH_FS_FIXED_GENS,
BCH_FS_NEED_ANOTHER_GC,
BCH_FS_DELETED_NODES,
BCH_FS_NEED_ALLOC_WRITE,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
......
......@@ -7,6 +7,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "error.h"
#include "trace.h"
#include <linux/prefetch.h>
......@@ -813,9 +814,12 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
return ERR_PTR(-EIO);
}
EBUG_ON(b->c.btree_id != iter->btree_id ||
BTREE_NODE_LEVEL(b->data) != level ||
bkey_cmp(b->data->max_key, k->k.p));
EBUG_ON(b->c.btree_id != iter->btree_id);
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
bkey_cmp(b->data->min_key,
bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
return b;
}
......@@ -823,7 +827,8 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
const struct bkey_i *k,
enum btree_id btree_id,
unsigned level)
unsigned level,
bool nofill)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
......@@ -838,6 +843,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
if (nofill)
return NULL;
b = bch2_btree_node_fill(c, NULL, k, btree_id,
level, SIX_LOCK_read, true);
......@@ -884,9 +892,12 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
return ERR_PTR(-EIO);
}
EBUG_ON(b->c.btree_id != btree_id ||
BTREE_NODE_LEVEL(b->data) != level ||
bkey_cmp(b->data->max_key, k->k.p));
EBUG_ON(b->c.btree_id != btree_id);
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
bkey_cmp(b->data->min_key,
bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
return b;
}
......@@ -996,8 +1007,22 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
if (sib != btree_prev_sib)
swap(n1, n2);
BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p),
n2->data->min_key));
if (bkey_cmp(bkey_successor(n1->key.k.p),
n2->data->min_key)) {
char buf1[200], buf2[200];
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));
bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
"prev: %s\n"
"next: %s\n",
bch2_btree_ids[iter->btree_id], level,
buf1, buf2);
six_unlock_intent(&ret->c.lock);
ret = NULL;
}
}
bch2_btree_trans_verify_locks(trans);
......
......@@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
enum six_lock_type, unsigned long);
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
enum btree_id, unsigned);
enum btree_id, unsigned, bool);
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
struct btree *, enum btree_node_sibling);
......
......@@ -50,6 +50,10 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
__gc_pos_set(c, new_pos);
}
/*
* Missing: if an interior btree node is empty, we need to do something -
* perhaps just kill it
*/
static int bch2_gc_check_topology(struct bch_fs *c,
struct btree *b,
struct bkey_buf *prev,
......@@ -62,6 +66,8 @@ static int bch2_gc_check_topology(struct bch_fs *c,
? node_start
: bkey_successor(prev->k->k.p);
char buf1[200], buf2[200];
bool update_min = false;
bool update_max = false;
int ret = 0;
if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
......@@ -75,22 +81,79 @@ static int bch2_gc_check_topology(struct bch_fs *c,
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c,
"btree node with incorrect min_key:\n prev %s\n cur %s",
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" cur %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
buf1,
(bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) {
BUG();
}
(bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)))
update_min = true;
}
if (fsck_err_on(is_last &&
bkey_cmp(cur.k->k.p, node_end), c,
"btree node with incorrect max_key:\n %s\n expected %s",
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
(bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
(bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) {
BUG();
}
(bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)))
update_max = true;
bch2_bkey_buf_copy(prev, c, cur.k);
if (update_min || update_max) {
struct bkey_i *new;
struct bkey_i_btree_ptr_v2 *bp = NULL;
struct btree *n;
if (update_max) {
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur.k->k.p);
if (ret)
return ret;
}
new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL);
if (!new)
return -ENOMEM;
bkey_copy(new, cur.k);
if (new->k.type == KEY_TYPE_btree_ptr_v2)
bp = bkey_i_to_btree_ptr_v2(new);
if (update_min)
bp->v.min_key = expected_start;
if (update_max)
new->k.p = node_end;
if (bp)
SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true);
ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);
if (ret) {
kfree(new);
return ret;
}
n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
b->c.level - 1, true);
if (n) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, n);
bkey_copy(&n->key, new);
if (update_min)
n->data->min_key = expected_start;
if (update_max)
n->data->max_key = node_end;
ret = __bch2_btree_node_hash_insert(&c->btree_cache, n);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
six_unlock_read(&n->c.lock);
}
}
fsck_err:
return ret;
}
......@@ -147,12 +210,13 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
ptr->dev, PTR_BUCKET_NR(ca, ptr),
bch2_data_types[ptr_data_type(k.k, ptr)],
ptr->gen, g->mark.gen)) {
/* XXX if it's a cached ptr, drop it */
g2->_mark.gen = g->_mark.gen = ptr->gen;
g2->gen_valid = g->gen_valid = true;
g2->_mark.data_type = 0;
g2->_mark.dirty_sectors = 0;
g2->_mark.cached_sectors = 0;
set_bit(BCH_FS_FIXED_GENS, &c->flags);
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
}
}
......@@ -298,8 +362,6 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
break;
if (b->c.level) {
struct btree *child;
bch2_bkey_buf_reassemble(&cur, c, k);
k = bkey_i_to_s_c(cur.k);
......@@ -310,26 +372,49 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
!bch2_btree_and_journal_iter_peek(&iter).k);
if (ret)
break;
} else {
bch2_btree_and_journal_iter_advance(&iter);
}
}
if (b->c.level > target_depth) {
child = bch2_btree_node_get_noiter(c, cur.k,
b->c.btree_id, b->c.level - 1);
ret = PTR_ERR_OR_ZERO(child);
if (ret)
break;
if (b->c.level > target_depth) {
bch2_btree_and_journal_iter_exit(&iter);
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
ret = bch2_gc_btree_init_recurse(c, child,
target_depth);
six_unlock_read(&child->c.lock);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
struct btree *child;
bch2_bkey_buf_reassemble(&cur, c, k);
bch2_btree_and_journal_iter_advance(&iter);
child = bch2_btree_node_get_noiter(c, cur.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(child);
if (fsck_err_on(ret == -EIO, c,
"unreadable btree node")) {
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur.k->k.p);
if (ret)
break;
return ret;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
continue;
}
} else {
bch2_btree_and_journal_iter_advance(&iter);
if (ret)
break;
ret = bch2_gc_btree_init_recurse(c, child,
target_depth);
six_unlock_read(&child->c.lock);
if (ret)
break;
}
}
fsck_err:
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
bch2_btree_and_journal_iter_exit(&iter);
......@@ -816,16 +901,15 @@ int bch2_gc(struct bch_fs *c, bool initial)
bch2_mark_allocator_buckets(c);
c->gc_count++;
out:
if (!ret &&
(test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
(!iter && bch2_test_restart_gc))) {
if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
(!iter && bch2_test_restart_gc)) {
/*
* XXX: make sure gens we fixed got saved
*/
if (iter++ <= 2) {
bch_info(c, "Fixed gens, restarting mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
bch_info(c, "Second GC pass needed, restarting:");
clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
percpu_down_write(&c->mark_lock);
......@@ -840,7 +924,7 @@ int bch2_gc(struct bch_fs *c, bool initial)
bch_info(c, "Unable to fix bucket gens, looping");
ret = -EINVAL;
}
out:
if (!ret) {
bch2_journal_block(&c->journal);
......
......@@ -342,7 +342,8 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
bch2_btree_and_journal_iter_advance(&iter);
child = bch2_btree_node_get_noiter(c, tmp.k,
b->c.btree_id, b->c.level - 1);
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(child);
if (ret)
......@@ -766,7 +767,8 @@ static int bch2_journal_replay(struct bch_fs *c,
bch2_journal_flush_all_pins(j);
return bch2_journal_error(j);
err:
bch_err(c, "journal replay: error %d while replaying key", ret);
bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
ret, bch2_btree_ids[i->btree_id], i->level);
return ret;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment