Commit 43f5ea46 authored by Kent Overstreet

bcachefs: Topology repair now uses nodes found by scanning to fill holes

With the new btree node scan code, we can now recover from corrupt btree
roots - simply create a new fake root at depth 1, and then insert all
the leaves we found.
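
Concretely, that amounts to roughly the following (a condensed sketch of the path added to bch2_check_topology() in the diff below; reconstruct_btree_root() is a hypothetical wrapper name used only to frame the excerpt, and the fsck prompt plus the journal-key shootdown are omitted for brevity):

/*
 * Illustrative sketch only -- the real code is inline in bch2_check_topology().
 */
static int reconstruct_btree_root(struct bch_fs *c, enum btree_id btree)
{
    int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
    if (ret)
        return ret;

    if (!bch2_btree_has_scanned_nodes(c, btree)) {
        /* scan found nothing for this btree: fall back to an empty leaf root */
        bch2_btree_root_alloc_fake(c, btree, 0);
        return 0;
    }

    /* fake interior root at depth 1, then insert every leaf the scan found */
    bch2_btree_root_alloc_fake(c, btree, 1);
    return bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
}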

If the root wasn't corrupt but there's corruption elsewhere in the
btree, we can fill in holes as needed with the newest version of a
given node found by the scan; we also check whether a given btree node
is older than the version the scan found, and evict it if so.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 4409b808
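
For the hole-filling case, the decision added to btree_check_node_boundaries() (further down in the diff) boils down to the sketch below. fill_gap_from_scan() is a hypothetical name for what is actually inline code, and the fsck prompt and error reporting are simplified:

/*
 * Illustrative sketch only. Called by the caller when a gap was detected,
 * i.e. expected_start < cur->data->min_key.
 */
static int fill_gap_from_scan(struct bch_fs *c, struct btree *b, struct btree *cur,
                              struct bpos expected_start, struct bpos *pulled_from_scan)
{
    /*
     * At level 1 the missing children are leaves, which the scan pass can
     * supply; above that we can only repair by adjusting node boundaries.
     */
    if (b->c.level == 1 && bpos_lt(*pulled_from_scan, cur->data->min_key)) {
        int ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
                                         expected_start,
                                         bpos_predecessor(cur->data->min_key));
        if (ret)
            return ret;

        *pulled_from_scan = cur->data->min_key;
        return DID_FILL_FROM_SCAN;   /* caller restarts the pass over this node */
    }

    /* no scanned nodes to use: stretch the child back to cover the gap */
    return set_node_min(c, cur, expected_start);
}
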
@@ -13,6 +13,7 @@
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
@@ -41,6 +42,7 @@
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11
#define DID_FILL_FROM_SCAN 12
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
@@ -129,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
struct bkey_i_btree_ptr_v2 *new;
int ret;
if (c->opts.verbose) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, " -> ");
bch2_bpos_to_text(&buf, new_min);
bch_info(c, "%s(): %s", __func__, buf.buf);
printbuf_exit(&buf);
}
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
return -BCH_ERR_ENOMEM_gc_repair_key;
@@ -154,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
struct bkey_i_btree_ptr_v2 *new;
int ret;
if (c->opts.verbose) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, " -> ");
bch2_bpos_to_text(&buf, new_max);
bch_info(c, "%s(): %s", __func__, buf.buf);
printbuf_exit(&buf);
}
ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
if (ret)
return ret;
@@ -185,127 +209,138 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0;
}
static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
struct btree *prev, struct btree *cur)
static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
struct btree *prev, struct btree *cur,
struct bpos *pulled_from_scan)
{
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (!prev) {
prt_printf(&buf1, "start of node: ");
bch2_bpos_to_text(&buf1, b->data->min_key);
} else {
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
!bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
b->data->min_key));
if (bpos_eq(expected_start, cur->data->min_key))
return 0;
prt_printf(&buf, " at btree %s level %u:\n parent: ",
bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
if (prev) {
prt_printf(&buf, "\n prev: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
}
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
if (prev &&
bpos_gt(expected_start, cur->data->min_key) &&
BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
/* cur overwrites prev: */
if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
cur->data->min_key), c,
btree_node_topology_overwritten_by_next_node,
"btree node overwritten by next node at btree %s level %u:\n"
" node %s\n"
" next %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_PREV_NODE;
goto out;
}
prt_str(&buf, "\n next: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
bpos_predecessor(cur->data->min_key)), c,
btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" node %s\n"
" next %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
} else {
/* prev overwrites cur: */
if (mustfix_fsck_err_on(bpos_ge(expected_start,
cur->data->max_key), c,
btree_node_topology_overwritten_by_prev_node,
"btree node overwritten by prev node at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_THIS_NODE;
goto out;
}
if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
if (b->c.level == 1 &&
bpos_lt(*pulled_from_scan, cur->data->min_key)) {
ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
expected_start,
bpos_predecessor(cur->data->min_key));
if (ret)
goto err;
if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
btree_node_topology_bad_min_key,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf))
ret = set_node_min(c, cur, expected_start);
*pulled_from_scan = cur->data->min_key;
ret = DID_FILL_FROM_SCAN;
} else {
if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
} else { /* overlap */
if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
"btree node overwritten by next node%s", buf.buf))
ret = DROP_PREV_NODE;
} else {
if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
"btree node with incorrect max_key%s", buf.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
}
} else {
if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
"btree node overwritten by prev node%s", buf.buf))
ret = DROP_THIS_NODE;
} else {
if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
}
}
out:
err:
fsck_err:
printbuf_exit(&buf2);
printbuf_exit(&buf1);
printbuf_exit(&buf);
return ret;
}
static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
struct btree *child)
struct btree *child, struct bpos *pulled_from_scan)
{
struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
struct printbuf buf = PRINTBUF;
int ret = 0;
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
bch2_bpos_to_text(&buf2, b->key.k.p);
if (bpos_eq(child->key.k.p, b->key.k.p))
return 0;
if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = set_node_max(c, child, b->key.k.p);
if (ret)
goto err;
prt_printf(&buf, "at btree %s level %u:\n parent: ",
bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, "\n child: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
"btree node with incorrect max_key%s", buf.buf)) {
if (b->c.level == 1 &&
bpos_lt(*pulled_from_scan, b->key.k.p)) {
ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
bpos_successor(child->key.k.p), b->key.k.p);
if (ret)
goto err;
*pulled_from_scan = b->key.k.p;
ret = DID_FILL_FROM_SCAN;
} else {
ret = set_node_max(c, child, b->key.k.p);
}
}
err:
fsck_err:
printbuf_exit(&buf2);
printbuf_exit(&buf1);
printbuf_exit(&buf);
return ret;
}
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
struct bpos *pulled_from_scan)
{
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
bool have_child, dropped_children = false;
bool have_child, new_pass = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (!b->c.level)
return 0;
again:
prev = NULL;
have_child = dropped_children = false;
bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k);
again:
cur = prev = NULL;
have_child = new_pass = false;
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
@@ -332,9 +367,10 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
cur = NULL;
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?:
bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
if (ret)
break;
continue;
@@ -344,7 +380,23 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
if (ret)
break;
ret = btree_repair_node_boundaries(c, b, prev, cur);
if (bch2_btree_node_is_stale(c, cur)) {
bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
six_unlock_read(&cur->c.lock);
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
cur = NULL;
if (ret)
break;
continue;
}
ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
}
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
@@ -370,8 +422,6 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
break;
bch2_btree_and_journal_iter_exit(&iter);
bch2_bkey_buf_exit(&prev_k, c);
bch2_bkey_buf_exit(&cur_k, c);
goto again;
} else if (ret)
break;
@@ -383,7 +433,11 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
ret = btree_repair_node_end(c, b, prev);
ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
}
}
if (!IS_ERR_OR_NULL(prev))
@@ -397,6 +451,10 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
goto err;
bch2_btree_and_journal_iter_exit(&iter);
if (new_pass)
goto again;
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
@@ -413,7 +471,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
if (ret)
goto err;
ret = bch2_btree_repair_topology_recurse(trans, cur);
ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
six_unlock_read(&cur->c.lock);
cur = NULL;
@@ -421,7 +479,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
dropped_children = true;
new_pass = true;
}
if (ret)
@@ -448,12 +506,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
six_unlock_read(&cur->c.lock);
bch2_btree_and_journal_iter_exit(&iter);
bch2_bkey_buf_exit(&prev_k, c);
bch2_bkey_buf_exit(&cur_k, c);
if (!ret && dropped_children)
if (!ret && new_pass)
goto again;
BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
bch2_bkey_buf_exit(&prev_k, c);
bch2_bkey_buf_exit(&cur_k, c);
printbuf_exit(&buf);
return ret;
}
@@ -461,32 +521,63 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
int bch2_check_topology(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree *b;
unsigned i;
struct bpos pulled_from_scan = POS_MIN;
int ret = 0;
for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
bool reconstructed_root = false;
if (!r->alive)
continue;
if (r->error) {
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
if (ret)
break;
reconstruct_root:
bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
b = r->b;
if (btree_node_fake(b))
continue;
r->alive = false;
r->error = 0;
if (!bch2_btree_has_scanned_nodes(c, i)) {
mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
"no nodes found for btree %s, continue?", bch2_btree_id_str(i));
bch2_btree_root_alloc_fake(c, i, 0);
} else {
bch2_btree_root_alloc_fake(c, i, 1);
ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
if (ret)
break;
}
bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
reconstructed_root = true;
}
struct btree *b = r->b;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
ret = bch2_btree_repair_topology_recurse(trans, b);
ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
bch_err(c, "empty btree root - repair unimplemented");
ret = -BCH_ERR_fsck_repair_unimplemented;
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
r->b = NULL;
if (!reconstructed_root)
goto reconstruct_root;
bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
bch2_btree_root_alloc_fake(c, i, 0);
r->alive = false;
ret = 0;
}
}
fsck_err:
bch2_trans_put(trans);
return ret;
}
@@ -267,7 +267,8 @@
x(subvol_unreachable, 259) \
x(btree_node_bkey_bad_u64s, 260) \
x(btree_node_topology_empty_interior_node, 261) \
x(btree_ptr_v2_min_key_bad, 262)
x(btree_ptr_v2_min_key_bad, 262) \
x(btree_root_unreadable_and_scan_found_nothing, 263)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,