Commit e3e464ac authored by Kent Overstreet's avatar Kent Overstreet Committed by Kent Overstreet

bcachefs: Move extent overwrite handling out of core btree code

Ever since the btree code was first written, handling of overwriting
existing extents - including partially overwriting and splittin existing
extents - was handled as part of the core btree insert path. The modern
transaction and iterator infrastructure didn't exist then, so that was
the only way for it to be done.

This patch moves that outside of the core btree code to a pass that runs
at transaction commit time.

This is a significant simplification to the btree code and overall
reduction in code size, but more importantly it gets us much closer to
the core btree code being completely independent of extents and is
important prep work for snapshots.

This introduces a new feature bit; the old and new extent update models
are incompatible when the filesystem needs journal replay.
Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent 57b0b3db
......@@ -1315,12 +1315,14 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(inline_data, 8) \
x(new_extent_overwrite, 9) \
x(incompressible, 10) \
x(btree_ptr_v2, 11)
x(btree_ptr_v2, 11) \
x(extents_above_btree_updates, 12)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_btree_ptr_v2))
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
......
......@@ -186,8 +186,16 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
return ret;
}
static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
u8 *max_stale, bool initial)
static bool pos_in_journal_keys(struct journal_keys *journal_keys,
enum btree_id id, struct bpos pos)
{
struct journal_key *k = journal_key_search(journal_keys, id, pos);
return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
}
static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
struct journal_keys *journal_keys, bool initial)
{
struct btree_node_iter iter;
struct bkey unpacked;
......@@ -201,6 +209,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
for_each_btree_node_key_unpack(b, k, &iter,
&unpacked) {
if (!b->c.level && journal_keys &&
pos_in_journal_keys(journal_keys, b->c.btree_id, k.k->p))
continue;
bch2_bkey_debugcheck(c, b, k);
ret = bch2_gc_mark_key(c, k, max_stale, initial);
......@@ -212,6 +224,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
struct btree_trans trans;
......@@ -239,7 +252,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
gc_pos_set(c, gc_pos_btree_node(b));
ret = btree_gc_mark_node(c, b, &max_stale, initial);
ret = btree_gc_mark_node(c, b, &max_stale,
journal_keys, initial);
if (ret)
break;
......@@ -281,36 +295,6 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
(int) btree_id_to_gc_phase(r);
}
static int mark_journal_key(struct bch_fs *c, enum btree_id id,
struct bkey_i *insert)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u8 max_stale;
int ret = 0;
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
if (ret)
return ret;
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
BTREE_ITER_SLOTS, k, ret) {
percpu_down_read(&c->mark_lock);
ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
BTREE_TRIGGER_GC|
BTREE_TRIGGER_NOATOMIC);
percpu_up_read(&c->mark_lock);
if (!ret)
break;
}
return bch2_trans_exit(&trans) ?: ret;
}
static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
......@@ -325,18 +309,21 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
enum btree_id id = ids[i];
enum btree_node_type type = __btree_node_type(0, id);
int ret = bch2_gc_btree(c, id, initial, metadata_only);
int ret = bch2_gc_btree(c, id, journal_keys,
initial, metadata_only);
if (ret)
return ret;
if (journal_keys && !metadata_only &&
btree_node_type_needs_gc(type)) {
struct journal_key *j;
u8 max_stale;
int ret;
for_each_journal_key(*journal_keys, j)
if (j->btree_id == id) {
ret = mark_journal_key(c, id, j->k);
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
&max_stale, initial);
if (ret)
return ret;
}
......
......@@ -708,9 +708,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
unsigned *whiteout_u64s, int write,
bool have_retry)
{
struct bkey_packed *k;
struct bkey prev = KEY(0, 0, 0);
struct bpos prev_data = POS_MIN;
struct bkey_packed *k, *prev = NULL;
bool seen_non_whiteout = false;
unsigned version;
const char *err;
......@@ -852,15 +850,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!seen_non_whiteout &&
(!bkey_whiteout(k) ||
(bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) {
(prev && bkey_iter_cmp(b, prev, k) > 0))) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
} else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 ||
bkey_cmp(prev.p, u.k->p) > 0) {
} else if (prev && bkey_iter_cmp(b, prev, k) > 0) {
char buf1[80];
char buf2[80];
struct bkey up = bkey_unpack_key(b, prev);
bch2_bkey_to_text(&PBUF(buf1), &prev);
bch2_bkey_to_text(&PBUF(buf1), &up);
bch2_bkey_to_text(&PBUF(buf2), u.k);
bch2_dump_bset(b, i, 0);
......@@ -870,10 +868,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
/* XXX: repair this */
}
if (!bkey_deleted(u.k))
prev_data = u.k->p;
prev = *u.k;
prev = k;
k = bkey_next_skip_noops(k, vstruct_last(i));
}
......
......@@ -1504,12 +1504,12 @@ static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
struct btree_trans *trans = iter->trans;
struct btree_insert_entry *i;
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
if ((cmp_int(iter->btree_id, i->iter->btree_id) ?:
bkey_cmp(pos, i->k->k.p)) <= 0)
break;
return i < trans->updates + trans->nr_updates &&
return i < trans->updates2 + trans->nr_updates2 &&
iter->btree_id == i->iter->btree_id
? bkey_i_to_s_c(i->k)
: bkey_s_c_null;
......@@ -1821,7 +1821,7 @@ int bch2_trans_iter_free(struct btree_trans *trans,
static int bch2_trans_realloc_iters(struct btree_trans *trans,
unsigned new_size)
{
void *new_iters, *new_updates;
void *p, *new_iters, *new_updates, *new_updates2;
size_t iters_bytes;
size_t updates_bytes;
......@@ -1839,21 +1839,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
iters_bytes = sizeof(struct btree_iter) * new_size;
updates_bytes = sizeof(struct btree_insert_entry) * new_size;
new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS);
if (new_iters)
p = kmalloc(iters_bytes +
updates_bytes +
updates_bytes, GFP_NOFS);
if (p)
goto success;
new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
new_size = BTREE_ITER_MAX;
trans->used_mempool = true;
success:
new_updates = new_iters + iters_bytes;
new_iters = p; p += iters_bytes;
new_updates = p; p += updates_bytes;
new_updates2 = p; p += updates_bytes;
memcpy(new_iters, trans->iters,
sizeof(struct btree_iter) * trans->nr_iters);
memcpy(new_updates, trans->updates,
sizeof(struct btree_insert_entry) * trans->nr_updates);
memcpy(new_updates2, trans->updates2,
sizeof(struct btree_insert_entry) * trans->nr_updates2);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
memset(trans->iters, POISON_FREE,
......@@ -1865,6 +1871,7 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
trans->iters = new_iters;
trans->updates = new_updates;
trans->updates2 = new_updates2;
trans->size = new_size;
if (trans->iters_live) {
......@@ -2126,6 +2133,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
trans->need_reset = 0;
trans->nr_updates = 0;
trans->nr_updates2 = 0;
trans->mem_top = 0;
if (trans->fs_usage_deltas) {
......@@ -2157,6 +2165,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
trans->size = ARRAY_SIZE(trans->iters_onstack);
trans->iters = trans->iters_onstack;
trans->updates = trans->updates_onstack;
trans->updates2 = trans->updates2_onstack;
trans->fs_usage_deltas = NULL;
if (expected_nr_iters > trans->size)
......@@ -2194,5 +2203,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
sizeof(struct btree_iter) * nr +
sizeof(struct btree_insert_entry) * nr +
sizeof(u8) * nr);
sizeof(struct btree_insert_entry) * nr);
}
......@@ -283,6 +283,7 @@ struct btree_trans {
u8 nr_iters;
u8 nr_updates;
u8 nr_updates2;
u8 size;
unsigned used_mempool:1;
unsigned error:1;
......@@ -295,6 +296,7 @@ struct btree_trans {
struct btree_iter *iters;
struct btree_insert_entry *updates;
struct btree_insert_entry *updates2;
/* update path: */
struct journal_res journal_res;
......@@ -308,6 +310,7 @@ struct btree_trans {
struct btree_iter iters_onstack[2];
struct btree_insert_entry updates_onstack[2];
struct btree_insert_entry updates2_onstack[2];
};
#define BTREE_FLAG(flag) \
......
......@@ -132,4 +132,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
#define trans_for_each_update2(_trans, _i) \
for ((_i) = (_trans)->updates2; \
(_i) < (_trans)->updates2 + (_trans)->nr_updates2; \
(_i)++)
#endif /* _BCACHEFS_BTREE_UPDATE_H */
......@@ -303,18 +303,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
}
static inline void push_whiteout(struct bch_fs *c, struct btree *b,
struct bkey_packed *k)
struct bpos pos)
{
unsigned u64s = bkeyp_key_u64s(&b->format, k);
struct bkey_packed *dst;
struct bkey_packed k;
BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b));
BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
b->whiteout_u64s += bkeyp_key_u64s(&b->format, k);
dst = unwritten_whiteouts_start(c, b);
memcpy_u64s(dst, k, u64s);
dst->u64s = u64s;
dst->type = KEY_TYPE_deleted;
if (!bkey_pack_pos(&k, pos, b)) {
struct bkey *u = (void *) &k;
bkey_init(u);
u->p = pos;
}
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
bkey_copy(unwritten_whiteouts_start(c, b), &k);
}
/*
......
......@@ -23,11 +23,10 @@
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i != trans->updates &&
return i != trans->updates2 &&
i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
{
......@@ -61,6 +60,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(iter->trans->c, b));
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_packed(b, k, &insert->k))
......@@ -79,7 +81,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
push_whiteout(iter->trans->c, b, k);
push_whiteout(iter->trans->c, b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
......@@ -195,20 +197,6 @@ void bch2_btree_journal_key(struct btree_trans *trans,
set_btree_node_dirty(b);
}
static void bch2_insert_fixup_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
EBUG_ON(iter->level);
EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, l->b));
if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert)))
bch2_btree_journal_key(trans, iter, insert);
}
/**
* btree_insert_key - insert a key one key into a leaf node
*/
......@@ -223,12 +211,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
EBUG_ON(iter->level);
insert->k.needs_whiteout = false;
if (!btree_node_is_extents(b))
bch2_insert_fixup_key(trans, iter, insert);
else
bch2_insert_fixup_extent(trans, iter, insert);
if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
bch2_btree_journal_key(trans, iter, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
......@@ -254,12 +242,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct bch_fs *c = trans->c;
BUG_ON(iter->level);
BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos));
EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0);
BUG_ON(bkey_cmp(insert->k.p, iter->pos));
BUG_ON(debug_check_bkeys(c) &&
!bkey_deleted(&insert->k) &&
bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
}
......@@ -312,9 +296,16 @@ btree_key_can_insert(struct btree_trans *trans,
if (unlikely(btree_node_fake(b)))
return BTREE_INSERT_BTREE_NODE_FULL;
/*
* old bch2_extent_sort_fix_overlapping() algorithm won't work with new
* style extent updates:
*/
if (unlikely(btree_node_old_extent_overwrite(b)))
return BTREE_INSERT_BTREE_NODE_FULL;
ret = !btree_node_is_extents(b)
? BTREE_INSERT_OK
: bch2_extent_can_insert(trans, iter, insert, u64s);
: bch2_extent_can_insert(trans, iter, insert);
if (ret)
return ret;
......@@ -383,7 +374,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
prefetch(&trans->c->journal.flags);
trans_for_each_update(trans, i) {
trans_for_each_update2(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
......@@ -422,10 +413,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
else if (inject_invalid_keys(c))
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
i->k->k.version = MAX_VERSION;
}
......@@ -448,7 +439,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (unlikely(c->gc_pos.phase))
bch2_trans_mark_gc(trans);
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
do_btree_insert_one(trans, i->iter, i->k);
err:
if (marking) {
......@@ -469,7 +460,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
struct btree_iter *iter;
int ret;
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
BUG_ON(!btree_node_intent_locked(i->iter, 0));
ret = bch2_journal_preres_get(&trans->c->journal,
......@@ -497,18 +488,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
}
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
btree_insert_entry_checks(trans, i->iter, i->k);
bch2_btree_trans_verify_locks(trans);
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(trans->c,
i->iter->l[0].b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
i->iter);
......@@ -525,14 +516,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (trans->flags & BTREE_INSERT_NOUNLOCK)
trans->nounlock = true;
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_foreground_maybe_merge(trans->c, i->iter,
0, trans->flags);
trans->nounlock = false;
trans_for_each_update(trans, i)
trans_for_each_update2(trans, i)
bch2_btree_iter_downgrade(i->iter);
return 0;
......@@ -655,6 +646,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
return 0;
}
static void bch2_trans_update2(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_insert_entry *i, n = (struct btree_insert_entry) {
.iter = iter, .k = insert
};
btree_insert_entry_checks(trans, n.iter, n.k);
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
EBUG_ON(trans->nr_updates2 >= trans->nr_iters);
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
trans_for_each_update2(trans, i) {
if (btree_iter_cmp(n.iter, i->iter) == 0) {
*i = n;
return;
}
if (btree_iter_cmp(n.iter, i->iter) <= 0)
break;
}
array_insert_item(trans->updates2, trans->nr_updates2,
i - trans->updates2, n);
}
static int extent_update_to_keys(struct btree_trans *trans,
struct btree_iter *orig_iter,
struct bkey_i *insert)
{
struct btree_iter *iter;
if (bkey_deleted(&insert->k))
return 0;
iter = bch2_trans_copy_iter(trans, orig_iter);
if (IS_ERR(iter))
return PTR_ERR(iter);
iter->flags |= BTREE_ITER_INTENT;
__bch2_btree_iter_set_pos(iter, insert->k.p, false);
bch2_trans_update2(trans, iter, insert);
bch2_trans_iter_put(trans, iter);
return 0;
}
static int extent_handle_overwrites(struct btree_trans *trans,
enum btree_id btree_id,
struct bpos start, struct bpos end)
{
struct btree_iter *iter = NULL, *update_iter;
struct bkey_i *update;
struct bkey_s_c k;
int ret = 0;
iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT);
ret = PTR_ERR_OR_ZERO(iter);
if (ret)
return ret;
k = bch2_btree_iter_peek_with_updates(iter);
while (k.k && !(ret = bkey_err(k))) {
if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0)
break;
if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
update_iter = bch2_trans_copy_iter(trans, iter);
if ((ret = PTR_ERR_OR_ZERO(update_iter)))
goto err;
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
bkey_reassemble(update, k);
bch2_cut_back(start, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
}
if (bkey_cmp(k.k->p, end) > 0) {
update_iter = bch2_trans_copy_iter(trans, iter);
if ((ret = PTR_ERR_OR_ZERO(update_iter)))
goto err;
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
bkey_reassemble(update, k);
bch2_cut_front(end, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
} else {
update_iter = bch2_trans_copy_iter(trans, iter);
if ((ret = PTR_ERR_OR_ZERO(update_iter)))
goto err;
update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
update->k = *k.k;
set_bkey_val_u64s(&update->k, 0);
update->k.type = KEY_TYPE_deleted;
update->k.size = 0;
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
}
k = bch2_btree_iter_next_with_updates(iter);
}
err:
if (!IS_ERR_OR_NULL(iter))
bch2_trans_iter_put(trans, iter);
return ret;
}
int __bch2_trans_commit(struct btree_trans *trans)
{
struct btree_insert_entry *i = NULL;
......@@ -724,7 +844,36 @@ int __bch2_trans_commit(struct btree_trans *trans)
}
} while (trans_trigger_run);
/* Turn extents updates into keys: */
trans_for_each_update(trans, i)
if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
struct bpos start = bkey_start_pos(&i->k->k);
while (i + 1 < trans->updates + trans->nr_updates &&
i[0].iter->btree_id == i[1].iter->btree_id &&
!bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)))
i++;
ret = extent_handle_overwrites(trans, i->iter->btree_id,
start, i->k->k.p);
if (ret)
goto out;
}
trans_for_each_update(trans, i) {
if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
ret = extent_update_to_keys(trans, i->iter, i->k);
if (ret)
goto out;
} else {
bch2_trans_update2(trans, i->iter, i->k);
}
}
trans_for_each_update2(trans, i) {
BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
BUG_ON(i->iter->locks_want < 1);
u64s = jset_u64s(i->k->k.u64s);
if (0)
trans->journal_preres_u64s += u64s;
......@@ -773,7 +922,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
.trigger_flags = flags, .iter = iter, .k = k
};
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k)));
EBUG_ON(bkey_cmp(iter->pos,
(iter->flags & BTREE_ITER_IS_EXTENTS)
? bkey_start_pos(&k->k)
: k->k.p));
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
......
......@@ -1254,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_i *new,
struct bch_fs_usage *fs_usage,
unsigned flags)
unsigned flags,
bool is_extents)
{
struct bch_fs *c = trans->c;
struct btree *b = iter->l[0].b;
unsigned offset = 0;
s64 sectors = 0;
s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
if (btree_node_is_extents(b)
if (is_extents
? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
: bkey_cmp(new->k.p, old.k->p))
return 0;
if (btree_node_is_extents(b)) {
if (is_extents) {
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
offset = 0;
......@@ -1341,7 +1341,8 @@ int bch2_mark_update(struct btree_trans *trans,
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
ret = bch2_mark_overwrite(trans, iter, k, insert,
fs_usage, flags);
fs_usage, flags,
btree_node_type_is_extents(iter->btree_id));
if (ret <= 0)
break;
......
......@@ -268,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *,
int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
struct bkey_s_c, struct bkey_i *,
struct bch_fs_usage *, unsigned);
struct bch_fs_usage *, unsigned, bool);
int bch2_mark_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct bch_fs_usage *, unsigned);
......
......@@ -39,6 +39,12 @@ static int count_iters_for_insert(struct btree_trans *trans,
{
int ret = 0;
/*
* The extent update path requires an _additional_ iterator for each
* extent we're inserting and overwriting:
*/
*nr_iters += 1;
switch (k.k->type) {
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
......@@ -167,402 +173,40 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert,
unsigned *u64s)
struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *_k;
struct bkey_s_c k;
struct bkey unpacked;
int sectors;
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
KEY_TYPE_discard))) {
struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked);
enum bch_extent_overlap overlap =
bch2_extent_overlap(&insert->k, k.k);
if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
break;
overlap = bch2_extent_overlap(&insert->k, k.k);
/*
* If we're overwriting an existing extent, we may need to emit
* a whiteout - unless we're inserting a new extent at the same
* position:
*/
if (k.k->needs_whiteout &&
(!bkey_whiteout(&insert->k) ||
bkey_cmp(k.k->p, insert->k.p)))
*u64s += BKEY_U64s;
/*
* If we're partially overwriting an existing extent which has
* been written out to disk, we'll need to emit a new version of
* that extent:
*/
if (bkey_written(l->b, _k) &&
overlap != BCH_EXTENT_OVERLAP_ALL)
*u64s += _k->u64s;
/* And we may be splitting an existing extent: */
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
*u64s += _k->u64s;
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
(sectors = bch2_bkey_sectors_compressed(k))) {
int flags = trans->flags & BTREE_INSERT_NOFAIL
? BCH_DISK_RESERVATION_NOFAIL : 0;
switch (bch2_disk_reservation_add(trans->c,
trans->disk_res,
sectors, flags)) {
case 0:
break;
case -ENOSPC:
return BTREE_INSERT_ENOSPC;
default:
BUG();
}
}
if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
overlap == BCH_EXTENT_OVERLAP_MIDDLE)
break;
bch2_btree_node_iter_advance(&node_iter, l->b);
}
return BTREE_INSERT_OK;
}
static void verify_extent_nonoverlapping(struct bch_fs *c,
struct btree *b,
struct btree_node_iter *_iter,
struct bkey_i *insert)
{
#ifdef CONFIG_BCACHEFS_DEBUG
struct btree_node_iter iter;
struct bkey_packed *k;
struct bkey uk;
if (!expensive_debug_checks(c))
return;
iter = *_iter;
k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
BUG_ON(k &&
(uk = bkey_unpack_key(b, k),
bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
iter = *_iter;
k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
#if 0
BUG_ON(k &&
(uk = bkey_unpack_key(b, k),
bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
#else
if (k &&
(uk = bkey_unpack_key(b, k),
bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
char buf1[100];
char buf2[100];
bch2_bkey_to_text(&PBUF(buf1), &insert->k);
bch2_bkey_to_text(&PBUF(buf2), &uk);
bch2_dump_btree_node(b);
panic("insert > next :\n"
"insert %s\n"
"next %s\n",
buf1, buf2);
}
#endif
#endif
}
static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
struct bkey_packed *k =
bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
if (debug_check_bkeys(c))
bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
bch2_bset_insert(l->b, &l->iter, k, insert, 0);
bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
}
static void pack_push_whiteout(struct bch_fs *c, struct btree *b,
struct bpos pos)
{
struct bkey_packed k;
if (!bkey_pack_pos(&k, pos, b)) {
struct bkey_i tmp;
bkey_init(&tmp.k);
tmp.k.p = pos;
bkey_copy(&k, &tmp);
}
k.needs_whiteout = true;
push_whiteout(c, b, &k);
}
static void
extent_drop(struct bch_fs *c, struct btree_iter *iter,
struct bkey_packed *_k, struct bkey_s k)
{
struct btree_iter_level *l = &iter->l[0];
if (!bkey_whiteout(k.k))
btree_account_key_drop(l->b, _k);
k.k->size = 0;
k.k->type = KEY_TYPE_deleted;
if (!btree_node_old_extent_overwrite(l->b) &&
k.k->needs_whiteout) {
pack_push_whiteout(c, l->b, k.k->p);
k.k->needs_whiteout = false;
}
if (_k >= btree_bset_last(l->b)->start) {
unsigned u64s = _k->u64s;
bch2_bset_delete(l->b, _k, _k->u64s);
bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0);
} else {
extent_save(l->b, _k, k.k);
bch2_btree_iter_fix_key_modified(iter, l->b, _k);
}
}
static void
extent_squash(struct bch_fs *c, struct btree_iter *iter,
struct bkey_i *insert,
struct bkey_packed *_k, struct bkey_s k,
enum bch_extent_overlap overlap)
{
struct btree_iter_level *l = &iter->l[0];
struct bkey_on_stack tmp, split;
bkey_on_stack_init(&tmp);
bkey_on_stack_init(&split);
if (!btree_node_old_extent_overwrite(l->b)) {
if (!bkey_whiteout(&insert->k) &&
!bkey_cmp(k.k->p, insert->k.p)) {
insert->k.needs_whiteout = k.k->needs_whiteout;
k.k->needs_whiteout = false;
}
} else {
insert->k.needs_whiteout |= k.k->needs_whiteout;
}
switch (overlap) {
case BCH_EXTENT_OVERLAP_FRONT:
if (bkey_written(l->b, _k)) {
bkey_on_stack_reassemble(&tmp, c, k.s_c);
bch2_cut_front(insert->k.p, tmp.k);
/*
* needs_whiteout was propagated to new version of @k,
* @tmp:
*/
if (!btree_node_old_extent_overwrite(l->b))
k.k->needs_whiteout = false;
extent_drop(c, iter, _k, k);
extent_bset_insert(c, iter, tmp.k);
} else {
btree_keys_account_val_delta(l->b, _k,
bch2_cut_front_s(insert->k.p, k));
extent_save(l->b, _k, k.k);
/*
* No need to call bset_fix_invalidated_key, start of
* extent changed but extents are indexed by where they
* end
*/
bch2_btree_iter_fix_key_modified(iter, l->b, _k);
}
break;
case BCH_EXTENT_OVERLAP_BACK:
if (bkey_written(l->b, _k)) {
bkey_on_stack_reassemble(&tmp, c, k.s_c);
bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);
/*
* @tmp has different position than @k, needs_whiteout
* should not be propagated:
*/
if (!btree_node_old_extent_overwrite(l->b))
tmp.k->k.needs_whiteout = false;
extent_drop(c, iter, _k, k);
extent_bset_insert(c, iter, tmp.k);
} else {
/*
* position of @k is changing, emit a whiteout if
* needs_whiteout is set:
*/
if (!btree_node_old_extent_overwrite(l->b) &&
k.k->needs_whiteout) {
pack_push_whiteout(c, l->b, k.k->p);
k.k->needs_whiteout = false;
}
btree_keys_account_val_delta(l->b, _k,
bch2_cut_back_s(bkey_start_pos(&insert->k), k));
extent_save(l->b, _k, k.k);
bch2_bset_fix_invalidated_key(l->b, _k);
bch2_btree_node_iter_fix(iter, l->b, &l->iter,
_k, _k->u64s, _k->u64s);
}
break;
case BCH_EXTENT_OVERLAP_ALL:
extent_drop(c, iter, _k, k);
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
bkey_on_stack_reassemble(&split, c, k.s_c);
bch2_cut_back(bkey_start_pos(&insert->k), split.k);
if (!btree_node_old_extent_overwrite(l->b))
split.k->k.needs_whiteout = false;
/* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
if (bkey_written(l->b, _k)) {
bkey_on_stack_reassemble(&tmp, c, k.s_c);
bch2_cut_front(insert->k.p, tmp.k);
if (!btree_node_old_extent_overwrite(l->b))
k.k->needs_whiteout = false;
extent_drop(c, iter, _k, k);
extent_bset_insert(c, iter, tmp.k);
} else {
btree_keys_account_val_delta(l->b, _k,
bch2_cut_front_s(insert->k.p, k));
extent_save(l->b, _k, k.k);
bch2_btree_iter_fix_key_modified(iter, l->b, _k);
}
extent_bset_insert(c, iter, split.k);
break;
}
bkey_on_stack_exit(&split, c);
bkey_on_stack_exit(&tmp, c);
}
_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
KEY_TYPE_discard);
if (!_k)
return BTREE_INSERT_OK;
/**
* bch_extent_insert_fixup - insert a new extent and deal with overlaps
*
* this may result in not actually doing the insert, or inserting some subset
* of the insert key. For cmpxchg operations this is where that logic lives.
*
* All subsets of @insert that need to be inserted are inserted using
* bch2_btree_insert_and_journal(). If @b or @res fills up, this function
* returns false, setting @iter->pos for the prefix of @insert that actually got
* inserted.
*
* BSET INVARIANTS: this function is responsible for maintaining all the
* invariants for bsets of extents in memory. things get really hairy with 0
* size extents
*
* within one bset:
*
* bkey_start_pos(bkey_next(k)) >= k
* or bkey_start_offset(bkey_next(k)) >= k->offset
*
* i.e. strict ordering, no overlapping extents.
*
* multiple bsets (i.e. full btree node):
*
* ∀ k, j
* k.size != 0 ∧ j.size != 0 →
* ¬ (k > bkey_start_pos(j) ∧ k < j)
*
* i.e. no two overlapping keys _of nonzero size_
*
* We can't realistically maintain this invariant for zero size keys because of
* the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
* there may be another 0 size key between them in another bset, and it will
* thus overlap with the merged key.
*
* In addition, the end of iter->pos indicates how much has been processed.
* If the end of iter->pos is not the same as the end of insert, then
* key insertion needs to continue/be retried.
*/
void bch2_insert_fixup_extent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
bool do_update = !bkey_whiteout(&insert->k);
struct bkey_packed *_k;
struct bkey unpacked;
k = bkey_disassemble(l->b, _k, &unpacked);
EBUG_ON(iter->level);
EBUG_ON(!insert->k.size);
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
/* Check if we're splitting a compressed extent: */
while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
KEY_TYPE_discard))) {
struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
enum bch_extent_overlap overlap =
bch2_extent_overlap(&insert->k, k.k);
if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 &&
bkey_cmp(insert->k.p, k.k->p) < 0 &&
(sectors = bch2_bkey_sectors_compressed(k))) {
int flags = trans->flags & BTREE_INSERT_NOFAIL
? BCH_DISK_RESERVATION_NOFAIL : 0;
if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
switch (bch2_disk_reservation_add(trans->c, trans->disk_res,
sectors, flags)) {
case 0:
break;
if (!bkey_whiteout(k.k))
do_update = true;
if (!do_update) {
struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
bch2_cut_front(cur_end, insert);
bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
} else {
extent_squash(c, iter, insert, _k, k, overlap);
case -ENOSPC:
return BTREE_INSERT_ENOSPC;
default:
BUG();
}
node_iter = l->iter;
if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
overlap == BCH_EXTENT_OVERLAP_MIDDLE)
break;
}
l->iter = node_iter;
bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
if (do_update) {
if (insert->k.type == KEY_TYPE_deleted)
insert->k.type = KEY_TYPE_discard;
if (!bkey_whiteout(&insert->k) ||
btree_node_old_extent_overwrite(l->b))
extent_bset_insert(c, iter, insert);
bch2_btree_journal_key(trans, iter, insert);
}
bch2_cut_front(insert->k.p, insert);
return BTREE_INSERT_OK;
}
......@@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *, struct btree_iter *,
struct bkey_i *, unsigned *);
void bch2_insert_fixup_extent(struct btree_trans *,
struct btree_iter *,
struct bkey_i *);
struct bkey_i *);
#endif /* _BCACHEFS_EXTENT_UPDATE_H */
......@@ -422,6 +422,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
POS(inode_nr + 1, 0), NULL);
}
static int bch2_fix_overlapping_extent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k, struct bpos cut_at)
{
struct btree_iter *u_iter;
struct bkey_i *u;
int ret;
u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
ret = PTR_ERR_OR_ZERO(u);
if (ret)
return ret;
bkey_reassemble(u, k);
bch2_cut_front(cut_at, u);
u_iter = bch2_trans_copy_iter(trans, iter);
ret = PTR_ERR_OR_ZERO(u_iter);
if (ret)
return ret;
/*
* We don't want to go through the
* extent_handle_overwrites path:
*/
__bch2_btree_iter_set_pos(u_iter, u->k.p, false);
/*
* XXX: this is going to leave disk space
* accounting slightly wrong
*/
ret = bch2_trans_update(trans, u_iter, u, 0);
bch2_trans_iter_put(trans, u_iter);
return ret;
}
/*
* Walk extents: verify that extents have a corresponding S_ISREG inode, and
* that i_size an i_sectors are consistent
......@@ -433,6 +469,7 @@ static int check_extents(struct bch_fs *c)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey prev = KEY(0, 0, 0);
u64 i_sectors;
int ret = 0;
......@@ -444,6 +481,25 @@ static int check_extents(struct bch_fs *c)
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
for_each_btree_key_continue(iter, 0, k, ret) {
if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {
char buf1[100];
char buf2[100];
bch2_bkey_to_text(&PBUF(buf1), &prev);
bch2_bkey_to_text(&PBUF(buf2), k.k);
if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) {
ret = __bch2_trans_do(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
bch2_fix_overlapping_extent(&trans,
iter, k, prev.p));
if (ret)
goto err;
}
}
prev = *k.k;
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
......
......@@ -161,13 +161,16 @@ static void journal_entries_free(struct list_head *list)
}
}
/*
* When keys compare equal, oldest compares first:
*/
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
return cmp_int(l->btree_id, r->btree_id) ?:
bkey_cmp(l->pos, r->pos) ?:
bkey_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
......@@ -179,25 +182,11 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
bkey_cmp(l->pos, r->pos);
}
static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i)
{
while (i + 1 < keys->d + keys->nr &&
journal_sort_key_cmp(i, i + 1) > 0) {
swap(i[0], i[1]);
i++;
}
bkey_cmp(l->k->k.p, r->k->k.p);
}
static void journal_keys_free(struct journal_keys *keys)
{
struct journal_key *i;
for_each_journal_key(*keys, i)
if (i->allocated)
kfree(i->k);
kvfree(keys->d);
keys->d = NULL;
keys->nr = 0;
......@@ -208,15 +197,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
struct journal_replay *p;
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct journal_keys keys = { NULL }, keys_deduped = { NULL };
struct journal_key *i;
struct journal_keys keys = { NULL };
struct journal_key *src, *dst;
size_t nr_keys = 0;
list_for_each_entry(p, journal_entries, list)
for_each_jset_key(k, _n, entry, &p->j)
nr_keys++;
keys.journal_seq_base = keys_deduped.journal_seq_base =
keys.journal_seq_base =
le64_to_cpu(list_first_entry(journal_entries,
struct journal_replay,
list)->j.seq);
......@@ -225,96 +214,31 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if (!keys.d)
goto err;
keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL);
if (!keys_deduped.d)
goto err;
list_for_each_entry(p, journal_entries, list)
for_each_jset_key(k, _n, entry, &p->j) {
if (bkey_deleted(&k->k) &&
btree_node_type_is_extents(entry->btree_id))
continue;
for_each_jset_key(k, _n, entry, &p->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.pos = bkey_start_pos(&k->k),
.k = k,
.journal_seq = le64_to_cpu(p->j.seq) -
keys.journal_seq_base,
.journal_offset = k->_data - p->j._data,
};
}
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
i = keys.d;
while (i < keys.d + keys.nr) {
if (i + 1 < keys.d + keys.nr &&
i[0].btree_id == i[1].btree_id &&
!bkey_cmp(i[0].pos, i[1].pos)) {
if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
i++;
} else {
bch2_cut_front(i[1].k->k.p, i[0].k);
i[0].pos = i[1].k->k.p;
journal_keys_sift(&keys, i);
}
continue;
}
if (i + 1 < keys.d + keys.nr &&
i[0].btree_id == i[1].btree_id &&
bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) {
if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k);
} else {
struct bkey_i *split =
kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
if (!split)
goto err;
bkey_copy(split, i[0].k);
bch2_cut_back(bkey_start_pos(&i[1].k->k), split);
keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
.btree_id = i[0].btree_id,
.allocated = true,
.pos = bkey_start_pos(&split->k),
.k = split,
.journal_seq = i[0].journal_seq,
.journal_offset = i[0].journal_offset,
};
bch2_cut_front(i[1].k->k.p, i[0].k);
i[0].pos = i[1].k->k.p;
journal_keys_sift(&keys, i);
continue;
}
} else {
if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) {
i[1] = i[0];
i++;
continue;
} else {
bch2_cut_front(i[0].k->k.p, i[1].k);
i[1].pos = i[0].k->k.p;
journal_keys_sift(&keys, i + 1);
continue;
}
}
}
src = dst = keys.d;
while (src < keys.d + keys.nr) {
while (src + 1 < keys.d + keys.nr &&
src[0].btree_id == src[1].btree_id &&
!bkey_cmp(src[0].k->k.p, src[1].k->k.p))
src++;
keys_deduped.d[keys_deduped.nr++] = *i++;
*dst++ = *src++;
}
kvfree(keys.d);
return keys_deduped;
keys.nr = dst - keys.d;
err:
journal_keys_free(&keys_deduped);
kvfree(keys.d);
return (struct journal_keys) { NULL };
return keys;
}
/* journal replay: */
......@@ -365,11 +289,6 @@ static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
split_iter = bch2_trans_copy_iter(&trans, iter);
ret = PTR_ERR_OR_ZERO(split_iter);
if (ret)
goto err;
split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
ret = PTR_ERR_OR_ZERO(split);
if (ret)
......@@ -388,12 +307,25 @@ static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
}
bkey_copy(split, k);
bch2_cut_front(split_iter->pos, split);
bch2_cut_front(iter->pos, split);
bch2_cut_back(atomic_end, split);
split_iter = bch2_trans_copy_iter(&trans, iter);
ret = PTR_ERR_OR_ZERO(split_iter);
if (ret)
goto err;
/*
* It's important that we don't go through the
* extent_handle_overwrites() and extent_update_to_keys() path
* here: journal replay is supposed to treat extents like
* regular keys
*/
__bch2_btree_iter_set_pos(split_iter, split->k.p, false);
bch2_trans_update(&trans, split_iter, split, !remark
? BTREE_TRIGGER_NORUN
: BTREE_TRIGGER_NOOVERWRITES);
bch2_btree_iter_set_pos(iter, split->k.p);
} while (bkey_cmp(iter->pos, k->k.p) < 0);
......@@ -424,11 +356,18 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
struct btree_iter *iter;
int ret;
iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
/*
* iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
* extent_handle_overwrites() and extent_update_to_keys() - but we don't
* want that here, journal replay is supposed to treat extents like
* regular keys:
*/
__bch2_btree_iter_set_pos(iter, k->k.p, false);
ret = bch2_btree_iter_traverse(iter) ?:
bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
bch2_trans_iter_put(trans, iter);
......@@ -459,7 +398,7 @@ static int bch2_journal_replay(struct bch_fs *c,
if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
else if (btree_node_type_is_extents(i->btree_id))
else if (i->k->k.size)
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
else
ret = bch2_journal_replay_key(c, i->btree_id, i->k);
......@@ -859,6 +798,15 @@ int bch2_fs_recovery(struct bch_fs *c)
journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
if (!c->sb.clean &&
!(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
ret = -EINVAL;
goto err;
}
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
ret = journal_replay_early(c, clean, &journal_entries);
if (ret)
goto err;
......
......@@ -5,8 +5,6 @@
struct journal_keys {
struct journal_key {
enum btree_id btree_id:8;
unsigned allocated:1;
struct bpos pos;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment