Commit 9dec2a47 authored by Kent Overstreet

bcachefs: Accumulate accounting keys in journal replay

Until accounting keys hit the btree, they are deltas, not new versions
of the existing key; this means we have to teach journal replay to
accumulate them.

Additionally, the journal doesn't track precisely which entries have
been flushed to the btree; it only tracks a range of entries that may
possibly still need to be flushed.

That means we need to compare accounting keys against the version in the
btree and only flush updates that are newer.

There's another wrinkle with the write buffer: if the write buffer
starts flushing accounting keys before journal replay has finished
flushing accounting keys, journal replay will see the version number
from the new updates and updates from the journal will be lost.

To avoid this, journal replay has to flush accounting keys first, and
we'll be adding a flag so that write buffer flush knows to hold
accounting keys until then.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 2744e5c9
......@@ -16,21 +16,6 @@
* operations for the regular btree iter code to use:
*/
/*
 * Compare the tuple (l_btree_id, l_level, l_pos) against journal key @r.
 *
 * The fields are compared in that order - btree id, then level, then
 * position - and the result of the first comparison that is non-zero is
 * returned.  The result is zero only if all three comparisons are zero.
 */
static int __journal_key_cmp(enum btree_id l_btree_id,
			     unsigned l_level,
			     struct bpos l_pos,
			     const struct journal_key *r)
{
	int order = cmp_int(l_btree_id, r->btree_id);

	if (order)
		return order;

	order = cmp_int(l_level, r->level);
	if (order)
		return order;

	return bpos_cmp(l_pos, r->k->k.p);
}
static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
size_t gap_size = keys->size - keys->nr;
......@@ -548,7 +533,13 @@ static void __journal_keys_sort(struct journal_keys *keys)
struct journal_key *dst = keys->data;
darray_for_each(*keys, src) {
if (src + 1 < &darray_top(*keys) &&
/*
* We don't accumulate accounting keys here because we have to
* compare each individual accounting key against the version in
* the btree during replay:
*/
if (src->k->k.type != KEY_TYPE_accounting &&
src + 1 < &darray_top(*keys) &&
!journal_key_cmp(src, src + 1))
continue;
......
......@@ -26,6 +26,21 @@ struct btree_and_journal_iter {
bool prefetch;
};
/*
 * Compare the tuple (l_btree_id, l_level, l_pos) against journal key @r.
 *
 * The fields are compared in that order - btree id, then level, then
 * position - and the result of the first comparison that is non-zero is
 * returned.  The result is zero only if all three comparisons are zero.
 */
static inline int __journal_key_cmp(enum btree_id l_btree_id,
				    unsigned l_level,
				    struct bpos l_pos,
				    const struct journal_key *r)
{
	int order = cmp_int(l_btree_id, r->btree_id);

	if (order)
		return order;

	order = cmp_int(l_level, r->level);
	if (order)
		return order;

	return bpos_cmp(l_pos, r->k->k.p);
}
static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
......
......@@ -785,7 +785,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
/*
* Accounting keys aren't deduped in the journal: we have to compare
* each individual update against what's in the btree to see if it has
* been applied yet, and accounting updates also don't overwrite,
* they're deltas that accumulate.
*/
trans_for_each_update(trans, i)
if (i->k->k.type != KEY_TYPE_accounting)
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}
......@@ -993,15 +1000,24 @@ static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
	/*
	 * Insert everything this transaction carries into the in-memory
	 * journal keys: first each btree update, then every key-bearing
	 * journal entry (btree_keys and write_buffer_keys) queued on the
	 * transaction.
	 *
	 * Returns 0 on success, or the first error reported by
	 * bch2_journal_key_insert(); processing stops at that error.
	 */
	struct bch_fs *c = trans->c;

	trans_for_each_update(trans, i) {
		int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
		if (ret)
			return ret;
	}

	/* The entries are packed back to back; the end is journal_entries_u64s u64s in. */
	for (struct jset_entry *i = trans->journal_entries;
	     i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
	     i = vstruct_next(i)) {
		if (i->type == BCH_JSET_ENTRY_btree_keys ||
		    i->type == BCH_JSET_ENTRY_write_buffer_keys) {
			int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
			if (ret)
				return ret;
		}
	}

	return 0;
}
int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
......
......@@ -130,7 +130,19 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr
enum btree_id btree,
struct bkey_i *k)
{
if (unlikely(trans->journal_replay_not_finished))
/*
* Most updates skip the btree write buffer until journal replay is
* finished because synchronization with journal replay relies on having
* a btree node locked - if we're overwriting a key in the journal that
* journal replay hasn't yet replayed, we have to mark it as
* overwritten.
*
* But accounting updates don't overwrite, they're deltas, and they have
* to be flushed to the btree strictly in order for journal replay to be
* able to tell which updates need to be applied:
*/
if (k->k.type != KEY_TYPE_accounting &&
unlikely(trans->journal_replay_not_finished))
return bch2_btree_insert_clone_trans(trans, btree, k);
struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
......
......@@ -10,6 +10,7 @@
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
......@@ -135,6 +136,47 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
/*
 * Replay a single accounting key from the journal.
 *
 * Looks up the slot at the key's position in the btree and compares
 * versions: if the btree's version is greater than or equal to the journal
 * key's, nothing is queued and 0 is returned.  Otherwise the key is queued
 * as a transaction update; when the slot already holds an accounting key,
 * a mutable copy of the journal key is made and the existing btree value
 * is combined into it with bch2_accounting_accumulate() first.
 *
 * Returns 0 on success or when the key is skipped, otherwise the error from
 * the iterator traverse, the key copy, or bch2_trans_update().
 */
static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
					      struct journal_key *k)
{
	struct btree_iter iter;
	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
				  BTREE_MAX_DEPTH, k->level,
				  BTREE_ITER_intent);
	int ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	struct bkey u;
	struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);

	/* Has this delta already been applied to the btree? */
	if (bversion_cmp(old.k->version, k->k->k.version) >= 0) {
		ret = 0;
		goto out;
	}

	struct bkey_i *new = k->k;
	if (old.k->type == KEY_TYPE_accounting) {
		/* Work on a copy so the key held in the journal is left unmodified. */
		new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto out;

		bch2_accounting_accumulate(bkey_i_to_accounting(new),
					   bkey_s_c_to_accounting(old));
	}

	/* Record the sequence number of the journal entry this key came from. */
	trans->journal_res.seq = k->journal_seq;

	ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
static int bch2_journal_replay_key(struct btree_trans *trans,
struct journal_key *k)
{
......@@ -185,6 +227,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (k->overwritten)
goto out;
if (k->k->k.type == KEY_TYPE_accounting) {
ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
goto out;
}
ret = bch2_trans_update(trans, &iter, k->k, update_flags);
out:
bch2_trans_iter_exit(trans, &iter);
......@@ -222,6 +269,27 @@ int bch2_journal_replay(struct bch_fs *c)
move_gap(keys, keys->nr);
trans = bch2_trans_get(c);
/*
* Replay accounting keys first: we can't allow the write buffer to
* flush accounting keys until we're done
*/
darray_for_each(*keys, k) {
if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
continue;
cond_resched();
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_journal_res,
bch2_journal_replay_accounting_key(trans, k));
if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
goto err;
k->overwritten = true;
}
/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
......@@ -244,7 +312,7 @@ int bch2_journal_replay(struct bch_fs *c)
BCH_TRANS_COMMIT_journal_reclaim|
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
BUG_ON(!ret && !k->overwritten);
BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
if (ret) {
ret = darray_push(&keys_sorted, k);
if (ret)
......@@ -281,7 +349,7 @@ int bch2_journal_replay(struct bch_fs *c)
if (ret)
goto err;
BUG_ON(!k->overwritten);
BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
}
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment