Commit d1b2c864 authored by Kent Overstreet's avatar Kent Overstreet

bcachefs: Defer full journal entry validation

On journal read, previously we would do full journal entry validation
immediately after reading a journal entry.

However, this would lead to errors for journal entries we weren't
actually going to use, either because they were too old or too new
(newer than the most recent flush).

We've observed write tearing on journal entries newer than the newest
flush - which makes sense, prior to a flush there's no guarantees about
write persistence.

This patch defers full journal entry validation until the end of the
journal read path, when we know which journal entries we'll want to use.
Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent 17fe3b64
......@@ -735,12 +735,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
static int jset_validate(struct bch_fs *c,
struct bch_dev *ca,
struct jset *jset, u64 sector,
unsigned bucket_sectors_left,
unsigned sectors_read,
int write)
{
size_t bytes = vstruct_bytes(jset);
struct bch_csum csum;
unsigned version;
int ret = 0;
......@@ -757,21 +753,7 @@ static int jset_validate(struct bch_fs *c,
sector, le64_to_cpu(jset->seq),
version)) {
/* don't try to continue: */
return EINVAL;
}
if (bytes > (sectors_read << 9) &&
sectors_read < bucket_sectors_left)
return JOURNAL_ENTRY_REREAD;
if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
c, jset, NULL,
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq), bytes)) {
ret = JOURNAL_ENTRY_BAD;
le32_add_cpu(&jset->u64s,
-((bytes - (bucket_sectors_left << 9)) / 8));
return -EINVAL;
}
if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
......@@ -779,28 +761,9 @@ static int jset_validate(struct bch_fs *c,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
JSET_CSUM_TYPE(jset))) {
ret = JOURNAL_ENTRY_BAD;
goto csum_done;
}
if (write)
goto csum_done;
csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum),
c, jset, NULL,
"%s sector %llu seq %llu: journal checksum bad",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq)))
JSET_CSUM_TYPE(jset)))
ret = JOURNAL_ENTRY_BAD;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
bch2_fs_fatal_err_on(ret, c,
"error decrypting journal entry: %i", ret);
csum_done:
/* last_seq is ignored when JSET_NO_FLUSH is true */
if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
......@@ -811,16 +774,52 @@ static int jset_validate(struct bch_fs *c,
jset->last_seq = jset->seq;
return JOURNAL_ENTRY_BAD;
}
ret = jset_validate_entries(c, jset, write);
fsck_err:
return ret;
}
static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
static int jset_validate_early(struct bch_fs *c,
struct bch_dev *ca,
struct jset *jset, u64 sector,
unsigned bucket_sectors_left,
unsigned sectors_read)
{
unsigned sectors = vstruct_sectors(jset, c->block_bits);
size_t bytes = vstruct_bytes(jset);
unsigned version;
int write = READ;
int ret = 0;
if (le64_to_cpu(jset->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
version = le32_to_cpu(jset->version);
if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
version < bcachefs_metadata_version_min) ||
version >= bcachefs_metadata_version_max,
c, jset, NULL,
"%s sector %llu seq %llu: unknown journal entry version %u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
version)) {
/* don't try to continue: */
return -EINVAL;
}
if (bytes > (sectors_read << 9) &&
sectors_read < bucket_sectors_left)
return JOURNAL_ENTRY_REREAD;
return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
jset_validate_entries(c, jset, WRITE);
if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
c, jset, NULL,
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq), bytes))
le32_add_cpu(&jset->u64s,
-((bytes - (bucket_sectors_left << 9)) / 8));
fsck_err:
return ret;
}
struct journal_read_buf {
......@@ -898,9 +897,8 @@ static int journal_read_bucket(struct bch_dev *ca,
j = buf->data;
}
ret = jset_validate(c, ca, j, offset,
end - offset, sectors_read,
READ);
ret = jset_validate_early(c, ca, j, offset,
end - offset, sectors_read);
switch (ret) {
case 0:
sectors = vstruct_sectors(j, c->block_bits);
......@@ -916,17 +914,13 @@ static int journal_read_bucket(struct bch_dev *ca,
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
return 0;
sectors = block_sectors(c);
goto next_block;
case JOURNAL_ENTRY_BAD:
saw_bad = true;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
* again at next block boundary:
*/
sectors = block_sectors(c);
break;
goto next_block;
default:
return ret;
}
......@@ -946,6 +940,12 @@ static int journal_read_bucket(struct bch_dev *ca,
if (!csum_good)
saw_bad = true;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
vstruct_end(j) - (void *) j->encrypted_start);
bch2_fs_fatal_err_on(ret, c,
"error decrypting journal entry: %i", ret);
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
.csum_good = csum_good,
......@@ -1153,6 +1153,14 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
*start_seq = le64_to_cpu(i->j.seq) + 1;
if (!JSET_NO_FLUSH(&i->j)) {
int write = READ;
if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
c, &i->j, NULL,
"invalid journal entry: last_seq > seq (%llu > %llu)",
le64_to_cpu(i->j.last_seq),
le64_to_cpu(i->j.seq)))
i->j.last_seq = i->j.seq;
last_seq = le64_to_cpu(i->j.last_seq);
*blacklist_seq = le64_to_cpu(i->j.seq) + 1;
break;
......@@ -1256,7 +1264,21 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
if (!i || i->ignore)
continue;
ret = jset_validate_entries(c, &i->j, READ);
for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
if (!i->ptrs[ptr].csum_good)
printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n",
ca->name, i->ptrs[ptr].sector,
le64_to_cpu(i->j.seq),
i->csum_good ? " (had good copy on another device)" : "");
}
ret = jset_validate(c,
bch_dev_bkey_exists(c, i->ptrs[0].dev),
&i->j,
i->ptrs[0].sector,
READ);
if (ret)
goto err;
......@@ -1694,7 +1716,7 @@ void bch2_journal_write(struct closure *cl)
validate_before_checksum = true;
if (validate_before_checksum &&
jset_validate_for_write(c, jset))
jset_validate(c, NULL, jset, 0, WRITE))
goto err;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
......@@ -1708,7 +1730,7 @@ void bch2_journal_write(struct closure *cl)
journal_nonce(jset), jset);
if (!validate_before_checksum &&
jset_validate_for_write(c, jset))
jset_validate(c, NULL, jset, 0, WRITE))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment