Commit c3de9b57 authored by Linus Torvalds

Merge tag 'bcachefs-2024-06-22' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Lots of (mostly boring) fixes for syzbot bugs and rare(r) CI bugs.

  The LRU_TIME_BITS fix was slightly more involved; we only have 48 bits
  for the LRU position (we would prefer 64), so wraparound is possible
  for the cached data LRUs on a filesystem that has done sufficient
  (petabytes) reads; this is now handled.

  One notable user reported bugfix, where we were forgetting to
  correctly set the bucket data type, which should have been
  BCH_DATA_need_gc_gens instead of BCH_DATA_free; this was causing us to
  go emergency read-only on a filesystem that had seen heavy enough use
  to see bucket gen wraparound.

  We're now starting to fix simple (safe) errors without requiring user
  intervention - i.e. a small incremental step towards full self
  healing.

  This is currently limited to just certain allocation information
  counters, and the error is still logged in the superblock; see that
  patch for more information. ("bcachefs: Fix safe errors by default")"

* tag 'bcachefs-2024-06-22' of https://evilpiepirate.org/git/bcachefs: (22 commits)
  bcachefs: Move the ei_flags setting to after initialization
  bcachefs: Fix a UAF after write_super()
  bcachefs: Use bch2_print_string_as_lines for long err
  bcachefs: Fix I_NEW warning in race path in bch2_inode_insert()
  bcachefs: Replace bare EEXIST with private error codes
  bcachefs: Fix missing alloc_data_type_set()
  closures: Change BUG_ON() to WARN_ON()
  bcachefs: fix alignment of VMA for memory mapped files on THP
  bcachefs: Fix safe errors by default
  bcachefs: Fix bch2_trans_put()
  bcachefs: set_worker_desc() for delete_dead_snapshots
  bcachefs: Fix bch2_sb_downgrade_update()
  bcachefs: Handle cached data LRU wraparound
  bcachefs: Guard against overflowing LRU_TIME_BITS
  bcachefs: delete_dead_snapshots() doesn't need to go RW
  bcachefs: Fix early init error path in journal code
  bcachefs: Check for invalid btree IDs
  bcachefs: Fix btree ID bitmasks
  bcachefs: Fix shift overflow in read_one_super()
  bcachefs: Fix a locking bug in the do_discard_fast() path
  ...
parents da3b6ef1 bd4da046
...@@ -259,6 +259,14 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, ...@@ -259,6 +259,14 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
"invalid data type (got %u should be %u)", "invalid data type (got %u should be %u)",
a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
for (unsigned i = 0; i < 2; i++)
bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
c, err,
alloc_key_io_time_bad,
"invalid io_time[%s]: %llu, max %llu",
i == READ ? "read" : "write",
a.v->io_time[i], LRU_TIME_MAX);
switch (a.v->data_type) { switch (a.v->data_type) {
case BCH_DATA_free: case BCH_DATA_free:
case BCH_DATA_need_gc_gens: case BCH_DATA_need_gc_gens:
...@@ -757,8 +765,8 @@ int bch2_trigger_alloc(struct btree_trans *trans, ...@@ -757,8 +765,8 @@ int bch2_trigger_alloc(struct btree_trans *trans,
alloc_data_type_set(new_a, new_a->data_type); alloc_data_type_set(new_a, new_a->data_type);
if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[READ] = bch2_current_io_time(c, READ);
new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
} }
...@@ -768,6 +776,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, ...@@ -768,6 +776,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
new_a->gen++; new_a->gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
alloc_data_type_set(new_a, new_a->data_type);
} }
if (old_a->data_type != new_a->data_type || if (old_a->data_type != new_a->data_type ||
...@@ -781,7 +790,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, ...@@ -781,7 +790,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (new_a->data_type == BCH_DATA_cached && if (new_a->data_type == BCH_DATA_cached &&
!new_a->io_time[READ]) !new_a->io_time[READ])
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[READ] = bch2_current_io_time(c, READ);
u64 old_lru = alloc_lru_idx_read(*old_a); u64 old_lru = alloc_lru_idx_read(*old_a);
u64 new_lru = alloc_lru_idx_read(*new_a); u64 new_lru = alloc_lru_idx_read(*new_a);
...@@ -882,7 +891,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, ...@@ -882,7 +891,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
closure_wake_up(&c->freelist_wait); closure_wake_up(&c->freelist_wait);
if (statechange(a->data_type == BCH_DATA_need_discard) && if (statechange(a->data_type == BCH_DATA_need_discard) &&
!bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
bucket_flushed(new_a)) bucket_flushed(new_a))
bch2_discard_one_bucket_fast(c, new.k->p); bch2_discard_one_bucket_fast(c, new.k->p);
...@@ -1579,7 +1588,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, ...@@ -1579,7 +1588,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret) if (ret)
goto err; goto err;
a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
ret = bch2_trans_update(trans, alloc_iter, ret = bch2_trans_update(trans, alloc_iter,
&a_mut->k_i, BTREE_TRIGGER_norun); &a_mut->k_i, BTREE_TRIGGER_norun);
if (ret) if (ret)
...@@ -1634,7 +1643,7 @@ static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) ...@@ -1634,7 +1643,7 @@ static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
mutex_lock(&c->discard_buckets_in_flight_lock); mutex_lock(&c->discard_buckets_in_flight_lock);
darray_for_each(c->discard_buckets_in_flight, i) darray_for_each(c->discard_buckets_in_flight, i)
if (bkey_eq(*i, bucket)) { if (bkey_eq(*i, bucket)) {
ret = -EEXIST; ret = -BCH_ERR_EEXIST_discard_in_flight_add;
goto out; goto out;
} }
...@@ -1788,8 +1797,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, ...@@ -1788,8 +1797,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
} }
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
alloc_data_type_set(&a->v, a->v.data_type);
write: write:
alloc_data_type_set(&a->v, a->v.data_type);
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL, bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree| BCH_WATERMARK_btree|
...@@ -1975,8 +1985,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, ...@@ -1975,8 +1985,8 @@ static int invalidate_one_bucket(struct btree_trans *trans,
a->v.data_type = 0; a->v.data_type = 0;
a->v.dirty_sectors = 0; a->v.dirty_sectors = 0;
a->v.cached_sectors = 0; a->v.cached_sectors = 0;
a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); a->v.io_time[READ] = bch2_current_io_time(c, READ);
a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
ret = bch2_trans_commit(trans, NULL, NULL, ret = bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree| BCH_WATERMARK_btree|
...@@ -2011,6 +2021,21 @@ static int invalidate_one_bucket(struct btree_trans *trans, ...@@ -2011,6 +2021,21 @@ static int invalidate_one_bucket(struct btree_trans *trans,
goto out; goto out;
} }
/*
 * Peek the next key in the LRU btree for device @ca, searching from the
 * iterator's current position up to lru_pos(dev_idx, U64_MAX, LRU_TIME_MAX).
 *
 * If nothing is found and *@wrapped is still false, the iterator is moved to
 * lru_pos(dev_idx, 0, 0), *@wrapped is set, and the peek is repeated, so the
 * scan wraps around at most once per *@wrapped flag.
 *
 * Returns whatever bch2_btree_iter_peek_upto() produced; k.k is NULL when no
 * key was found after wrapping. NOTE(review): the caller in this diff checks
 * bkey_err() on the result, so error keys are presumably passed through here
 * unchanged - confirm.
 */
static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
				    struct bch_dev *ca, bool *wrapped)
{
	struct bkey_s_c found;

	for (;;) {
		found = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));

		/* Got something, or we already wrapped once: hand it back. */
		if (found.k || *wrapped)
			return found;

		/* Ran off the end: restart from the beginning of this device's range. */
		bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
		*wrapped = true;
	}
}
static void bch2_do_invalidates_work(struct work_struct *work) static void bch2_do_invalidates_work(struct work_struct *work)
{ {
struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
...@@ -2024,12 +2049,33 @@ static void bch2_do_invalidates_work(struct work_struct *work) ...@@ -2024,12 +2049,33 @@ static void bch2_do_invalidates_work(struct work_struct *work)
for_each_member_device(c, ca) { for_each_member_device(c, ca) {
s64 nr_to_invalidate = s64 nr_to_invalidate =
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
struct btree_iter iter;
bool wrapped = false;
bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
lru_pos(ca->dev_idx, 0,
((bch2_current_io_time(c, READ) + U32_MAX) &
LRU_TIME_MAX)), 0);
ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, while (true) {
lru_pos(ca->dev_idx, 0, 0), bch2_trans_begin(trans);
lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
BTREE_ITER_intent, k, struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
if (!k.k)
break;
ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
if (ret)
break;
bch2_btree_iter_advance(&iter);
}
bch2_trans_iter_exit(trans, &iter);
if (ret < 0) { if (ret < 0) {
bch2_dev_put(ca); bch2_dev_put(ca);
...@@ -2204,7 +2250,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, ...@@ -2204,7 +2250,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
if (ret) if (ret)
return ret; return ret;
now = atomic64_read(&c->io_clock[rw].now); now = bch2_current_io_time(c, rw);
if (a->v.io_time[rw] == now) if (a->v.io_time[rw] == now)
goto out; goto out;
......
...@@ -141,7 +141,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, ...@@ -141,7 +141,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
!bch2_bucket_sectors_fragmented(ca, a)) !bch2_bucket_sectors_fragmented(ca, a))
return 0; return 0;
u64 d = bch2_bucket_sectors_dirty(a); /*
* avoid overflowing LRU_TIME_BITS on a corrupted fs, when
* bucket_sectors_dirty is (much) bigger than bucket_size
*/
u64 d = min(bch2_bucket_sectors_dirty(a),
ca->mi.bucket_size);
return div_u64(d * (1ULL << 31), ca->mi.bucket_size); return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
} }
......
...@@ -1214,6 +1214,11 @@ static inline s64 bch2_current_time(const struct bch_fs *c) ...@@ -1214,6 +1214,11 @@ static inline s64 bch2_current_time(const struct bch_fs *c)
return timespec_to_bch2_time(c, now); return timespec_to_bch2_time(c, now);
} }
static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
{
return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
}
static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
{ {
struct stdio_redirect *stdio = c->stdio; struct stdio_redirect *stdio = c->stdio;
......
...@@ -476,6 +476,9 @@ struct bch_lru { ...@@ -476,6 +476,9 @@ struct bch_lru {
#define LRU_ID_STRIPES (1U << 16) #define LRU_ID_STRIPES (1U << 16)
#define LRU_TIME_BITS 48
#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
/* Optional/variable size superblock sections: */ /* Optional/variable size superblock sections: */
struct bch_sb_field { struct bch_sb_field {
...@@ -987,8 +990,9 @@ enum bch_version_upgrade_opts { ...@@ -987,8 +990,9 @@ enum bch_version_upgrade_opts {
#define BCH_ERROR_ACTIONS() \ #define BCH_ERROR_ACTIONS() \
x(continue, 0) \ x(continue, 0) \
x(ro, 1) \ x(fix_safe, 1) \
x(panic, 2) x(panic, 2) \
x(ro, 3)
enum bch_error_actions { enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n, #define x(t, n) BCH_ON_ERROR_##t = n,
...@@ -1382,9 +1386,10 @@ enum btree_id { ...@@ -1382,9 +1386,10 @@ enum btree_id {
/* /*
* Maximum number of btrees that we will _ever_ have under the current scheme, * Maximum number of btrees that we will _ever_ have under the current scheme,
* where we refer to them with bitfields * where we refer to them with 64 bit bitfields - and we also need a bit for
* the interior btree node type:
*/ */
#define BTREE_ID_NR_MAX 64 #define BTREE_ID_NR_MAX 63
static inline bool btree_id_is_alloc(enum btree_id id) static inline bool btree_id_is_alloc(enum btree_id id)
{ {
......
...@@ -1064,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) ...@@ -1064,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
{ {
const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
u8 *l = k->key_start; u8 *l = k->key_start;
u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;
while (l < h) { while (l < h) {
swap(*l, *h); swap(*l, *h);
......
...@@ -398,8 +398,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, ...@@ -398,8 +398,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
for (i = 0; i < nr_compat; i++) for (i = 0; i < nr_compat; i++)
switch (!write ? i : nr_compat - 1 - i) { switch (!write ? i : nr_compat - 1 - i) {
case 0: case 0:
if (big_endian != CPU_BIG_ENDIAN) if (big_endian != CPU_BIG_ENDIAN) {
bch2_bkey_swab_key(f, k);
} else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
bch2_bkey_swab_key(f, k); bch2_bkey_swab_key(f, k);
bch2_bkey_swab_key(f, k);
}
break; break;
case 1: case 1:
if (version < bcachefs_metadata_version_bkey_renumber) if (version < bcachefs_metadata_version_bkey_renumber)
......
...@@ -129,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, ...@@ -129,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
struct bkey_packed *k) struct bkey_packed *k)
{ {
if (version < bcachefs_metadata_version_current || if (version < bcachefs_metadata_version_current ||
big_endian != CPU_BIG_ENDIAN) big_endian != CPU_BIG_ENDIAN ||
IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
__bch2_bkey_compat(level, btree_id, version, __bch2_bkey_compat(level, btree_id, version,
big_endian, write, f, k); big_endian, write, f, k);
......
...@@ -3161,6 +3161,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) ...@@ -3161,6 +3161,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
list_add_done: list_add_done:
seqmutex_unlock(&c->btree_trans_lock); seqmutex_unlock(&c->btree_trans_lock);
got_trans: got_trans:
trans->ref.closure_get_happened = false;
trans->c = c; trans->c = c;
trans->last_begin_time = local_clock(); trans->last_begin_time = local_clock();
trans->fn_idx = fn_idx; trans->fn_idx = fn_idx;
...@@ -3235,7 +3236,6 @@ void bch2_trans_put(struct btree_trans *trans) ...@@ -3235,7 +3236,6 @@ void bch2_trans_put(struct btree_trans *trans)
trans_for_each_update(trans, i) trans_for_each_update(trans, i)
__btree_path_put(trans->paths + i->path, true); __btree_path_put(trans->paths + i->path, true);
trans->nr_updates = 0; trans->nr_updates = 0;
trans->locking_wait.task = NULL;
check_btree_paths_leaked(trans); check_btree_paths_leaked(trans);
...@@ -3256,6 +3256,13 @@ void bch2_trans_put(struct btree_trans *trans) ...@@ -3256,6 +3256,13 @@ void bch2_trans_put(struct btree_trans *trans)
if (unlikely(trans->journal_replay_not_finished)) if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c); bch2_journal_keys_put(c);
/*
* trans->ref protects trans->locking_wait.task, btree_paths array; used
* by cycle detector
*/
closure_sync(&trans->ref);
trans->locking_wait.task = NULL;
unsigned long *paths_allocated = trans->paths_allocated; unsigned long *paths_allocated = trans->paths_allocated;
trans->paths_allocated = NULL; trans->paths_allocated = NULL;
trans->paths = NULL; trans->paths = NULL;
...@@ -3273,8 +3280,6 @@ void bch2_trans_put(struct btree_trans *trans) ...@@ -3273,8 +3280,6 @@ void bch2_trans_put(struct btree_trans *trans)
trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
if (trans) { if (trans) {
closure_sync(&trans->ref);
seqmutex_lock(&c->btree_trans_lock); seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list); list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock); seqmutex_unlock(&c->btree_trans_lock);
......
...@@ -761,13 +761,13 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) ...@@ -761,13 +761,13 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
static inline bool btree_node_type_is_extents(enum btree_node_type type) static inline bool btree_node_type_is_extents(enum btree_node_type type)
{ {
const unsigned mask = 0 const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
BCH_BTREE_IDS() BCH_BTREE_IDS()
#undef x #undef x
; ;
return (1U << type) & mask; return BIT_ULL(type) & mask;
} }
static inline bool btree_id_is_extents(enum btree_id btree) static inline bool btree_id_is_extents(enum btree_id btree)
...@@ -777,35 +777,35 @@ static inline bool btree_id_is_extents(enum btree_id btree) ...@@ -777,35 +777,35 @@ static inline bool btree_id_is_extents(enum btree_id btree)
static inline bool btree_type_has_snapshots(enum btree_id id) static inline bool btree_type_has_snapshots(enum btree_id id)
{ {
const unsigned mask = 0 const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
BCH_BTREE_IDS() BCH_BTREE_IDS()
#undef x #undef x
; ;
return (1U << id) & mask; return BIT_ULL(id) & mask;
} }
static inline bool btree_type_has_snapshot_field(enum btree_id id) static inline bool btree_type_has_snapshot_field(enum btree_id id)
{ {
const unsigned mask = 0 const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) #define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
BCH_BTREE_IDS() BCH_BTREE_IDS()
#undef x #undef x
; ;
return (1U << id) & mask; return BIT_ULL(id) & mask;
} }
static inline bool btree_type_has_ptrs(enum btree_id id) static inline bool btree_type_has_ptrs(enum btree_id id)
{ {
const unsigned mask = 0 const u64 mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
BCH_BTREE_IDS() BCH_BTREE_IDS()
#undef x #undef x
; ;
return (1U << id) & mask; return BIT_ULL(id) & mask;
} }
struct btree_root { struct btree_root {
......
...@@ -116,6 +116,9 @@ ...@@ -116,6 +116,9 @@
x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(EEXIST, EEXIST_str_hash_set) \
x(EEXIST, EEXIST_discard_in_flight_add) \
x(EEXIST, EEXIST_subvolume_create) \
x(0, open_buckets_empty) \ x(0, open_buckets_empty) \
x(0, freelist_empty) \ x(0, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \ x(BCH_ERR_freelist_empty, no_buckets_found) \
......
...@@ -15,6 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) ...@@ -15,6 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
switch (c->opts.errors) { switch (c->opts.errors) {
case BCH_ON_ERROR_continue: case BCH_ON_ERROR_continue:
return false; return false;
case BCH_ON_ERROR_fix_safe:
case BCH_ON_ERROR_ro: case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c)) if (bch2_fs_emergency_read_only(c))
bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
...@@ -191,6 +192,12 @@ static void prt_actioning(struct printbuf *out, const char *action) ...@@ -191,6 +192,12 @@ static void prt_actioning(struct printbuf *out, const char *action)
prt_str(out, "ing"); prt_str(out, "ing");
} }
/*
 * Extra fsck flags per error type, indexed by BCH_FSCK_ERR_* id.
 *
 * Built from the BCH_SB_ERRS() x-macro list: each entry's third field
 * (flags) is stored at the slot for that entry's error id.
 */
static const u8 fsck_flags_extra[] = {
#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags,
BCH_SB_ERRS()
#undef x
};
int bch2_fsck_err(struct bch_fs *c, int bch2_fsck_err(struct bch_fs *c,
enum bch_fsck_flags flags, enum bch_fsck_flags flags,
enum bch_sb_error_id err, enum bch_sb_error_id err,
...@@ -203,6 +210,9 @@ int bch2_fsck_err(struct bch_fs *c, ...@@ -203,6 +210,9 @@ int bch2_fsck_err(struct bch_fs *c,
int ret = -BCH_ERR_fsck_ignore; int ret = -BCH_ERR_fsck_ignore;
const char *action_orig = "fix?", *action = action_orig; const char *action_orig = "fix?", *action = action_orig;
if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
flags |= fsck_flags_extra[err];
if ((flags & FSCK_CAN_FIX) && if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent)) test_bit(err, c->sb.errors_silent))
return -BCH_ERR_fsck_fix; return -BCH_ERR_fsck_fix;
...@@ -265,7 +275,14 @@ int bch2_fsck_err(struct bch_fs *c, ...@@ -265,7 +275,14 @@ int bch2_fsck_err(struct bch_fs *c,
prt_printf(out, bch2_log_msg(c, "")); prt_printf(out, bch2_log_msg(c, ""));
#endif #endif
if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if ((flags & FSCK_CAN_FIX) &&
(flags & FSCK_AUTOFIX) &&
(c->opts.errors == BCH_ON_ERROR_continue ||
c->opts.errors == BCH_ON_ERROR_fix_safe)) {
prt_str(out, ", ");
prt_actioning(out, action);
ret = -BCH_ERR_fsck_fix;
} else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue || if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
prt_str(out, ", shutting down"); prt_str(out, ", shutting down");
......
...@@ -108,13 +108,6 @@ struct fsck_err_state { ...@@ -108,13 +108,6 @@ struct fsck_err_state {
char *last_msg; char *last_msg;
}; };
enum bch_fsck_flags {
FSCK_CAN_FIX = 1 << 0,
FSCK_CAN_IGNORE = 1 << 1,
FSCK_NEED_FSCK = 1 << 2,
FSCK_NO_RATELIMIT = 1 << 3,
};
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
__printf(4, 5) __cold __printf(4, 5) __cold
......
...@@ -373,7 +373,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, ...@@ -373,7 +373,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
} }
if (dst_dentry->d_inode) { if (dst_dentry->d_inode) {
error = -EEXIST; error = -BCH_ERR_EEXIST_subvolume_create;
goto err3; goto err3;
} }
......
...@@ -188,6 +188,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino ...@@ -188,6 +188,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
BUG_ON(!old); BUG_ON(!old);
if (unlikely(old != inode)) { if (unlikely(old != inode)) {
/*
* bcachefs doesn't use I_NEW; we have no use for it since we
* only insert fully created inodes in the inode hash table. But
* discard_new_inode() expects it to be set...
*/
inode->v.i_flags |= I_NEW;
discard_new_inode(&inode->v); discard_new_inode(&inode->v);
inode = old; inode = old;
} else { } else {
...@@ -195,8 +201,10 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino ...@@ -195,8 +201,10 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock); mutex_unlock(&c->vfs_inodes_lock);
/* /*
* we really don't want insert_inode_locked2() to be setting * Again, I_NEW makes no sense for bcachefs. This is only needed
* I_NEW... * for clearing I_NEW, but since the inode was already fully
* created and initialized we didn't actually want
* inode_insert5() to set it for us.
*/ */
unlock_new_inode(&inode->v); unlock_new_inode(&inode->v);
} }
...@@ -1157,6 +1165,7 @@ static const struct file_operations bch_file_operations = { ...@@ -1157,6 +1165,7 @@ static const struct file_operations bch_file_operations = {
.read_iter = bch2_read_iter, .read_iter = bch2_read_iter,
.write_iter = bch2_write_iter, .write_iter = bch2_write_iter,
.mmap = bch2_mmap, .mmap = bch2_mmap,
.get_unmapped_area = thp_get_unmapped_area,
.fsync = bch2_fsync, .fsync = bch2_fsync,
.splice_read = filemap_splice_read, .splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write, .splice_write = iter_file_splice_write,
...@@ -1488,11 +1497,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, ...@@ -1488,11 +1497,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
bch2_iget5_set(&inode->v, &inum); bch2_iget5_set(&inode->v, &inum);
bch2_inode_update_after_write(trans, inode, bi, ~0); bch2_inode_update_after_write(trans, inode, bi, ~0);
if (BCH_SUBVOLUME_SNAP(subvol))
set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
else
clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
inode->v.i_blocks = bi->bi_sectors; inode->v.i_blocks = bi->bi_sectors;
inode->v.i_ino = bi->bi_inum; inode->v.i_ino = bi->bi_inum;
inode->v.i_rdev = bi->bi_dev; inode->v.i_rdev = bi->bi_dev;
...@@ -1504,6 +1508,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, ...@@ -1504,6 +1508,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_qid = bch_qid(bi); inode->ei_qid = bch_qid(bi);
inode->ei_subvol = inum.subvol; inode->ei_subvol = inum.subvol;
if (BCH_SUBVOLUME_SNAP(subvol))
set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
inode->v.i_mapping->a_ops = &bch_address_space_operations; inode->v.i_mapping->a_ops = &bch_address_space_operations;
switch (inode->v.i_mode & S_IFMT) { switch (inode->v.i_mode & S_IFMT) {
......
...@@ -1167,6 +1167,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) ...@@ -1167,6 +1167,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j) void bch2_fs_journal_stop(struct journal *j)
{ {
if (!test_bit(JOURNAL_running, &j->flags))
return;
bch2_journal_reclaim_stop(j); bch2_journal_reclaim_stop(j);
bch2_journal_flush_all_pins(j); bch2_journal_flush_all_pins(j);
......
...@@ -1967,7 +1967,6 @@ CLOSURE_CALLBACK(bch2_journal_write) ...@@ -1967,7 +1967,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
struct journal *j = container_of(w, struct journal, buf[w->idx]); struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas; struct bch_replicas_padded replicas;
struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0; unsigned nr_rw_members = 0;
int ret; int ret;
...@@ -2011,11 +2010,15 @@ CLOSURE_CALLBACK(bch2_journal_write) ...@@ -2011,11 +2010,15 @@ CLOSURE_CALLBACK(bch2_journal_write)
} }
if (ret) { if (ret) {
__bch2_journal_debug_to_text(&journal_debug_buf, j); struct printbuf buf = PRINTBUF;
buf.atomic++;
prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"),
bch2_err_str(ret));
__bch2_journal_debug_to_text(&buf, j);
spin_unlock(&j->lock); spin_unlock(&j->lock);
bch_err(c, "Unable to allocate journal write:\n%s", bch2_print_string_as_lines(KERN_ERR, buf.buf);
journal_debug_buf.buf); printbuf_exit(&buf);
printbuf_exit(&journal_debug_buf);
goto err; goto err;
} }
......
...@@ -2,9 +2,6 @@ ...@@ -2,9 +2,6 @@
#ifndef _BCACHEFS_LRU_H #ifndef _BCACHEFS_LRU_H
#define _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H
#define LRU_TIME_BITS 48
#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
static inline u64 lru_pos_id(struct bpos pos) static inline u64 lru_pos_id(struct bpos pos)
{ {
return pos.inode >> LRU_TIME_BITS; return pos.inode >> LRU_TIME_BITS;
......
...@@ -137,7 +137,7 @@ enum fsck_err_opts { ...@@ -137,7 +137,7 @@ enum fsck_err_opts {
x(errors, u8, \ x(errors, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \ OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \ NULL, "Action to take on filesystem error") \
x(metadata_replicas, u8, \ x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
......
...@@ -326,6 +326,12 @@ static int journal_replay_entry_early(struct bch_fs *c, ...@@ -326,6 +326,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_btree_root: { case BCH_JSET_ENTRY_btree_root: {
struct btree_root *r; struct btree_root *r;
if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
c, invalid_btree_id,
"invalid btree id %u (max %u)",
entry->btree_id, BTREE_ID_NR_MAX))
return 0;
while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
if (ret) if (ret)
...@@ -415,7 +421,7 @@ static int journal_replay_entry_early(struct bch_fs *c, ...@@ -415,7 +421,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
} }
} }
fsck_err:
return ret; return ret;
} }
...@@ -658,10 +664,10 @@ int bch2_fs_recovery(struct bch_fs *c) ...@@ -658,10 +664,10 @@ int bch2_fs_recovery(struct bch_fs *c)
if (check_version_upgrade(c)) if (check_version_upgrade(c))
write_sb = true; write_sb = true;
c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
if (write_sb) if (write_sb)
bch2_write_super(c); bch2_write_super(c);
c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
......
...@@ -228,7 +228,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) ...@@ -228,7 +228,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
dst = (void *) &darray_top(table); dst = (void *) &darray_top(table);
dst->version = cpu_to_le16(src->version); dst->version = cpu_to_le16(src->version);
dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes); dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
dst->recovery_passes[1] = 0; dst->recovery_passes[1] = 0;
dst->nr_errors = cpu_to_le16(src->nr_errors); dst->nr_errors = cpu_to_le16(src->nr_errors);
for (unsigned i = 0; i < src->nr_errors; i++) for (unsigned i = 0; i < src->nr_errors; i++)
......
This diff is collapsed.
...@@ -1565,13 +1565,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ...@@ -1565,13 +1565,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0; return 0;
if (!test_bit(BCH_FS_started, &c->flags)) {
ret = bch2_fs_read_write_early(c);
bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
if (ret)
return ret;
}
trans = bch2_trans_get(c); trans = bch2_trans_get(c);
/* /*
...@@ -1687,6 +1680,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) ...@@ -1687,6 +1680,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work)
{ {
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
bch2_delete_dead_snapshots(c); bch2_delete_dead_snapshots(c);
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
} }
......
...@@ -300,7 +300,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, ...@@ -300,7 +300,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
if (!found && (flags & STR_HASH_must_replace)) { if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
} else if (found && (flags & STR_HASH_must_create)) { } else if (found && (flags & STR_HASH_must_create)) {
ret = -EEXIST; ret = -BCH_ERR_EEXIST_str_hash_set;
} else { } else {
if (!found && slot.path) if (!found && slot.path)
swap(iter, slot); swap(iter, slot);
......
...@@ -649,9 +649,10 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf ...@@ -649,9 +649,10 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf
bytes = vstruct_bytes(sb->sb); bytes = vstruct_bytes(sb->sb);
if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) { u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits);
prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", if (bytes > sb_size) {
bytes, 512UL << sb->sb->layout.sb_max_size_bits); prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)",
bytes, sb_size);
return -BCH_ERR_invalid_sb_too_big; return -BCH_ERR_invalid_sb_too_big;
} }
......
...@@ -912,9 +912,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ...@@ -912,9 +912,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?: bch2_fs_journal_init(&c->journal) ?:
bch2_fs_replicas_init(c) ?: bch2_fs_replicas_init(c) ?:
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_cache_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?: bch2_fs_btree_interior_update_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?:
...@@ -931,10 +931,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ...@@ -931,10 +931,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret) if (ret)
goto err; goto err;
for (i = 0; i < c->sb.nr_devices; i++) for (i = 0; i < c->sb.nr_devices; i++) {
if (bch2_member_exists(c->disk_sb.sb, i) && if (!bch2_member_exists(c->disk_sb.sb, i))
bch2_dev_alloc(c, i)) { continue;
ret = -EEXIST; ret = bch2_dev_alloc(c, i);
if (ret)
goto err; goto err;
} }
......
...@@ -17,12 +17,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) ...@@ -17,12 +17,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
{ {
int r = flags & CLOSURE_REMAINING_MASK; int r = flags & CLOSURE_REMAINING_MASK;
BUG_ON(flags & CLOSURE_GUARD_MASK); if (WARN(flags & CLOSURE_GUARD_MASK,
BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); "closure has guard bits set: %x (%u)",
flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r)))
r &= ~CLOSURE_GUARD_MASK;
if (!r) { if (!r) {
smp_acquire__after_ctrl_dep(); smp_acquire__after_ctrl_dep();
WARN(flags & ~CLOSURE_DESTRUCTOR,
"closure ref hit 0 with incorrect flags set: %x (%u)",
flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags));
cl->closure_get_happened = false; cl->closure_get_happened = false;
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment