Merge tag 'bcachefs-2024-03-19' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs fixes from Kent Overstreet: "Assorted bugfixes. Most are fixes for simple assertion pops; the most significant fix is for a deadlock in recovery when we have to rewrite large numbers of btree nodes to fix errors. This was incorrectly running out of the same workqueue as the core interior btree update path - we now give it its own single threaded workqueue. This was visible to users as "bch2_btree_update_start(): error: BCH_ERR_journal_reclaim_would_deadlock" - and then recovery hanging" * tag 'bcachefs-2024-03-19' of https://evilpiepirate.org/git/bcachefs: bcachefs: Fix lost wakeup on journal shutdown bcachefs; Fix deadlock in bch2_btree_update_start() bcachefs: ratelimit errors from async_btree_node_rewrite bcachefs: Run check_topology() first bcachefs: Improve bch2_fatal_error() bcachefs: Fix lost transaction restart error bcachefs: Don't corrupt journal keys gap buffer when dropping alloc info bcachefs: fix for building in userspace bcachefs: bch2_snapshot_is_ancestor() now safe to call in early recovery bcachefs: Fix nested transaction restart handling in bch2_bucket_gens_init() bcachefs: Improve sysfs internal/btree_updates bcachefs: Split out btree_node_rewrite_worker bcachefs: Fix locking in bch2_alloc_write_key() bcachefs: Avoid extent entry type assertions in .invalid() bcachefs: Fix spurious -BCH_ERR_transaction_restart_nested bcachefs: Fix check_key_has_snapshot() call bcachefs: Change "accounting overran journal reservation" to a warning

Merge tag 'bcachefs-2024-03-19' of https://evilpiepirate.org/git/bcachefs
Pull bcachefs fixes from Kent Overstreet: "Assorted bugfixes. Most are fixes for simple assertion pops; the most significant fix is for a deadlock in recovery when we have to rewrite large numbers of btree nodes to fix errors. This was incorrectly running out of the same workqueue as the core interior btree update path - we now give it its own single threaded workqueue. This was visible to users as "bch2_btree_update_start(): error: BCH_ERR_journal_reclaim_would_deadlock" - and then recovery hanging" * tag 'bcachefs-2024-03-19' of https://evilpiepirate.org/git/bcachefs: bcachefs: Fix lost wakeup on journal shutdown bcachefs; Fix deadlock in bch2_btree_update_start() bcachefs: ratelimit errors from async_btree_node_rewrite bcachefs: Run check_topology() first bcachefs: Improve bch2_fatal_error() bcachefs: Fix lost transaction restart error bcachefs: Don't corrupt journal keys gap buffer when dropping alloc info bcachefs: fix for building in userspace bcachefs: bch2_snapshot_is_ancestor() now safe to call in early recovery bcachefs: Fix nested transaction restart handling in bch2_bucket_gens_init() bcachefs: Improve sysfs internal/btree_updates bcachefs: Split out btree_node_rewrite_worker bcachefs: Fix locking in bch2_alloc_write_key() bcachefs: Avoid extent entry type assertions in .invalid() bcachefs: Fix spurious -BCH_ERR_transaction_restart_nested bcachefs: Fix check_key_has_snapshot() call bcachefs: Change "accounting overran journal reservation" to a warning
a4145ce1 · Linus Torvalds · 78c3925c · 2e92d26b · a4145ce1 · a4145ce1
Commit a4145ce1 authored Mar 19, 2024 by Linus Torvalds
26 changed files
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -532,13 +532,13 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 		u8 gen = bch2_alloc_to_v4(k, &a)->gen;
 		unsigned offset;
 		struct bpos pos = alloc_gens_pos(iter.pos, &offset);
+		int ret2 = 0;

 		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
-			ret = commit_do(trans, NULL, NULL,
-					BCH_TRANS_COMMIT_no_enospc,
-				bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
-			if (ret)
-				break;
+			ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
+				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+			if (ret2)
+				goto iter_err;
 			have_bucket_gens_key = false;
 		}

@@ -549,7 +549,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 		}

 		g.v.gens[offset] = gen;
-		0;
+iter_err:
+		ret2;
 	}));

 	if (have_bucket_gens_key && !ret)
@@ -852,7 +853,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 					bucket_journal_seq);
 			if (ret) {
 				bch2_fs_fatal_error(c,
-					"error setting bucket_needs_journal_commit: %i", ret);
+					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
 				return ret;
 			}
 		}

--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1356,16 +1356,18 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,

 		/* Don't retry from all devices if we're out of open buckets: */
 		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
-			int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+			int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, watermark,
 					      flags, cl);
-			if (!ret ||
-			    bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-			    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+			if (!ret2 ||
+			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
+			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
+				ret = ret2;
 				goto alloc_done;
 			}
+		}

 		/*
 		 * Only try to allocate cache (durability = 0 devices) from the

--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -849,6 +849,8 @@ struct bch_fs {
 	struct workqueue_struct	*btree_interior_update_worker;
 	struct work_struct	btree_interior_update_work;

+	struct workqueue_struct	*btree_node_rewrite_worker;
+
 	struct list_head	pending_node_rewrites;
 	struct mutex		pending_node_rewrites_lock;


--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1392,11 +1392,11 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 					 *old,
 					 b->data_type);
 	gc = *b;
-	percpu_up_read(&c->mark_lock);

 	if (gc.data_type != old_gc.data_type ||
 	    gc.dirty_sectors != old_gc.dirty_sectors)
 		bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
+	percpu_up_read(&c->mark_lock);

 	if (metadata_only &&
 	    gc.data_type != BCH_DATA_sb &&

--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1066,7 +1066,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,

 			ret = bset_encrypt(c, i, b->written << 9);
 			if (bch2_fs_fatal_err_on(ret, c,
-					"error decrypting btree node: %i", ret))
+					"decrypting btree node: %s", bch2_err_str(ret)))
 				goto fsck_err;

 			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
@@ -1107,7 +1107,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,

 			ret = bset_encrypt(c, i, b->written << 9);
 			if (bch2_fs_fatal_err_on(ret, c,
-					"error decrypting btree node: %i\n", ret))
+					"decrypting btree node: %s", bch2_err_str(ret)))
 				goto fsck_err;

 			sectors = vstruct_sectors(bne, c->block_bits);
@@ -1338,7 +1338,7 @@ static void btree_node_read_work(struct work_struct *work)
 	if (saw_error && !btree_node_read_error(b)) {
 		printbuf_reset(&buf);
 		bch2_bpos_to_text(&buf, b->key.k.p);
-		bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
 			 __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);

 		bch2_btree_node_rewrite_async(c, b);
@@ -1874,8 +1874,8 @@ static void btree_node_write_work(struct work_struct *work)
 	return;
 err:
 	set_btree_node_noevict(b);
-	if (!bch2_err_matches(ret, EROFS))
-		bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+			     "writing btree node: %s", bch2_err_str(ret));
 	goto out;
 }

@@ -2131,7 +2131,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)

 	ret = bset_encrypt(c, i, b->written << 9);
 	if (bch2_fs_fatal_err_on(ret, c,
-			"error encrypting btree node: %i\n", ret))
+			"encrypting btree node: %s", bch2_err_str(ret)))
 		goto err;

 	nonce = btree_nonce(i, b->written << 9);

--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -676,7 +676,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
 			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
 			     !bch2_journal_error(j), c,
-			     "error flushing key cache: %s", bch2_err_str(ret));
+			     "flushing key cache: %s", bch2_err_str(ret));
 	if (ret)
 		goto out;


--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -646,7 +646,7 @@ static void btree_update_nodes_written(struct btree_update *as)
 	bch2_trans_unlock(trans);

 	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
-			     "%s(): error %s", __func__, bch2_err_str(ret));
+			     "%s", bch2_err_str(ret));
 err:
 	if (as->b) {

@@ -1067,13 +1067,18 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	flags &= ~BCH_WATERMARK_MASK;
 	flags |= watermark;

-	if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-	    watermark < c->journal.watermark) {
+	if (watermark < c->journal.watermark) {
 		struct journal_res res = { 0 };
+		unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
+
+		if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
+		    watermark != BCH_WATERMARK_reclaim)
+			journal_flags |= JOURNAL_RES_GET_NONBLOCK;

 		ret = drop_locks_do(trans,
-			bch2_journal_res_get(&c->journal, &res, 1,
-					     watermark|JOURNAL_RES_GET_CHECK));
+			bch2_journal_res_get(&c->journal, &res, 1, journal_flags));
+		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+			ret = -BCH_ERR_journal_reclaim_would_deadlock;
 		if (ret)
 			return ERR_PTR(ret);
 	}
@@ -1117,6 +1122,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	closure_init(&as->cl, NULL);
 	as->c		= c;
 	as->start_time	= start_time;
+	as->ip_started	= _RET_IP_;
 	as->mode	= BTREE_INTERIOR_NO_UPDATE;
 	as->took_gc_lock = true;
 	as->btree_id	= path->btree_id;
@@ -1192,7 +1198,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 err:
 	bch2_btree_update_free(as, trans);
 	if (!bch2_err_matches(ret, ENOSPC) &&
-	    !bch2_err_matches(ret, EROFS))
+	    !bch2_err_matches(ret, EROFS) &&
+	    ret != -BCH_ERR_journal_reclaim_would_deadlock)
 		bch_err_fn_ratelimited(c, ret);
 	return ERR_PTR(ret);
 }
@@ -2114,7 +2121,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)

 	ret = bch2_trans_do(c, NULL, NULL, 0,
 		      async_btree_node_rewrite_trans(trans, a));
-	bch_err_fn(c, ret);
+	bch_err_fn_ratelimited(c, ret);
 	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
 	kfree(a);
 }
@@ -2161,7 +2168,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
 	}

-	queue_work(c->btree_interior_update_worker, &a->work);
+	queue_work(c->btree_node_rewrite_worker, &a->work);
 }

 void bch2_do_pending_node_rewrites(struct bch_fs *c)
@@ -2173,7 +2180,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c)
 		list_del(&a->list);

 		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
-		queue_work(c->btree_interior_update_worker, &a->work);
+		queue_work(c->btree_node_rewrite_worker, &a->work);
 	}
 	mutex_unlock(&c->pending_node_rewrites_lock);
 }
@@ -2441,8 +2448,8 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)

 	mutex_lock(&c->btree_interior_update_lock);
 	list_for_each_entry(as, &c->btree_interior_update_list, list)
-		prt_printf(out, "%p m %u w %u r %u j %llu\n",
-		       as,
+		prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+			   (void *) as->ip_started,
 			   as->mode,
 			   as->nodes_written,
 			   closure_nr_remaining(&as->cl),
@@ -2510,6 +2517,8 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c,

 void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
 {
+	if (c->btree_node_rewrite_worker)
+		destroy_workqueue(c->btree_node_rewrite_worker);
 	if (c->btree_interior_update_worker)
 		destroy_workqueue(c->btree_interior_update_worker);
 	mempool_exit(&c->btree_interior_update_pool);
@@ -2534,6 +2543,11 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c)
 	if (!c->btree_interior_update_worker)
 		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

+	c->btree_node_rewrite_worker =
+		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
+	if (!c->btree_node_rewrite_worker)
+		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
 	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
 				      sizeof(struct btree_update)))
 		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;

--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -32,6 +32,7 @@ struct btree_update {
 	struct closure			cl;
 	struct bch_fs			*c;
 	u64				start_time;
+	unsigned long			ip_started;

 	struct list_head		list;
 	struct list_head		unwritten_list;

--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -378,7 +378,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 		}
 	}
 err:
-	bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+	bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
 	trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
 	bch2_journal_pin_drop(j, &wb->flushing.pin);
 	wb->flushing.keys.nr = 0;

--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -990,8 +990,8 @@ static int __trigger_extent(struct btree_trans *trans,
 				ret = !gc
 					? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
 					: update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
-				bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
-						     __func__);
+				bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
+						     bch2_err_str(ret));
 				if (ret)
 					return ret;
 			}
@@ -1020,7 +1020,7 @@ static int __trigger_extent(struct btree_trans *trans,
 			struct printbuf buf = PRINTBUF;

 			bch2_bkey_val_to_text(&buf, c, k);
-			bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
 			printbuf_exit(&buf);
 		}
 		if (ret)

--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -170,7 +170,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 		struct printbuf buf = PRINTBUF;

 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+		bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
 		printbuf_exit(&buf);
 	}
 out:

--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -448,7 +448,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
 			struct printbuf buf = PRINTBUF;

 			bch2_bkey_val_to_text(&buf, c, new);
-			bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
 			printbuf_exit(&buf);
 			return ret;
 		}
@@ -1868,10 +1868,10 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
 		return -BCH_ERR_stripe_alloc_blocked;

 	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
+			     "reading stripe key: %s", bch2_err_str(ret));
 	if (ret) {
 		bch2_stripe_close(c, h->s);
-		if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
 		return ret;
 	}


--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -191,9 +191,9 @@ do {									\

 void bch2_fatal_error(struct bch_fs *);

-#define bch2_fs_fatal_error(c, ...)					\
+#define bch2_fs_fatal_error(c, _msg, ...)				\
 do {									\
-	bch_err(c, __VA_ARGS__);					\
+	bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__);	\
 	bch2_fatal_error(c);						\
 } while (0)


--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -108,17 +108,17 @@ static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *en

 static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
 {
-	return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+	return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
 }

 static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
 {
-	return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+	return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
 }

 static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
 {
-	switch (extent_entry_type(e)) {
+	switch (__extent_entry_type(e)) {
 	case BCH_EXTENT_ENTRY_crc32:
 	case BCH_EXTENT_ENTRY_crc64:
 	case BCH_EXTENT_ENTRY_crc128:

--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -108,7 +108,8 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 		goto retry;

 	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
-			     "inode %u:%llu not found when updating",
+			     "%s: inode %u:%llu not found when updating",
+			     bch2_err_str(ret),
 			     inode_inum(inode).subvol,
 			     inode_inum(inode).inum);


--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1114,10 +1114,9 @@ int bch2_check_inodes(struct bch_fs *c)
 	return ret;
 }

-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
 {
 	struct bch_fs *c = trans->c;
-	u32 restart_count = trans->restart_count;
 	int ret = 0;
 	s64 count2;

@@ -1149,7 +1148,14 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
 	}
 fsck_err:
 	bch_err_fn(c, ret);
-	return ret ?: trans_was_restarted(trans, restart_count);
+	return ret;
+}
+
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+	u32 restart_count = trans->restart_count;
+	return check_i_sectors_notnested(trans, w) ?:
+		trans_was_restarted(trans, restart_count);
 }

 struct extent_end {
@@ -1533,7 +1539,7 @@ int bch2_check_extents(struct bch_fs *c)
 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
 			check_extent_overbig(trans, &iter, k);
 		})) ?:
-		check_i_sectors(trans, &w));
+		check_i_sectors_notnested(trans, &w));

 	bch2_disk_reservation_put(c, &res);
 	extent_ends_exit(&extent_ends);
@@ -1563,10 +1569,9 @@ int bch2_check_indirect_extents(struct bch_fs *c)
 	return ret;
 }

-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
 {
 	struct bch_fs *c = trans->c;
-	u32 restart_count = trans->restart_count;
 	int ret = 0;
 	s64 count2;

@@ -1598,7 +1603,14 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 	}
 fsck_err:
 	bch_err_fn(c, ret);
-	return ret ?: trans_was_restarted(trans, restart_count);
+	return ret;
+}
+
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+	u32 restart_count = trans->restart_count;
+	return check_subdir_count_notnested(trans, w) ?:
+		trans_was_restarted(trans, restart_count);
 }

 static int check_dirent_inode_dirent(struct btree_trans *trans,
@@ -2003,7 +2015,8 @@ int bch2_check_dirents(struct bch_fs *c)
 				k,
 				NULL, NULL,
 				BCH_TRANS_COMMIT_no_enospc,
-			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
+			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+		check_subdir_count_notnested(trans, &dir));

 	snapshots_seen_exit(&s);
 	inode_walker_exit(&dir);
@@ -2022,8 +2035,10 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
 	int ret;

 	ret = check_key_has_snapshot(trans, iter, k);
-	if (ret)
+	if (ret < 0)
 		return ret;
+	if (ret)
+		return 0;

 	i = walk_inode(trans, inode, k);
 	ret = PTR_ERR_OR_ZERO(i);

--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -511,18 +511,18 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
 	if (journal_res_get_fast(j, res, flags))
 		return 0;

+	if (bch2_journal_error(j))
+		return -BCH_ERR_erofs_journal_err;
+
+	if (j->blocked)
+		return -BCH_ERR_journal_res_get_blocked;
+
 	if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
 		ret = JOURNAL_ERR_journal_full;
 		can_discard = j->can_discard;
 		goto out;
 	}

-	if (j->blocked)
-		return -BCH_ERR_journal_res_get_blocked;
-
-	if (bch2_journal_error(j))
-		return -BCH_ERR_erofs_journal_err;
-
 	if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
 		ret = JOURNAL_ERR_max_in_flight;
 		goto out;

--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1082,9 +1082,7 @@ static int journal_read_bucket(struct bch_dev *ca,
 		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
 			     j->encrypted_start,
 			     vstruct_end(j) - (void *) j->encrypted_start);
-		bch2_fs_fatal_err_on(ret, c,
-				"error decrypting journal entry: %s",
-				bch2_err_str(ret));
+		bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));

 		mutex_lock(&jlist->lock);
 		ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1820,7 +1818,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 			jset_entry_for_each_key(i, k) {
 				ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
 				if (ret) {
-					bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+					bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
+							    bch2_err_str(ret));
 					bch2_journal_keys_to_write_buffer_end(c, &wb);
 					return ret;
 				}
@@ -1848,7 +1847,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)

 	bch2_journal_super_entries_add_common(c, &end, seq);
 	u64s	= (u64 *) end - (u64 *) start;
-	BUG_ON(u64s > j->entry_u64s_reserved);
+
+	WARN_ON(u64s > j->entry_u64s_reserved);

 	le32_add_cpu(&jset->u64s, u64s);

@@ -1856,7 +1856,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 	bytes	= vstruct_bytes(jset);

 	if (sectors > w->sectors) {
-		bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+		bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
 				    vstruct_bytes(jset), w->sectors << 9,
 				    u64s, w->u64s_reserved, j->entry_u64s_reserved);
 		return -EINVAL;
@@ -1884,8 +1884,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
 		    jset->encrypted_start,
 		    vstruct_end(jset) - (void *) jset->encrypted_start);
-	if (bch2_fs_fatal_err_on(ret, c,
-			"error decrypting journal entry: %i", ret))
+	if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
 		return ret;

 	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),

--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -101,8 +101,8 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
 		struct printbuf buf = PRINTBUF;

 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-		bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
-				     __func__, buf.buf, bch2_err_str(ret));
+		bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
+				    buf.buf, bch2_err_str(ret));
 		printbuf_exit(&buf);
 	}
 }
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -155,8 +155,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
 	if (bch2_err_matches(ret, EROFS))
 		return ret;

-	if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
-				 __func__, bch2_err_str(ret)))
+	if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
 		return ret;

 	ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,

--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -90,10 +90,12 @@ static void do_reconstruct_alloc(struct bch_fs *c)
 	struct journal_keys *keys = &c->journal_keys;
 	size_t src, dst;

+	move_gap(keys, keys->nr);
+
 	for (src = 0, dst = 0; src < keys->nr; src++)
 		if (!btree_id_is_alloc(keys->data[src].btree_id))
 			keys->data[dst++] = keys->data[src];
-	keys->nr = dst;
+	keys->nr = keys->gap = dst;
 }

 /*
@@ -203,6 +205,8 @@ static int bch2_journal_replay(struct bch_fs *c)

 	BUG_ON(!atomic_read(&keys->ref));

+	move_gap(keys, keys->nr);
+
 	/*
 	 * First, attempt to replay keys in sorted order. This is more
 	 * efficient - better locality of btree access -  but some might fail if

--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_types.h
@@ -13,11 +13,11 @@
 * must never change:
 */
 #define BCH_RECOVERY_PASSES()							\
+	x(check_topology,			 4, 0)				\
 	x(alloc_read,				 0, PASS_ALWAYS)		\
 	x(stripes_read,				 1, PASS_ALWAYS)		\
 	x(initialize_subvolumes,		 2, 0)				\
 	x(snapshots_read,			 3, PASS_ALWAYS)		\
-	x(check_topology,			 4, 0)				\
 	x(check_allocations,			 5, PASS_FSCK)			\
 	x(trans_mark_dev_sbs,			 6, PASS_ALWAYS|PASS_SILENT)	\
 	x(fs_journal_alloc,			 7, PASS_ALWAYS|PASS_SILENT)	\

--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -91,18 +91,20 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans,

 /* Snapshot nodes: */

-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
 {
-	struct snapshot_table *t;
-
-	rcu_read_lock();
-	t = rcu_dereference(c->snapshots);
-
 	while (id && id < ancestor)
 		id = __snapshot_t(t, id)->parent;
+	return id == ancestor;
+}
+
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
+	rcu_read_lock();
+	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
 	rcu_read_unlock();

-	return id == ancestor;
+	return ret;
 }

 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
@@ -120,13 +122,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances

 bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
 {
-	struct snapshot_table *t;
 	bool ret;

-	EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
-
 	rcu_read_lock();
-	t = rcu_dereference(c->snapshots);
+	struct snapshot_table *t = rcu_dereference(c->snapshots);
+
+	if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) {
+		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
+		goto out;
+	}

 	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
 		id = get_ancestor_below(t, id, ancestor);
@@ -134,11 +138,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
 	if (id && id < ancestor) {
 		ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);

-		EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
+		EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
 	} else {
 		ret = id == ancestor;
 	}
-
+out:
 	rcu_read_unlock();

 	return ret;
@@ -547,7 +551,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
 			"snapshot tree points to missing subvolume:\n  %s",
 			(printbuf_reset(&buf),
 			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
-	    fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
 						le32_to_cpu(subvol.snapshot),
 						root_id),
 			c, snapshot_tree_to_wrong_subvol,

--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -985,7 +985,7 @@ int bch2_write_super(struct bch_fs *c)
 		prt_str(&buf, " > ");
 		bch2_version_to_text(&buf, bcachefs_metadata_version_current);
 		prt_str(&buf, ")");
-		bch2_fs_fatal_error(c, "%s", buf.buf);
+		bch2_fs_fatal_error(c, ": %s", buf.buf);
 		printbuf_exit(&buf);
 		return -BCH_ERR_sb_not_downgraded;
 	}
@@ -1005,7 +1005,7 @@ int bch2_write_super(struct bch_fs *c)

 		if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
 			bch2_fs_fatal_error(c,
-				"Superblock write was silently dropped! (seq %llu expected %llu)",
+				": Superblock write was silently dropped! (seq %llu expected %llu)",
 				le64_to_cpu(ca->sb_read_scratch->seq),
 				ca->disk_sb.seq);
 			percpu_ref_put(&ca->io_ref);
@@ -1015,7 +1015,7 @@ int bch2_write_super(struct bch_fs *c)

 		if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
 			bch2_fs_fatal_error(c,
-				"Superblock modified by another process (seq %llu expected %llu)",
+				": Superblock modified by another process (seq %llu expected %llu)",
 				le64_to_cpu(ca->sb_read_scratch->seq),
 				ca->disk_sb.seq);
 			percpu_ref_put(&ca->io_ref);
@@ -1066,7 +1066,7 @@ int bch2_write_super(struct bch_fs *c)
 				 !can_mount_with_written ||
 				 (can_mount_without_written &&
 				  !can_mount_with_written), c,
-		"Unable to write superblock to sufficient devices (from %ps)",
+		": Unable to write superblock to sufficient devices (from %ps)",
 		(void *) _RET_IP_))
 		ret = -1;
 out:

--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -87,20 +87,28 @@ const char * const bch2_fs_flag_strs[] = {
 	NULL
 };

-void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
+__printf(2, 0)
+static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
 {
-	struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
-
-	va_list args;
-	va_start(args, fmt);
-	if (likely(!stdio)) {
-		vprintk(fmt, args);
-	} else {
+#ifdef __KERNEL__
+	if (unlikely(stdio)) {
 		if (fmt[0] == KERN_SOH[0])
 			fmt += 2;

 		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
+		return;
 	}
+#endif
+	vprintk(fmt, args);
+}
+
+void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
+{
+	struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
+
+	va_list args;
+	va_start(args, fmt);
+	bch2_print_maybe_redirect(stdio, fmt, args);
 	va_end(args);
 }

@@ -110,14 +118,7 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...)

 	va_list args;
 	va_start(args, fmt);
-	if (likely(!stdio)) {
-		vprintk(fmt, args);
-	} else {
-		if (fmt[0] == KERN_SOH[0])
-			fmt += 2;
-
-		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
-	}
+	bch2_print_maybe_redirect(stdio, fmt, args);
 	va_end(args);
 }


--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -683,6 +683,9 @@ static inline void __move_gap(void *array, size_t element_size,
 /* Move the gap in a gap buffer: */
 #define move_gap(_d, _new_gap)						\
 do {									\
+	BUG_ON(_new_gap > (_d)->nr);					\
+	BUG_ON((_d)->gap > (_d)->nr);					\
+									\
 	__move_gap((_d)->data, sizeof((_d)->data[0]),			\
 		   (_d)->nr, (_d)->size, (_d)->gap, _new_gap);		\
 	(_d)->gap = _new_gap;						\