Merge tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs

Push bcachefs fixes from Kent Overstreet: "The data corruption in the buffered write path is troubling; inode lock should not have been able to cause that... - Fix a rare data corruption in the rebalance path, caught as a nonce inconsistency on encrypted filesystems - Revert lockless buffered write path - Mark more errors as autofix" * tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs: bcachefs: Mark more errors as autofix bcachefs: Revert lockless buffered IO path bcachefs: Fix bch2_extents_match() false positive bcachefs: Fix failure to return error in data_update_index_update()

Merge tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs
Push bcachefs fixes from Kent Overstreet: "The data corruption in the buffered write path is troubling; inode lock should not have been able to cause that... - Fix a rare data corruption in the rebalance path, caught as a nonce inconsistency on encrypted filesystems - Revert lockless buffered write path - Mark more errors as autofix" * tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs: bcachefs: Mark more errors as autofix bcachefs: Revert lockless buffered IO path bcachefs: Fix bch2_extents_match() false positive bcachefs: Fix failure to return error in data_update_index_update()
a4c76312 · Linus Torvalds · 6cd90e5e · 3d3020c4 · a4c76312 · a4c76312
Commit a4c76312 authored Sep 01, 2024 by Linus Torvalds
5 changed files
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -337,6 +337,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 			printbuf_exit(&buf);
 			bch2_fatal_error(c);
+			ret = -EIO;
 			goto out;
 		}

--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -257,7 +257,6 @@
 	x(BCH_ERR_nopromote,		nopromote_in_flight)			\
 	x(BCH_ERR_nopromote,		nopromote_no_writes)			\
 	x(BCH_ERR_nopromote,		nopromote_enomem)			\
-	x(0,				need_inode_lock)			\
 	x(0,				invalid_snapshot_node)			\
 	x(0,				option_needs_open_fs)

--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
 			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
 				if (p1.ptr.dev		== p2.ptr.dev &&
 				    p1.ptr.gen		== p2.ptr.gen &&
+				    /*
+				     * This checks that the two pointers point
+				     * to the same region on disk - adjusting
+				     * for the difference in where the extents
+				     * start, since one may have been trimmed:
+				     */
 				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
-				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
+				    /*
+				     * This additionally checks that the
+				     * extents overlap on disk, since the
+				     * previous check may trigger spuriously
+				     * when one extent is immediately partially
+				     * overwritten with another extent (so that
+				     * on disk they are adjacent) and
+				     * compression is in use:
+				     */
+				    ((p1.ptr.offset >= p2.ptr.offset &&
+				      p1.ptr.offset  < p2.ptr.offset + p2.crc.compressed_size) ||
+				     (p2.ptr.offset >= p1.ptr.offset &&
+				      p2.ptr.offset  < p1.ptr.offset + p1.crc.compressed_size)))
 					return true;
 		return false;

--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -802,8 +802,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
 static int __bch2_buffered_write(struct bch_inode_info *inode,
 				 struct address_space *mapping,
 				 struct iov_iter *iter,
-				 loff_t pos, unsigned len,
+				 loff_t pos, unsigned len)
-				 bool inode_locked)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch2_folio_reservation res;
@@ -827,15 +826,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 	BUG_ON(!fs.nr);
-	/*
-	 * If we're not using the inode lock, we need to lock all the folios for
-	 * atomiticity of writes vs. other writes:
-	 */
-	if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
-		ret = -BCH_ERR_need_inode_lock;
-		goto out;
-	}
 	f = darray_first(fs);
 	if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
 		ret = bch2_read_single_folio(f, mapping);
@@ -932,10 +922,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 	end = pos + copied;
 	spin_lock(&inode->v.i_lock);
-	if (end > inode->v.i_size) {
+	if (end > inode->v.i_size)
-		BUG_ON(!inode_locked);
 		i_size_write(&inode->v, end);
-	}
 	spin_unlock(&inode->v.i_lock);
 	f_pos = pos;
@@ -979,68 +967,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct bch_inode_info *inode = file_bch_inode(file);
-	loff_t pos;
+	loff_t pos = iocb->ki_pos;
-	bool inode_locked = false;
+	ssize_t written = 0;
-	ssize_t written = 0, written2 = 0, ret = 0;
+	int ret = 0;
-	/*
-	 * We don't take the inode lock unless i_size will be changing. Folio
-	 * locks provide exclusion with other writes, and the pagecache add lock
-	 * provides exclusion with truncate and hole punching.
-	 *
-	 * There is one nasty corner case where atomicity would be broken
-	 * without great care: when copying data from userspace to the page
-	 * cache, we do that with faults disable - a page fault would recurse
-	 * back into the filesystem, taking filesystem locks again, and
-	 * deadlock; so it's done with faults disabled, and we fault in the user
-	 * buffer when we aren't holding locks.
-	 *
-	 * If we do part of the write, but we then race and in the userspace
-	 * buffer have been evicted and are no longer resident, then we have to
-	 * drop our folio locks to re-fault them in, breaking write atomicity.
-	 *
-	 * To fix this, we restart the write from the start, if we weren't
-	 * holding the inode lock.
-	 *
-	 * There is another wrinkle after that; if we restart the write from the
-	 * start, and then get an unrecoverable error, we _cannot_ claim to
-	 * userspace that we did not write data we actually did - so we must
-	 * track (written2) the most we ever wrote.
-	 */
-	if ((iocb->ki_flags & IOCB_APPEND) ||
-	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
-		inode_lock(&inode->v);
-		inode_locked = true;
-	}
-	ret = generic_write_checks(iocb, iter);
-	if (ret <= 0)
-		goto unlock;
-	ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
-	if (ret) {
-		if (!inode_locked) {
-			inode_lock(&inode->v);
-			inode_locked = true;
-			ret = file_remove_privs_flags(file, 0);
-		}
-		if (ret)
-			goto unlock;
-	}
-	ret = file_update_time(file);
-	if (ret)
-		goto unlock;
-	pos = iocb->ki_pos;
 	bch2_pagecache_add_get(inode);
-	if (!inode_locked &&
-	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
-		goto get_inode_lock;
 	do {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		unsigned bytes = iov_iter_count(iter);
@@ -1065,17 +997,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
 			}
 		}
-		if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
-			goto get_inode_lock;
 		if (unlikely(fatal_signal_pending(current))) {
 			ret = -EINTR;
 			break;
 		}
-		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
+		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
-		if (ret == -BCH_ERR_need_inode_lock)
-			goto get_inode_lock;
 		if (unlikely(ret < 0))
 			break;
@@ -1096,46 +1023,50 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
 		}
 		pos += ret;
 		written += ret;
-		written2 = max(written, written2);
-		if (ret != bytes && !inode_locked)
-			goto get_inode_lock;
 		ret = 0;
 		balance_dirty_pages_ratelimited(mapping);
-		if (0) {
-get_inode_lock:
-			bch2_pagecache_add_put(inode);
-			inode_lock(&inode->v);
-			inode_locked = true;
-			bch2_pagecache_add_get(inode);
-			iov_iter_revert(iter, written);
-			pos -= written;
-			written = 0;
-			ret = 0;
-		}
 	} while (iov_iter_count(iter));
-	bch2_pagecache_add_put(inode);
-unlock:
-	if (inode_locked)
-		inode_unlock(&inode->v);
-	iocb->ki_pos += written;
+	bch2_pagecache_add_put(inode);
-	ret = max(written, written2) ?: ret;
+	return written ? written : ret;
-	if (ret > 0)
-		ret = generic_write_sync(iocb, ret);
-	return ret;
 }
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-	ssize_t ret = iocb->ki_flags & IOCB_DIRECT
+	struct file *file = iocb->ki_filp;
-		? bch2_direct_write(iocb, iter)
+	struct bch_inode_info *inode = file_bch_inode(file);
-		: bch2_buffered_write(iocb, iter);
+	ssize_t ret;
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		ret = bch2_direct_write(iocb, from);
+		goto out;
+	}
+	inode_lock(&inode->v);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto unlock;
+	ret = file_remove_privs(file);
+	if (ret)
+		goto unlock;
+	ret = file_update_time(file);
+	if (ret)
+		goto unlock;
+	ret = bch2_buffered_write(iocb, from);
+	if (likely(ret > 0))
+		iocb->ki_pos += ret;
+unlock:
+	inode_unlock(&inode->v);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+out:
 	return bch2_err_class(ret);
 }

--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -23,7 +23,7 @@ enum bch_fsck_flags {
 	x(jset_past_bucket_end,					  9,	0)		\
 	x(jset_seq_blacklisted,					 10,	0)		\
 	x(journal_entries_missing,				 11,	0)		\
-	x(journal_entry_replicas_not_marked,			 12,	0)		\
+	x(journal_entry_replicas_not_marked,			 12,	FSCK_AUTOFIX)	\
 	x(journal_entry_past_jset_end,				 13,	0)		\
 	x(journal_entry_replicas_data_mismatch,			 14,	0)		\
 	x(journal_entry_bkey_u64s_0,				 15,	0)		\
@@ -288,10 +288,10 @@ enum bch_fsck_flags {
 	x(invalid_btree_id,					274,	0)		\
 	x(alloc_key_io_time_bad,				275,	0)		\
 	x(alloc_key_fragmentation_lru_wrong,			276,	FSCK_AUTOFIX)	\
-	x(accounting_key_junk_at_end,				277,	0)		\
+	x(accounting_key_junk_at_end,				277,	FSCK_AUTOFIX)	\
-	x(accounting_key_replicas_nr_devs_0,			278,	0)		\
+	x(accounting_key_replicas_nr_devs_0,			278,	FSCK_AUTOFIX)	\
-	x(accounting_key_replicas_nr_required_bad,		279,	0)		\
+	x(accounting_key_replicas_nr_required_bad,		279,	FSCK_AUTOFIX)	\
-	x(accounting_key_replicas_devs_unsorted,		280,	0)		\
+	x(accounting_key_replicas_devs_unsorted,		280,	FSCK_AUTOFIX)	\
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,