Merge tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba: "The updates this time are more under the hood and enhancing existing features (subpage with compression and zoned namespaces). Performance related: - misc small inode logging improvements (+3% throughput, -11% latency on sample dbench workload) - more efficient directory logging: bulk item insertion, less tree searches and locking - speed up bulk insertion of items into a b-tree, which is used when logging directories, when running delayed items for directories (fsync and transaction commits) and when running the slow path (full sync) of an fsync (bulk creation run time -4%, deletion -12%) Core: - continued subpage support - make defragmentation work - make compression write work - zoned mode - support ZNS (zoned namespaces), zone capacity is number of usable blocks in each zone - add dedicated block group (zoned) for relocation, to prevent out of order writes in some cases - greedy block group reclaim, pick the ones with least usable space first - preparatory work for send protocol updates - error handling improvements - cleanups and refactoring Fixes: - lockdep warnings - in show_devname callback, on seeding device - device delete on loop device due to conversions to workqueues - fix deadlock between chunk allocation and chunk btree modifications - fix tracking of missing device count and status" * tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (140 commits) btrfs: remove root argument from check_item_in_log() btrfs: remove root argument from add_link() btrfs: remove root argument from btrfs_unlink_inode() btrfs: remove root argument from drop_one_dir_item() btrfs: clear MISSING device status bit in btrfs_close_one_device btrfs: call btrfs_check_rw_degradable only if there is a missing device btrfs: send: prepare for v2 protocol btrfs: fix comment about sector sizes supported in 64K systems btrfs: update device path inode time instead of bd_inode fs: export an inode_update_time helper btrfs: fix deadlock when defragging transparent huge pages btrfs: sysfs: convert scnprintf and snprintf to sysfs_emit btrfs: make btrfs_super_block size match BTRFS_SUPER_INFO_SIZE btrfs: update comments for chunk allocation -ENOSPC cases btrfs: fix deadlock between chunk allocation and chunk btree modifications btrfs: zoned: use greedy gc for auto reclaim btrfs: check-integrity: stop storing the block device name in btrfsic_dev_state btrfs: use btrfs_get_dev_args_from_path in dev removal ioctls btrfs: add a btrfs_get_dev_args_from_path helper btrfs: handle device lookup with btrfs_dev_lookup_args ...

Merge tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "The updates this time are more under the hood and enhancing existing features (subpage with compression and zoned namespaces). Performance related: - misc small inode logging improvements (+3% throughput, -11% latency on sample dbench workload) - more efficient directory logging: bulk item insertion, less tree searches and locking - speed up bulk insertion of items into a b-tree, which is used when logging directories, when running delayed items for directories (fsync and transaction commits) and when running the slow path (full sync) of an fsync (bulk creation run time -4%, deletion -12%) Core: - continued subpage support - make defragmentation work - make compression write work - zoned mode - support ZNS (zoned namespaces), zone capacity is number of usable blocks in each zone - add dedicated block group (zoned) for relocation, to prevent out of order writes in some cases - greedy block group reclaim, pick the ones with least usable space first - preparatory work for send protocol updates - error handling improvements - cleanups and refactoring Fixes: - lockdep warnings - in show_devname callback, on seeding device - device delete on loop device due to conversions to workqueues - fix deadlock between chunk allocation and chunk btree modifications - fix tracking of missing device count and status" * tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (140 commits) btrfs: remove root argument from check_item_in_log() btrfs: remove root argument from add_link() btrfs: remove root argument from btrfs_unlink_inode() btrfs: remove root argument from drop_one_dir_item() btrfs: clear MISSING device status bit in btrfs_close_one_device btrfs: call btrfs_check_rw_degradable only if there is a missing device btrfs: send: prepare for v2 protocol btrfs: fix comment about sector sizes supported in 64K systems btrfs: update device path inode time instead of bd_inode fs: export an inode_update_time helper btrfs: fix deadlock when defragging transparent huge pages btrfs: sysfs: convert scnprintf and snprintf to sysfs_emit btrfs: make btrfs_super_block size match BTRFS_SUPER_INFO_SIZE btrfs: update comments for chunk allocation -ENOSPC cases btrfs: fix deadlock between chunk allocation and chunk btree modifications btrfs: zoned: use greedy gc for auto reclaim btrfs: check-integrity: stop storing the block device name in btrfsic_dev_state btrfs: use btrfs_get_dev_args_from_path in dev removal ioctls btrfs: add a btrfs_get_dev_args_from_path helper btrfs: handle device lookup with btrfs_dev_lookup_args ...
037c50bf · Linus Torvalds · 2cf3f813 · d1ed82f3 · 037c50bf · 037c50bf
Commit 037c50bf authored Nov 01, 2021 by Linus Torvalds
53 changed files
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -98,6 +98,7 @@ struct btrfs_block_group {
 	unsigned int to_copy:1;
 	unsigned int relocating_repair:1;
 	unsigned int chunk_item_inserted:1;
+	unsigned int zone_is_active:1;

 	int disk_cache_state;

@@ -202,7 +203,10 @@ struct btrfs_block_group {
 	 */
 	u64 alloc_offset;
 	u64 zone_unusable;
+	u64 zone_capacity;
 	u64 meta_write_pointer;
+	struct map_lookup *physical_map;
+	struct list_head active_bg_list;
 };

 static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -280,7 +284,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
-			     u64 bytenr, u64 num_bytes, int alloc);
+			     u64 bytenr, u64 num_bytes, bool alloc);
 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
 			     u64 ram_bytes, u64 num_bytes, int delalloc);
 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
@@ -289,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
 		      enum btrfs_chunk_alloc_enum force);
 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
 void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+				  bool is_item_insertion);
 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);

--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -138,17 +138,34 @@ struct btrfs_inode {
 	/* a local copy of root's last_log_commit */
 	int last_log_commit;

-	/* total number of bytes pending delalloc, used by stat to calc the
-	 * real block usage of the file
-	 */
-	u64 delalloc_bytes;
-
-	/*
-	 * Total number of bytes pending delalloc that fall within a file
-	 * range that is either a hole or beyond EOF (and no prealloc extent
-	 * exists in the range). This is always <= delalloc_bytes.
-	 */
-	u64 new_delalloc_bytes;
+	union {
+		/*
+		 * Total number of bytes pending delalloc, used by stat to
+		 * calculate the real block usage of the file. This is used
+		 * only for files.
+		 */
+		u64 delalloc_bytes;
+		/*
+		 * The offset of the last dir item key that was logged.
+		 * This is used only for directories.
+		 */
+		u64 last_dir_item_offset;
+	};
+
+	union {
+		/*
+		 * Total number of bytes pending delalloc that fall within a file
+		 * range that is either a hole or beyond EOF (and no prealloc extent
+		 * exists in the range). This is always <= delalloc_bytes and this
+		 * is used only for files.
+		 */
+		u64 new_delalloc_bytes;
+		/*
+		 * The offset of the last dir index key that was logged.
+		 * This is used only for directories.
+		 */
+		u64 last_dir_index_offset;
+	};

 	/*
 	 * total number of bytes pending defrag, used by stat to check whether
@@ -339,7 +356,12 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)

 struct btrfs_dio_private {
 	struct inode *inode;
-	u64 logical_offset;
+
+	/*
+	 * Since DIO can use anonymous page, we cannot use page_offset() to
+	 * grab the file offset, thus need a dedicated member for file offset.
+	 */
+	u64 file_offset;
 	u64 disk_bytenr;
 	/* Used for bio::bi_size */
 	u32 bytes;

--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -28,8 +28,8 @@ struct btrfs_inode;
 #define	BTRFS_ZLIB_DEFAULT_LEVEL		3

 struct compressed_bio {
-	/* number of bios pending for this compressed extent */
-	refcount_t pending_bios;
+	/* Number of sectors with unfinished IO (unsubmitted or unfinished) */
+	refcount_t pending_sectors;

 	/* Number of compressed pages in the array */
 	unsigned int nr_pages;

--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -396,7 +396,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (*cow_ret == buf)
 		unlock_orig = 1;

-	btrfs_assert_tree_locked(buf);
+	btrfs_assert_tree_write_locked(buf);

 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 		trans->transid != fs_info->running_transaction->transid);
@@ -2488,7 +2488,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
 	int ret;

 	BUG_ON(!path->nodes[level]);
-	btrfs_assert_tree_locked(path->nodes[level]);
+	btrfs_assert_tree_write_locked(path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
 	BUG_ON(slot > nritems);
@@ -2828,7 +2828,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;

-	btrfs_assert_tree_locked(path->nodes[1]);
+	btrfs_assert_tree_write_locked(path->nodes[1]);

 	right = btrfs_read_node_slot(upper, slot + 1);
 	/*
@@ -3066,7 +3066,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (right_nritems == 0)
 		return 1;

-	btrfs_assert_tree_locked(path->nodes[1]);
+	btrfs_assert_tree_write_locked(path->nodes[1]);

 	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
 	/*
@@ -3581,40 +3581,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 	return ret;
 }

-/*
- * This function duplicate a item, giving 'new_key' to the new item.
- * It guarantees both items live in the same tree leaf and the new item
- * is contiguous with the original item.
- *
- * This allows us to split file extent in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 struct btrfs_path *path,
-			 const struct btrfs_key *new_key)
-{
-	struct extent_buffer *leaf;
-	int ret;
-	u32 item_size;
-
-	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-	ret = setup_leaf_for_split(trans, root, path,
-				   item_size + sizeof(struct btrfs_item));
-	if (ret)
-		return ret;
-
-	path->slots[0]++;
-	setup_items_for_insert(root, path, new_key, &item_size, 1);
-	leaf = path->nodes[0];
-	memcpy_extent_buffer(leaf,
-			     btrfs_item_ptr_offset(leaf, path->slots[0]),
-			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
-			     item_size);
-	return 0;
-}
-
 /*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
@@ -3786,13 +3752,10 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 *
 * @root:	root we are inserting items to
 * @path:	points to the leaf/slot where we are going to insert new items
- * @cpu_key:	array of keys for items to be inserted
- * @data_size:	size of the body of each item we are going to insert
- * @nr:		size of @cpu_key/@data_size arrays
+ * @batch:      information about the batch of items to insert
 */
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
-			    const struct btrfs_key *cpu_key, u32 *data_size,
-			    int nr)
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+				   const struct btrfs_item_batch *batch)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_item *item;
@@ -3804,14 +3767,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 	int slot;
 	struct btrfs_map_token token;
 	u32 total_size;
-	u32 total_data = 0;
-
-	for (i = 0; i < nr; i++)
-		total_data += data_size[i];
-	total_size = total_data + (nr * sizeof(struct btrfs_item));

+	/*
+	 * Before anything else, update keys in the parent and other ancestors
+	 * if needed, then release the write locks on them, so that other tasks
+	 * can use them while we modify the leaf.
+	 */
 	if (path->slots[0] == 0) {
-		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
 		fixup_low_keys(path, &disk_key, 1);
 	}
 	btrfs_unlock_up_safe(path, 1);
@@ -3821,6 +3784,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,

 	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(leaf);
+	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

 	if (btrfs_leaf_free_space(leaf) < total_size) {
 		btrfs_print_leaf(leaf);
@@ -3850,31 +3814,32 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 			item = btrfs_item_nr(i);
 			ioff = btrfs_token_item_offset(&token, item);
 			btrfs_set_token_item_offset(&token, item,
-						    ioff - total_data);
+						    ioff - batch->total_data_size);
 		}
 		/* shift the items */
-		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
 			      btrfs_item_nr_offset(slot),
 			      (nritems - slot) * sizeof(struct btrfs_item));

 		/* shift the data */
 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
-			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
-			      data_end, old_data - data_end);
+				      data_end - batch->total_data_size,
+				      BTRFS_LEAF_DATA_OFFSET + data_end,
+				      old_data - data_end);
 		data_end = old_data;
 	}

 	/* setup the item for the new data */
-	for (i = 0; i < nr; i++) {
-		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+	for (i = 0; i < batch->nr; i++) {
+		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
 		btrfs_set_item_key(leaf, &disk_key, slot + i);
 		item = btrfs_item_nr(slot + i);
-		data_end -= data_size[i];
+		data_end -= batch->data_sizes[i];
 		btrfs_set_token_item_offset(&token, item, data_end);
-		btrfs_set_token_item_size(&token, item, data_size[i]);
+		btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
 	}

-	btrfs_set_header_nritems(leaf, nritems + nr);
+	btrfs_set_header_nritems(leaf, nritems + batch->nr);
 	btrfs_mark_buffer_dirty(leaf);

 	if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3883,6 +3848,29 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 	}
 }

+/*
+ * Insert a new item into a leaf.
+ *
+ * @root:      The root of the btree.
+ * @path:      A path pointing to the target leaf and slot.
+ * @key:       The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 const struct btrfs_key *key,
+				 u32 data_size)
+{
+	struct btrfs_item_batch batch;
+
+	batch.keys = key;
+	batch.data_sizes = &data_size;
+	batch.total_data_size = data_size;
+	batch.nr = 1;
+
+	setup_items_for_insert(root, path, &batch);
+}
+
 /*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
@@ -3890,20 +3878,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
 			    struct btrfs_path *path,
-			    const struct btrfs_key *cpu_key, u32 *data_size,
-			    int nr)
+			    const struct btrfs_item_batch *batch)
 {
 	int ret = 0;
 	int slot;
-	int i;
-	u32 total_size = 0;
-	u32 total_data = 0;
-
-	for (i = 0; i < nr; i++)
-		total_data += data_size[i];
+	u32 total_size;

-	total_size = total_data + (nr * sizeof(struct btrfs_item));
-	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+	ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
 	if (ret == 0)
 		return -EEXIST;
 	if (ret < 0)
@@ -3912,7 +3894,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	slot = path->slots[0];
 	BUG_ON(slot < 0);

-	setup_items_for_insert(root, path, cpu_key, data_size, nr);
+	setup_items_for_insert(root, path, batch);
 	return 0;
 }

@@ -3943,6 +3925,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return ret;
 }

+/*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 struct btrfs_path *path,
+			 const struct btrfs_key *new_key)
+{
+	struct extent_buffer *leaf;
+	int ret;
+	u32 item_size;
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ret = setup_leaf_for_split(trans, root, path,
+				   item_size + sizeof(struct btrfs_item));
+	if (ret)
+		return ret;
+
+	path->slots[0]++;
+	btrfs_setup_item_for_insert(root, path, new_key, item_size);
+	leaf = path->nodes[0];
+	memcpy_extent_buffer(leaf,
+			     btrfs_item_ptr_offset(leaf, path->slots[0]),
+			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+			     item_size);
+	return 0;
+}
+
 /*
 * delete the pointer from a given node.
 *

--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,6 +48,7 @@ extern struct kmem_cache *btrfs_free_space_cachep;
 extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
 struct btrfs_ordered_sum;
 struct btrfs_ref;
+struct btrfs_bio;

 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */

@@ -217,6 +218,9 @@ struct btrfs_root_backup {
 	u8 unused_8[10];
 } __attribute__ ((__packed__));

+#define BTRFS_SUPER_INFO_OFFSET			SZ_64K
+#define BTRFS_SUPER_INFO_SIZE			4096
+
 /*
 * the super block basically lists the main trees of the FS
 * it currently lacks any block count etc etc
@@ -269,7 +273,11 @@ struct btrfs_super_block {
 	__le64 reserved[28];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+	/* Padded to 4096 bytes */
+	u8 padding[565];
 } __attribute__ ((__packed__));
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);

 /*
 * Compat flags that we support.  If any incompat flags are set other than the
@@ -899,6 +907,7 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *scrub_workers;
 	struct btrfs_workqueue *scrub_wr_completion_workers;
 	struct btrfs_workqueue *scrub_parity_workers;
+	struct btrfs_subpage_info *subpage_info;

 	struct btrfs_discard_ctl discard_ctl;

@@ -1017,6 +1026,16 @@ struct btrfs_fs_info {
 	spinlock_t treelog_bg_lock;
 	u64 treelog_bg;

+	/*
+	 * Start of the dedicated data relocation block group, protected by
+	 * relocation_bg_lock.
+	 */
+	spinlock_t relocation_bg_lock;
+	u64 data_reloc_bg;
+
+	spinlock_t zone_active_bgs_lock;
+	struct list_head zone_active_bgs;
+
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	spinlock_t ref_verify_lock;
 	struct rb_root block_tree;
@@ -2885,16 +2904,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 	return btrfs_del_items(trans, root, path, path->slots[0], 1);
 }

-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
-			    const struct btrfs_key *cpu_key, u32 *data_size,
-			    int nr);
+/*
+ * Describes a batch of items to insert in a btree. This is used by
+ * btrfs_insert_empty_items().
+ */
+struct btrfs_item_batch {
+	/*
+	 * Pointer to an array containing the keys of the items to insert (in
+	 * sorted order).
+	 */
+	const struct btrfs_key *keys;
+	/* Pointer to an array containing the data size for each item to insert. */
+	const u32 *data_sizes;
+	/*
+	 * The sum of data sizes for all items. The caller can compute this while
+	 * setting up the data_sizes array, so it ends up being more efficient
+	 * than having btrfs_insert_empty_items() or setup_item_for_insert()
+	 * doing it, as it would avoid an extra loop over a potentially large
+	 * array, and in the case of setup_item_for_insert(), we would be doing
+	 * it while holding a write lock on a leaf and often on upper level nodes
+	 * too, unnecessarily increasing the size of a critical section.
+	 */
+	u32 total_data_size;
+	/* Size of the keys and data_sizes arrays (number of items in the batch). */
+	int nr;
+};
+
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 const struct btrfs_key *key,
+				 u32 data_size);
 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      const struct btrfs_key *key, void *data, u32 data_size);
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path,
-			     const struct btrfs_key *cpu_key, u32 *data_size,
-			     int nr);
+			     const struct btrfs_item_batch *batch);

 static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
@@ -2902,7 +2947,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 					  const struct btrfs_key *key,
 					  u32 data_size)
 {
-	return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+	struct btrfs_item_batch batch;
+
+	batch.keys = key;
+	batch.data_sizes = &data_size;
+	batch.total_data_size = data_size;
+	batch.nr = 1;
+
+	return btrfs_insert_empty_items(trans, root, path, &batch);
 }

 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
@@ -3129,8 +3181,9 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
 /* inode.c */
 blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
 				   int mirror_num, unsigned long bio_flags);
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
-				    struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+				    u32 bio_offset, struct page *page,
+				    u64 start, u64 end);
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 					   u64 start, u64 len);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3142,7 +3195,6 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root,
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
 		       struct btrfs_inode *dir, struct btrfs_inode *inode,
 		       const char *name, int name_len);
 int btrfs_add_link(struct btrfs_trans_handle *trans,
@@ -3174,8 +3226,6 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
 				 struct extent_state *other);
 void btrfs_split_delalloc_extent(struct inode *inode,
 				 struct extent_state *orig, u64 split);
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
-			     unsigned long bio_flags);
 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
@@ -3242,9 +3292,9 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
 int btrfs_ioctl_get_supported_features(void __user *arg);
 void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
 int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 		      struct btrfs_ioctl_defrag_range_args *range,
-		      u64 newer_than, unsigned long max_pages);
+		      u64 newer_than, unsigned long max_to_defrag);
 void btrfs_get_block_group_info(struct list_head *groups_list,
 				struct btrfs_ioctl_space_info *space);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
@@ -3563,6 +3613,9 @@ do {								\
 			  (errno), fmt, ##args);		\
 } while (0)

+#define BTRFS_FS_ERROR(fs_info)	(unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+						   &(fs_info)->fs_state)))
+
 __printf(5, 6)
 __cold
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
@@ -3842,6 +3895,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
 	return fs_info->zoned != 0;
 }

+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
+{
+	return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
+}
+
 /*
 * We use page status Private2 to indicate there is an ordered extent with
 * unfinished IO.

--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -679,19 +679,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_path *path,
 				     struct btrfs_delayed_item *first_item)
 {
-	LIST_HEAD(batch);
+	LIST_HEAD(item_list);
 	struct btrfs_delayed_item *curr;
 	struct btrfs_delayed_item *next;
 	const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+	struct btrfs_item_batch batch;
 	int total_size;
-	int nitems;
 	char *ins_data = NULL;
-	struct btrfs_key *ins_keys;
-	u32 *ins_sizes;
 	int ret;

-	list_add_tail(&first_item->tree_list, &batch);
-	nitems = 1;
+	list_add_tail(&first_item->tree_list, &item_list);
+	batch.total_data_size = first_item->data_len;
+	batch.nr = 1;
 	total_size = first_item->data_len + sizeof(struct btrfs_item);
 	curr = first_item;

@@ -706,39 +705,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 		if (total_size + next_size > max_size)
 			break;

-		list_add_tail(&next->tree_list, &batch);
-		nitems++;
+		list_add_tail(&next->tree_list, &item_list);
+		batch.nr++;
 		total_size += next_size;
+		batch.total_data_size += next->data_len;
 		curr = next;
 	}

-	if (nitems == 1) {
-		ins_keys = &first_item->key;
-		ins_sizes = &first_item->data_len;
+	if (batch.nr == 1) {
+		batch.keys = &first_item->key;
+		batch.data_sizes = &first_item->data_len;
 	} else {
+		struct btrfs_key *ins_keys;
+		u32 *ins_sizes;
 		int i = 0;

-		ins_data = kmalloc(nitems * sizeof(u32) +
-				   nitems * sizeof(struct btrfs_key), GFP_NOFS);
+		ins_data = kmalloc(batch.nr * sizeof(u32) +
+				   batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
 		if (!ins_data) {
 			ret = -ENOMEM;
 			goto out;
 		}
 		ins_sizes = (u32 *)ins_data;
-		ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
-		list_for_each_entry(curr, &batch, tree_list) {
+		ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
+		batch.keys = ins_keys;
+		batch.data_sizes = ins_sizes;
+		list_for_each_entry(curr, &item_list, tree_list) {
 			ins_keys[i] = curr->key;
 			ins_sizes[i] = curr->data_len;
 			i++;
 		}
 	}

-	ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
-				       nitems);
+	ret = btrfs_insert_empty_items(trans, root, path, &batch);
 	if (ret)
 		goto out;

-	list_for_each_entry(curr, &batch, tree_list) {
+	list_for_each_entry(curr, &item_list, tree_list) {
 		char *data_ptr;

 		data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
@@ -754,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_release_path(path);

-	list_for_each_entry_safe(curr, next, &batch, tree_list) {
+	list_for_each_entry_safe(curr, next, &item_list, tree_list) {
 		list_del(&curr->tree_list);
 		btrfs_delayed_item_release_metadata(root, curr);
 		btrfs_release_delayed_item(curr);

--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -906,7 +906,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	u64 parent = generic_ref->parent;
 	u8 ref_type;

-	is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+	is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);

 	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
 	BUG_ON(extent_op && extent_op->is_data);
@@ -921,8 +921,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	}

 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
-	    is_fstree(generic_ref->real_root) &&
-	    is_fstree(generic_ref->tree_ref.root) &&
 	    !generic_ref->skip_qgroup) {
 		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
@@ -938,14 +936,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 		ref_type = BTRFS_TREE_BLOCK_REF_KEY;

 	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
-				generic_ref->tree_ref.root, action, ref_type);
-	ref->root = generic_ref->tree_ref.root;
+				generic_ref->tree_ref.owning_root, action,
+				ref_type);
+	ref->root = generic_ref->tree_ref.owning_root;
 	ref->parent = parent;
 	ref->level = level;

 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
-			      generic_ref->tree_ref.root, 0, action, false,
-			      is_system);
+			      generic_ref->tree_ref.owning_root, 0, action,
+			      false, is_system);
 	head_ref->extent_op = extent_op;

 	delayed_refs = &trans->transaction->delayed_refs;
@@ -997,7 +996,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	u64 bytenr = generic_ref->bytenr;
 	u64 num_bytes = generic_ref->len;
 	u64 parent = generic_ref->parent;
-	u64 ref_root = generic_ref->data_ref.ref_root;
+	u64 ref_root = generic_ref->data_ref.owning_root;
 	u64 owner = generic_ref->data_ref.ino;
 	u64 offset = generic_ref->data_ref.offset;
 	u8 ref_type;
@@ -1026,8 +1025,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	}

 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
-	    is_fstree(ref_root) &&
-	    is_fstree(generic_ref->real_root) &&
 	    !generic_ref->skip_qgroup) {
 		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {

--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -186,8 +186,8 @@ enum btrfs_ref_type {
 struct btrfs_data_ref {
 	/* For EXTENT_DATA_REF */

-	/* Root which refers to this data extent */
-	u64 ref_root;
+	/* Original root this data extent belongs to */
+	u64 owning_root;

 	/* Inode which refers to this data extent */
 	u64 ino;
@@ -210,11 +210,11 @@ struct btrfs_tree_ref {
 	int level;

 	/*
-	 * Root which refers to this tree block.
+	 * Root which owns this tree block.
 	 *
 	 * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
 	 */
-	u64 root;
+	u64 owning_root;

 	/* For non-skinny metadata, no special member needed */
 };
@@ -231,17 +231,10 @@ struct btrfs_ref {
 	 */
 	bool skip_qgroup;

-	/*
-	 * Optional. For which root is this modification.
-	 * Mostly used for qgroup optimization.
-	 *
-	 * When unset, data/tree ref init code will populate it.
-	 * In certain cases, we're modifying reference for a different root.
-	 * E.g. COW fs tree blocks for balance.
-	 * In that case, tree_ref::root will be fs tree, but we're doing this
-	 * for reloc tree, then we should set @real_root to reloc tree.
-	 */
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+	/* Through which root is this modification. */
 	u64 real_root;
+#endif
 	u64 bytenr;
 	u64 len;

@@ -271,26 +264,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
 }

 static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
-				int level, u64 root)
+				int level, u64 root, u64 mod_root, bool skip_qgroup)
 {
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	/* If @real_root not set, use @root as fallback */
-	if (!generic_ref->real_root)
-		generic_ref->real_root = root;
+	generic_ref->real_root = mod_root ?: root;
+#endif
 	generic_ref->tree_ref.level = level;
-	generic_ref->tree_ref.root = root;
+	generic_ref->tree_ref.owning_root = root;
 	generic_ref->type = BTRFS_REF_METADATA;
+	if (skip_qgroup || !(is_fstree(root) &&
+			     (!mod_root || is_fstree(mod_root))))
+		generic_ref->skip_qgroup = true;
+	else
+		generic_ref->skip_qgroup = false;
+
 }

 static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
-				u64 ref_root, u64 ino, u64 offset)
+				u64 ref_root, u64 ino, u64 offset, u64 mod_root,
+				bool skip_qgroup)
 {
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	/* If @real_root not set, use @root as fallback */
-	if (!generic_ref->real_root)
-		generic_ref->real_root = ref_root;
-	generic_ref->data_ref.ref_root = ref_root;
+	generic_ref->real_root = mod_root ?: ref_root;
+#endif
+	generic_ref->data_ref.owning_root = ref_root;
 	generic_ref->data_ref.ino = ino;
 	generic_ref->data_ref.offset = offset;
 	generic_ref->type = BTRFS_REF_DATA;
+	if (skip_qgroup || !(is_fstree(ref_root) &&
+			     (!mod_root || is_fstree(mod_root))))
+		generic_ref->skip_qgroup = true;
+	else
+		generic_ref->skip_qgroup = false;
 }

 static inline struct btrfs_delayed_extent_op *

--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -70,6 +70,7 @@ static int btrfs_dev_replace_kthread(void *data);

 int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
 	struct btrfs_key key;
 	struct btrfs_root *dev_root = fs_info->dev_root;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
@@ -100,8 +101,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
 		 * We don't have a replace item or it's corrupted.  If there is
 		 * a replace target, fail the mount.
 		 */
-		if (btrfs_find_device(fs_info->fs_devices,
-				      BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+		if (btrfs_find_device(fs_info->fs_devices, &args)) {
 			btrfs_err(fs_info,
 			"found replace target device without a valid replace item");
 			ret = -EUCLEAN;
@@ -163,8 +163,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
 		 * We don't have an active replace item but if there is a
 		 * replace target, fail the mount.
 		 */
-		if (btrfs_find_device(fs_info->fs_devices,
-				      BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+		if (btrfs_find_device(fs_info->fs_devices, &args)) {
 			btrfs_err(fs_info,
 			"replace devid present without an active replace item");
 			ret = -EUCLEAN;
@@ -175,11 +174,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
 		break;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
-		dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
-						src_devid, NULL, NULL);
-		dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
-							BTRFS_DEV_REPLACE_DEVID,
-							NULL, NULL);
+		dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+		args.devid = src_devid;
+		dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+
 		/*
 		 * allow 'btrfs dev replace_cancel' if src/tgt device is
 		 * missing

--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -683,7 +683,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
 	return ret;
 }

-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
 				   struct page *page, u64 start, u64 end,
 				   int mirror)
 {
@@ -1036,7 +1036,7 @@ static int btree_set_page_dirty(struct page *page)
 		BUG_ON(!eb);
 		BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 		BUG_ON(!atomic_read(&eb->refs));
-		btrfs_assert_tree_locked(eb);
+		btrfs_assert_tree_write_locked(eb);
 		return __set_page_dirty_nobuffers(page);
 	}
 	ASSERT(PagePrivate(page) && page->private);
@@ -1061,7 +1061,7 @@ static int btree_set_page_dirty(struct page *page)
 		ASSERT(eb);
 		ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 		ASSERT(atomic_read(&eb->refs));
-		btrfs_assert_tree_locked(eb);
+		btrfs_assert_tree_write_locked(eb);
 		free_extent_buffer(eb);

 		cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
@@ -1125,7 +1125,7 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
 	struct btrfs_fs_info *fs_info = buf->fs_info;
 	if (btrfs_header_generation(buf) ==
 	    fs_info->running_transaction->transid) {
-		btrfs_assert_tree_locked(buf);
+		btrfs_assert_tree_write_locked(buf);

 		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
 			percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
@@ -1500,7 +1500,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
 		goto fail;

 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
-	    root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+	    !btrfs_is_data_reloc_root(root)) {
 		set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
 		btrfs_check_and_init_root_item(&root->root_item);
 	}
@@ -1644,6 +1644,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_extent_buffer_leak_debug_check(fs_info);
 	kfree(fs_info->super_copy);
 	kfree(fs_info->super_for_commit);
+	kfree(fs_info->subpage_info);
 	kvfree(fs_info);
 }

@@ -1953,8 +1954,7 @@ static int transaction_kthread(void *arg)
 		wake_up_process(fs_info->cleaner_kthread);
 		mutex_unlock(&fs_info->transaction_kthread_mutex);

-		if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
-				      &fs_info->fs_state)))
+		if (BTRFS_FS_ERROR(fs_info))
 			btrfs_cleanup_transaction(fs_info);
 		if (!kthread_should_stop() &&
 				(!btrfs_transaction_blocked(fs_info) ||
@@ -2592,8 +2592,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,

 	/*
 	 * For 4K page size, we only support 4K sector size.
-	 * For 64K page size, we support read-write for 64K sector size, and
-	 * read-only for 4K sector size.
+	 * For 64K page size, we support 64K and 4K sector sizes.
 	 */
 	if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
 	    (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
@@ -2883,6 +2882,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->unused_bgs_lock);
 	spin_lock_init(&fs_info->treelog_bg_lock);
+	spin_lock_init(&fs_info->zone_active_bgs_lock);
+	spin_lock_init(&fs_info->relocation_bg_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->unused_bg_unpin_mutex);
 	mutex_init(&fs_info->reclaim_bgs_lock);
@@ -2896,6 +2897,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	INIT_LIST_HEAD(&fs_info->unused_bgs);
 	INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+	INIT_LIST_HEAD(&fs_info->zone_active_bgs);
 #ifdef CONFIG_BTRFS_DEBUG
 	INIT_LIST_HEAD(&fs_info->allocated_roots);
 	INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -3228,12 +3230,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 	btrfs_init_btree_inode(fs_info);

-	invalidate_bdev(fs_devices->latest_bdev);
+	invalidate_bdev(fs_devices->latest_dev->bdev);

 	/*
 	 * Read super block and check the signature bytes only
 	 */
-	disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+	disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
 	if (IS_ERR(disk_super)) {
 		err = PTR_ERR(disk_super);
 		goto fail_alloc;
@@ -3392,12 +3394,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_alloc;
 	}

-	if (sectorsize != PAGE_SIZE) {
+	if (sectorsize < PAGE_SIZE) {
+		struct btrfs_subpage_info *subpage_info;
+
 		btrfs_warn(fs_info,
 		"read-write for sector size %u with page size %lu is experimental",
 			   sectorsize, PAGE_SIZE);
-	}
-	if (sectorsize != PAGE_SIZE) {
 		if (btrfs_super_incompat_flags(fs_info->super_copy) &
 			BTRFS_FEATURE_INCOMPAT_RAID56) {
 			btrfs_err(fs_info,
@@ -3406,6 +3408,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 			err = -EINVAL;
 			goto fail_alloc;
 		}
+		subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+		if (!subpage_info)
+			goto fail_alloc;
+		btrfs_init_subpage_info(subpage_info, sectorsize);
+		fs_info->subpage_info = subpage_info;
 	}

 	ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -3465,7 +3472,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	 * below in btrfs_init_dev_replace().
 	 */
 	btrfs_free_extra_devids(fs_devices);
-	if (!fs_devices->latest_bdev) {
+	if (!fs_devices->latest_dev->bdev) {
 		btrfs_err(fs_info, "failed to read devices");
 		goto fail_tree_roots;
 	}
@@ -3556,7 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_sysfs;
 	}

-	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+	if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+	    !btrfs_check_rw_degradable(fs_info, NULL)) {
 		btrfs_warn(fs_info,
 		"writable mount is not allowed due to too many missing devices");
 		goto fail_sysfs;
@@ -3881,7 +3889,9 @@ static int write_dev_supers(struct btrfs_device *device,
 			bio->bi_opf |= REQ_FUA;

 		btrfsic_submit_bio(bio);
-		btrfs_advance_sb_log(device, i);
+
+		if (btrfs_advance_sb_log(device, i))
+			errors++;
 	}
 	return errors < i ? 0 : -1;
 }
@@ -4221,7 +4231,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 		drop_ref = true;
 	spin_unlock(&fs_info->fs_roots_radix_lock);

-	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+	if (BTRFS_FS_ERROR(fs_info)) {
 		ASSERT(root->log_root == NULL);
 		if (root->reloc_root) {
 			btrfs_put_root(root->reloc_root);
@@ -4372,8 +4382,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 			btrfs_err(fs_info, "commit super ret %d", ret);
 	}

-	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
-	    test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+	if (BTRFS_FS_ERROR(fs_info))
 		btrfs_error_commit_super(fs_info);

 	kthread_stop(fs_info->transaction_kthread);
@@ -4470,7 +4479,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
 		return;
 #endif
-	btrfs_assert_tree_locked(buf);
+	btrfs_assert_tree_write_locked(buf);
 	if (transid != fs_info->generation)
 		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
 			buf->start, transid, fs_info->generation);

--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -6,9 +6,6 @@
 #ifndef BTRFS_DISK_IO_H
 #define BTRFS_DISK_IO_H

-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12

@@ -81,7 +78,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
 void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 				 struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
 				   struct page *page, u64 start, u64 end,
 				   int mirror);
 blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,

--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,7 @@ enum {
 	/* write IO error */
 	EXTENT_BUFFER_WRITE_ERR,
 	EXTENT_BUFFER_NO_CHECK,
+	EXTENT_BUFFER_ZONE_FINISH,
 };

 /* these are flags for __process_pages_contig */
@@ -183,8 +184,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		      struct btrfs_bio_ctrl *bio_ctrl,
 		      unsigned int read_flags, u64 *prev_em_start);
 int extent_write_full_page(struct page *page, struct writeback_control *wbc);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
-			      int mode);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
 int extent_writepages(struct address_space *mapping,
 		      struct writeback_control *wbc);
 int btree_write_cache_pages(struct address_space *mapping,
@@ -277,14 +277,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 				  struct page *locked_page,
 				  u32 bits_to_clear, unsigned long page_ops);
-struct bio *btrfs_bio_alloc(u64 first_byte);
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
 struct bio *btrfs_bio_clone(struct bio *bio);
 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);

-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
-		      u64 length, u64 logical, struct page *page,
-		      unsigned int pg_offset, int mirror_num);
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);


--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -360,7 +360,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
 	int i;

 	for (i = 0; i < map->num_stripes; i++) {
-		struct btrfs_bio_stripe *stripe = &map->stripes[i];
+		struct btrfs_io_stripe *stripe = &map->stripes[i];
 		struct btrfs_device *device = stripe->dev;

 		set_extent_bits_nowait(&device->alloc_state, stripe->physical,
@@ -375,7 +375,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
 	int i;

 	for (i = 0; i < map->num_stripes; i++) {
-		struct btrfs_bio_stripe *stripe = &map->stripes[i];
+		struct btrfs_io_stripe *stripe = &map->stripes[i];
 		struct btrfs_device *device = stripe->dev;

 		__clear_extent_bit(&device->alloc_state, stripe->physical,

--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -358,7 +358,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
 * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
 *       checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
 *       NULL, the checksum buffer is allocated and returned in
- *       btrfs_io_bio(bio)->csum instead.
+ *       btrfs_bio(bio)->csum instead.
 *
 * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
 */
@@ -397,19 +397,18 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
 		return BLK_STS_RESOURCE;

 	if (!dst) {
-		struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+		struct btrfs_bio *bbio = btrfs_bio(bio);

 		if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
-			btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
-							GFP_NOFS);
-			if (!btrfs_bio->csum) {
+			bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+			if (!bbio->csum) {
 				btrfs_free_path(path);
 				return BLK_STS_RESOURCE;
 			}
 		} else {
-			btrfs_bio->csum = btrfs_bio->csum_inline;
+			bbio->csum = bbio->csum_inline;
 		}
-		csum = btrfs_bio->csum;
+		csum = bbio->csum;
 	} else {
 		csum = dst;
 	}
@@ -709,12 +708,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
 				index = 0;
 			}

-			data = kmap_atomic(bvec.bv_page);
-			crypto_shash_digest(shash, data + bvec.bv_offset
-					    + (i * fs_info->sectorsize),
+			data = bvec_kmap_local(&bvec);
+			crypto_shash_digest(shash,
+					    data + (i * fs_info->sectorsize),
 					    fs_info->sectorsize,
 					    sums->sums + index);
-			kunmap_atomic(data);
+			kunmap_local(data);
 			index += fs_info->csum_size;
 			offset += fs_info->sectorsize;
 			this_sum_bytes += fs_info->sectorsize;

--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -437,9 +437,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 /*
 * unlocks pages after btrfs_file_write is done with them
 */
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+			     struct page **pages, size_t num_pages,
+			     u64 pos, u64 copied)
 {
 	size_t i;
+	u64 block_start = round_down(pos, fs_info->sectorsize);
+	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+	ASSERT(block_len <= U32_MAX);
 	for (i = 0; i < num_pages; i++) {
 		/* page checked is some magic around finding pages that
 		 * have been modified without going through btrfs_set_page_dirty
@@ -447,7 +453,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 		 * accessed as prepare_pages should have marked them accessed
 		 * in prepare_pages via find_or_create_page()
 		 */
-		ClearPageChecked(pages[i]);
+		btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+					       block_len);
 		unlock_page(pages[i]);
 		put_page(pages[i]);
 	}
@@ -504,7 +511,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 		struct page *p = pages[i];

 		btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
-		ClearPageChecked(p);
+		btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
 		btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
 	}

@@ -869,7 +876,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 				btrfs_init_data_ref(&ref,
 						root->root_key.objectid,
 						new_key.objectid,
-						args->start - extent_offset);
+						args->start - extent_offset,
+						0, false);
 				ret = btrfs_inc_extent_ref(trans, &ref);
 				BUG_ON(ret); /* -ENOMEM */
 			}
@@ -955,7 +963,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 				btrfs_init_data_ref(&ref,
 						root->root_key.objectid,
 						key.objectid,
-						key.offset - extent_offset);
+						key.offset - extent_offset, 0,
+						false);
 				ret = btrfs_free_extent(trans, &ref);
 				BUG_ON(ret); /* -ENOMEM */
 				args->bytes_found += extent_end - key.offset;
@@ -1020,8 +1029,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
 				path->slots[0]++;
 		}
-		setup_items_for_insert(root, path, &key,
-				       &args->extent_item_size, 1);
+		btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
 		args->extent_inserted = true;
 	}

@@ -1232,7 +1240,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
 				       num_bytes, 0);
 		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
-				    orig_offset);
+				    orig_offset, 0, false);
 		ret = btrfs_inc_extent_ref(trans, &ref);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
@@ -1257,7 +1265,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	other_end = 0;
 	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
 			       num_bytes, 0);
-	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
+	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+			    0, false);
 	if (extent_mergeable(leaf, path->slots[0] + 1,
 			     ino, bytenr, orig_offset,
 			     &other_start, &other_end)) {
@@ -1844,7 +1853,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,

 		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
 		if (ret) {
-			btrfs_drop_pages(pages, num_pages);
+			btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
 			break;
 		}

@@ -1852,7 +1861,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 		if (only_release_metadata)
 			btrfs_check_nocow_unlock(BTRFS_I(inode));

-		btrfs_drop_pages(pages, num_pages);
+		btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);

 		cond_resched();

@@ -2012,7 +2021,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	 * have opened a file as writable, we have to stop this write operation
 	 * to ensure consistency.
 	 */
-	if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
+	if (BTRFS_FS_ERROR(inode->root->fs_info))
 		return -EROFS;

 	if (!(iocb->ki_flags & IOCB_DIRECT) &&
@@ -2620,7 +2629,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
 				       extent_info->disk_len, 0);
 		ref_offset = extent_info->file_offset - extent_info->data_offset;
 		btrfs_init_data_ref(&ref, root->root_key.objectid,
-				    btrfs_ino(inode), ref_offset);
+				    btrfs_ino(inode), ref_offset, 0, false);
 		ret = btrfs_inc_extent_ref(trans, &ref);
 	}


--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,7 @@
 #include "delalloc-space.h"
 #include "block-group.h"
 #include "discard.h"
+#include "subpage.h"

 #define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
 #define MAX_CACHE_BYTES_PER_GIG	SZ_64K
@@ -411,7 +412,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)

 	for (i = 0; i < io_ctl->num_pages; i++) {
 		if (io_ctl->pages[i]) {
-			ClearPageChecked(io_ctl->pages[i]);
+			btrfs_page_clear_checked(io_ctl->fs_info,
+					io_ctl->pages[i],
+					page_offset(io_ctl->pages[i]),
+					PAGE_SIZE);
 			unlock_page(io_ctl->pages[i]);
 			put_page(io_ctl->pages[i]);
 		}
@@ -2539,10 +2543,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 	u64 offset = bytenr - block_group->start;
 	u64 to_free, to_unusable;
 	const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+	bool initial = (size == block_group->length);
+	u64 reclaimable_unusable;
+
+	WARN_ON(!initial && offset + size > block_group->zone_capacity);

 	spin_lock(&ctl->tree_lock);
 	if (!used)
 		to_free = size;
+	else if (initial)
+		to_free = block_group->zone_capacity;
 	else if (offset >= block_group->alloc_offset)
 		to_free = size;
 	else if (offset + size <= block_group->alloc_offset)
@@ -2565,12 +2575,15 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 		spin_unlock(&block_group->lock);
 	}

+	reclaimable_unusable = block_group->zone_unusable -
+			       (block_group->length - block_group->zone_capacity);
 	/* All the region is now unusable. Mark it as unused and reclaim */
 	if (block_group->zone_unusable == block_group->length) {
 		btrfs_mark_bg_unused(block_group);
 	} else if (bg_reclaim_threshold &&
-		   block_group->zone_unusable >=
-		   div_factor_fine(block_group->length, bg_reclaim_threshold)) {
+		   reclaimable_unusable >=
+		   div_factor_fine(block_group->zone_capacity,
+				   bg_reclaim_threshold)) {
 		btrfs_mark_bg_to_reclaim(block_group);
 	}

@@ -2754,8 +2767,9 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
 	 * out the free space after the allocation offset.
 	 */
 	if (btrfs_is_zoned(fs_info)) {
-		btrfs_info(fs_info, "free space %llu",
-			   block_group->length - block_group->alloc_offset);
+		btrfs_info(fs_info, "free space %llu active %d",
+			   block_group->zone_capacity - block_group->alloc_offset,
+			   block_group->zone_is_active);
 		return;
 	}


--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -96,11 +96,12 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);

 #ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
-	lockdep_assert_held(&eb->lock);
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
+{
+	lockdep_assert_held_write(&eb->lock);
 }
 #else
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
 #endif

 void btrfs_unlock_up_safe(struct btrfs_path *path, int level);

--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -32,19 +32,19 @@
 *     payload.
 *     One regular LZO compressed extent can have one or more segments.
 *     For inlined LZO compressed extent, only one segment is allowed.
- *     One segment represents at most one page of uncompressed data.
+ *     One segment represents at most one sector of uncompressed data.
 *
 * 2.1 Segment header
 *     Fixed size. LZO_LEN (4) bytes long, LE32.
 *     Records the total size of the segment (not including the header).
- *     Segment header never crosses page boundary, thus it's possible to
- *     have at most 3 padding zeros at the end of the page.
+ *     Segment header never crosses sector boundary, thus it's possible to
+ *     have at most 3 padding zeros at the end of the sector.
 *
 * 2.2 Data Payload
- *     Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
- *     which is 4419 for a 4KiB page.
+ *     Variable size. Size up limit should be lzo1x_worst_compress(sectorsize)
+ *     which is 4419 for a 4KiB sectorsize.
 *
- * Example:
+ * Example with 4K sectorsize:
 * Page 1:
 *          0     0x2   0x4   0x6   0x8   0xa   0xc   0xe     0x10
 * 0x0000   |  Header   | SegHdr 01 | Data payload 01 ...     |
@@ -112,170 +112,174 @@ static inline size_t read_compress_length(const char *buf)
 	return le32_to_cpu(dlen);
 }

+/*
+ * Will do:
+ *
+ * - Write a segment header into the destination
+ * - Copy the compressed buffer into the destination
+ * - Make sure we have enough space in the last sector to fit a segment header
+ *   If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros.
+ *
+ * Will allocate new pages when needed.
+ */
+static int copy_compressed_data_to_page(char *compressed_data,
+					size_t compressed_size,
+					struct page **out_pages,
+					u32 *cur_out,
+					const u32 sectorsize)
+{
+	u32 sector_bytes_left;
+	u32 orig_out;
+	struct page *cur_page;
+	char *kaddr;
+
+	/*
+	 * We never allow a segment header crossing sector boundary, previous
+	 * run should ensure we have enough space left inside the sector.
+	 */
+	ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+
+	cur_page = out_pages[*cur_out / PAGE_SIZE];
+	/* Allocate a new page */
+	if (!cur_page) {
+		cur_page = alloc_page(GFP_NOFS);
+		if (!cur_page)
+			return -ENOMEM;
+		out_pages[*cur_out / PAGE_SIZE] = cur_page;
+	}
+
+	kaddr = kmap(cur_page);
+	write_compress_length(kaddr + offset_in_page(*cur_out),
+			      compressed_size);
+	*cur_out += LZO_LEN;
+
+	orig_out = *cur_out;
+
+	/* Copy compressed data */
+	while (*cur_out - orig_out < compressed_size) {
+		u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+				     orig_out + compressed_size - *cur_out);
+
+		kunmap(cur_page);
+		cur_page = out_pages[*cur_out / PAGE_SIZE];
+		/* Allocate a new page */
+		if (!cur_page) {
+			cur_page = alloc_page(GFP_NOFS);
+			if (!cur_page)
+				return -ENOMEM;
+			out_pages[*cur_out / PAGE_SIZE] = cur_page;
+		}
+		kaddr = kmap(cur_page);
+
+		memcpy(kaddr + offset_in_page(*cur_out),
+		       compressed_data + *cur_out - orig_out, copy_len);
+
+		*cur_out += copy_len;
+	}
+
+	/*
+	 * Check if we can fit the next segment header into the remaining space
+	 * of the sector.
+	 */
+	sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+	if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
+		goto out;
+
+	/* The remaining size is not enough, pad it with zeros */
+	memset(kaddr + offset_in_page(*cur_out), 0,
+	       sector_bytes_left);
+	*cur_out += sector_bytes_left;
+
+out:
+	kunmap(cur_page);
+	return 0;
+}
+
 int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
 		u64 start, struct page **pages, unsigned long *out_pages,
 		unsigned long *total_in, unsigned long *total_out)
 {
 	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+	struct page *page_in = NULL;
+	char *sizes_ptr;
 	int ret = 0;
-	char *data_in;
-	char *cpage_out, *sizes_ptr;
-	int nr_pages = 0;
-	struct page *in_page = NULL;
-	struct page *out_page = NULL;
-	unsigned long bytes_left;
-	unsigned long len = *total_out;
-	unsigned long nr_dest_pages = *out_pages;
-	const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
-	size_t in_len;
-	size_t out_len;
-	char *buf;
-	unsigned long tot_in = 0;
-	unsigned long tot_out = 0;
-	unsigned long pg_bytes_left;
-	unsigned long out_offset;
-	unsigned long bytes;
+	/* Points to the file offset of input data */
+	u64 cur_in = start;
+	/* Points to the current output byte */
+	u32 cur_out = 0;
+	u32 len = *total_out;

 	*out_pages = 0;
 	*total_out = 0;
 	*total_in = 0;

-	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-	data_in = kmap(in_page);
-
 	/*
-	 * store the size of all chunks of compressed data in
-	 * the first 4 bytes
+	 * Skip the header for now, we will later come back and write the total
+	 * compressed size
 	 */
-	out_page = alloc_page(GFP_NOFS);
-	if (out_page == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	cpage_out = kmap(out_page);
-	out_offset = LZO_LEN;
-	tot_out = LZO_LEN;
-	pages[0] = out_page;
-	nr_pages = 1;
-	pg_bytes_left = PAGE_SIZE - LZO_LEN;
-
-	/* compress at most one page of data each time */
-	in_len = min(len, PAGE_SIZE);
-	while (tot_in < len) {
-		ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
-				       &out_len, workspace->mem);
-		if (ret != LZO_E_OK) {
-			pr_debug("BTRFS: lzo in loop returned %d\n",
-			       ret);
+	cur_out += LZO_LEN;
+	while (cur_in < start + len) {
+		char *data_in;
+		const u32 sectorsize_mask = sectorsize - 1;
+		u32 sector_off = (cur_in - start) & sectorsize_mask;
+		u32 in_len;
+		size_t out_len;
+
+		/* Get the input page first */
+		if (!page_in) {
+			page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+			ASSERT(page_in);
+		}
+
+		/* Compress at most one sector of data each time */
+		in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+		ASSERT(in_len);
+		data_in = kmap(page_in);
+		ret = lzo1x_1_compress(data_in +
+				       offset_in_page(cur_in), in_len,
+				       workspace->cbuf, &out_len,
+				       workspace->mem);
+		kunmap(page_in);
+		if (ret < 0) {
+			pr_debug("BTRFS: lzo in loop returned %d\n", ret);
 			ret = -EIO;
 			goto out;
 		}

-		/* store the size of this chunk of compressed data */
-		write_compress_length(cpage_out + out_offset, out_len);
-		tot_out += LZO_LEN;
-		out_offset += LZO_LEN;
-		pg_bytes_left -= LZO_LEN;
-
-		tot_in += in_len;
-		tot_out += out_len;
-
-		/* copy bytes from the working buffer into the pages */
-		buf = workspace->cbuf;
-		while (out_len) {
-			bytes = min_t(unsigned long, pg_bytes_left, out_len);
-
-			memcpy(cpage_out + out_offset, buf, bytes);
-
-			out_len -= bytes;
-			pg_bytes_left -= bytes;
-			buf += bytes;
-			out_offset += bytes;
-
-			/*
-			 * we need another page for writing out.
-			 *
-			 * Note if there's less than 4 bytes left, we just
-			 * skip to a new page.
-			 */
-			if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
-			    pg_bytes_left == 0) {
-				if (pg_bytes_left) {
-					memset(cpage_out + out_offset, 0,
-					       pg_bytes_left);
-					tot_out += pg_bytes_left;
-				}
-
-				/* we're done, don't allocate new page */
-				if (out_len == 0 && tot_in >= len)
-					break;
-
-				kunmap(out_page);
-				if (nr_pages == nr_dest_pages) {
-					out_page = NULL;
-					ret = -E2BIG;
-					goto out;
-				}
-
-				out_page = alloc_page(GFP_NOFS);
-				if (out_page == NULL) {
-					ret = -ENOMEM;
-					goto out;
-				}
-				cpage_out = kmap(out_page);
-				pages[nr_pages++] = out_page;
-
-				pg_bytes_left = PAGE_SIZE;
-				out_offset = 0;
-			}
-		}
+		ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+						   pages, &cur_out, sectorsize);
+		if (ret < 0)
+			goto out;

-		/* we're making it bigger, give up */
-		if (tot_in > 8192 && tot_in < tot_out) {
+		cur_in += in_len;
+
+		/*
+		 * Check if we're making it bigger after two sectors.  And if
+		 * it is so, give up.
+		 */
+		if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
 			ret = -E2BIG;
 			goto out;
 		}

-		/* we're all done */
-		if (tot_in >= len)
-			break;
-
-		if (tot_out > max_out)
-			break;
-
-		bytes_left = len - tot_in;
-		kunmap(in_page);
-		put_page(in_page);
-
-		start += PAGE_SIZE;
-		in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-		data_in = kmap(in_page);
-		in_len = min(bytes_left, PAGE_SIZE);
-	}
-
-	if (tot_out >= tot_in) {
-		ret = -E2BIG;
-		goto out;
+		/* Check if we have reached page boundary */
+		if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+			put_page(page_in);
+			page_in = NULL;
+		}
 	}

-	/* store the size of all chunks of compressed data */
+	/* Store the size of all chunks of compressed data */
 	sizes_ptr = kmap_local_page(pages[0]);
-	write_compress_length(sizes_ptr, tot_out);
+	write_compress_length(sizes_ptr, cur_out);
 	kunmap_local(sizes_ptr);

 	ret = 0;
-	*total_out = tot_out;
-	*total_in = tot_in;
+	*total_out = cur_out;
+	*total_in = cur_in - start;
 out:
-	*out_pages = nr_pages;
-	if (out_page)
-		kunmap(out_page);
-
-	if (in_page) {
-		kunmap(in_page);
-		put_page(in_page);
-	}
-
+	*out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
 	return ret;
 }


--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -30,25 +30,23 @@ static inline int nr_data_stripes(const struct map_lookup *map)
 struct btrfs_raid_bio;
 struct btrfs_device;

-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
-			  struct btrfs_bio *bbio, u64 stripe_len,
-			  int mirror_num, int generic_io);
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
-			       struct btrfs_bio *bbio, u64 stripe_len);
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+			  u64 stripe_len, int mirror_num, int generic_io);
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+			u64 stripe_len);

 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
 			    u64 logical);

-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
-			       struct btrfs_bio *bbio, u64 stripe_len,
-			       struct btrfs_device *scrub_dev,
-			       unsigned long *dbitmap, int stripe_nsectors);
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+				struct btrfs_io_context *bioc, u64 stripe_len,
+				struct btrfs_device *scrub_dev,
+				unsigned long *dbitmap, int stripe_nsectors);
 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);

 struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
-			  struct btrfs_bio *bbio, u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+			  u64 length);
 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);

 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);

--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -227,7 +227,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err)
 }

 static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
-					  struct btrfs_bio *bbio)
+					  struct btrfs_io_context *bioc)
 {
 	struct btrfs_fs_info *fs_info = dev->fs_info;
 	int ret;
@@ -275,11 +275,11 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
 	kref_init(&zone->refcnt);
 	zone->elems = 0;
 	zone->device = dev; /* our device always sits at index 0 */
-	for (i = 0; i < bbio->num_stripes; ++i) {
+	for (i = 0; i < bioc->num_stripes; ++i) {
 		/* bounds have already been checked */
-		zone->devs[i] = bbio->stripes[i].dev;
+		zone->devs[i] = bioc->stripes[i].dev;
 	}
-	zone->ndevs = bbio->num_stripes;
+	zone->ndevs = bioc->num_stripes;

 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&dev->reada_zones,
@@ -309,7 +309,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
 	int ret;
 	struct reada_extent *re = NULL;
 	struct reada_extent *re_exist = NULL;
-	struct btrfs_bio *bbio = NULL;
+	struct btrfs_io_context *bioc = NULL;
 	struct btrfs_device *dev;
 	struct btrfs_device *prev_dev;
 	u64 length;
@@ -345,28 +345,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
 	 */
 	length = fs_info->nodesize;
 	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-			&length, &bbio, 0);
-	if (ret || !bbio || length < fs_info->nodesize)
+			      &length, &bioc, 0);
+	if (ret || !bioc || length < fs_info->nodesize)
 		goto error;

-	if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
+	if (bioc->num_stripes > BTRFS_MAX_MIRRORS) {
 		btrfs_err(fs_info,
 			   "readahead: more than %d copies not supported",
 			   BTRFS_MAX_MIRRORS);
 		goto error;
 	}

-	real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+	real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
 	for (nzones = 0; nzones < real_stripes; ++nzones) {
 		struct reada_zone *zone;

-		dev = bbio->stripes[nzones].dev;
+		dev = bioc->stripes[nzones].dev;

 		/* cannot read ahead on missing device. */
 		if (!dev->bdev)
 			continue;

-		zone = reada_find_zone(dev, logical, bbio);
+		zone = reada_find_zone(dev, logical, bioc);
 		if (!zone)
 			continue;

@@ -464,7 +464,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
 	if (!have_zone)
 		goto error;

-	btrfs_put_bbio(bbio);
+	btrfs_put_bioc(bioc);
 	return re;

 error:
@@ -488,7 +488,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
 		kref_put(&zone->refcnt, reada_zone_release);
 		spin_unlock(&fs_info->reada_lock);
 	}
-	btrfs_put_bbio(bbio);
+	btrfs_put_bioc(bioc);
 	kfree(re);
 	return re_exist;
 }

--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -678,10 +678,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,

 	if (generic_ref->type == BTRFS_REF_METADATA) {
 		if (!parent)
-			ref_root = generic_ref->tree_ref.root;
+			ref_root = generic_ref->tree_ref.owning_root;
 		owner = generic_ref->tree_ref.level;
 	} else if (!parent) {
-		ref_root = generic_ref->data_ref.ref_root;
+		ref_root = generic_ref->data_ref.owning_root;
 		owner = generic_ref->data_ref.ino;
 		offset = generic_ref->data_ref.offset;
 	}

--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -138,7 +138,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
 	}

 	btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
-	ClearPageChecked(page);
+	btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
 	btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
 out_unlock:
 	if (page) {
@@ -649,7 +649,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
 static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 			     struct inode *dst, u64 dst_loff)
 {
-	int ret;
+	int ret = 0;
 	u64 i, tail_len, chunk_count;
 	struct btrfs_root *root_dst = BTRFS_I(dst)->root;


--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -84,6 +84,8 @@ struct send_ctx {
 	u64 total_send_size;
 	u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
 	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
+	/* Protocol version compatibility requested */
+	u32 proto;

 	struct btrfs_root *send_root;
 	struct btrfs_root *parent_root;
@@ -312,6 +314,16 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
 		   sctx->parent_root->root_key.objectid : 0));
 }

+__maybe_unused
+static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
+{
+	switch (sctx->proto) {
+	case 1:	 return cmd < __BTRFS_SEND_C_MAX_V1;
+	case 2:	 return cmd < __BTRFS_SEND_C_MAX_V2;
+	default: return false;
+	}
+}
+
 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);

 static struct waiting_dir_move *
@@ -2720,19 +2732,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx)
 	if (S_ISDIR(sctx->cur_inode_mode)) {
 		ret = did_create_dir(sctx, sctx->cur_ino);
 		if (ret < 0)
-			goto out;
-		if (ret) {
-			ret = 0;
-			goto out;
-		}
+			return ret;
+		else if (ret > 0)
+			return 0;
 	}

-	ret = send_create_inode(sctx, sctx->cur_ino);
-	if (ret < 0)
-		goto out;
-
-out:
-	return ret;
+	return send_create_inode(sctx, sctx->cur_ino);
 }

 struct recorded_ref {
@@ -7276,6 +7281,17 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)

 	sctx->flags = arg->flags;

+	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
+		if (arg->version > BTRFS_SEND_STREAM_VERSION) {
+			ret = -EPROTO;
+			goto out;
+		}
+		/* Zero means "use the highest version" */
+		sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
+	} else {
+		sctx->proto = 1;
+	}
+
 	sctx->send_filp = fget(arg->send_fd);
 	if (!sctx->send_filp) {
 		ret = -EBADF;

--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
--- a/fs/inode.c
+++ b/fs/inode.c
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h