Commit ce4c854e authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.18-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - prevent deleting subvolume with active swapfile

 - fix qgroup reserve limit calculation overflow

 - remove device count in superblock and its item in one transaction so
   they can't get out of sync

 - skip defragmenting an isolated sector, this could cause some extra IO

 - unify handling of mtime/permissions in hole punch with fallocate

 - zoned mode fixes:
     - remove assert checking for only single mode, we have the
       DUP mode implemented
     - fix potential lockdep warning while traversing devices
       when checking for zone activation

* tag 'for-5.18-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: prevent subvol with swapfile from being deleted
  btrfs: do not warn for free space inode in cow_file_range
  btrfs: avoid defragging extents whose next extents are not targets
  btrfs: fix fallocate to use file_modified to update permissions consistently
  btrfs: remove device item and update super block in the same transaction
  btrfs: fix qgroup reserve overflow the qgroup limit
  btrfs: zoned: remove left over ASSERT checking for single profile
  btrfs: zoned: traverse devices under chunk_mutex in btrfs_can_activate_zone
parents 31231092 60021bd7
...@@ -118,7 +118,7 @@ struct btrfs_bio_ctrl { ...@@ -118,7 +118,7 @@ struct btrfs_bio_ctrl {
*/ */
struct extent_changeset { struct extent_changeset {
/* How many bytes are set/cleared in this operation */ /* How many bytes are set/cleared in this operation */
unsigned int bytes_changed; u64 bytes_changed;
/* Changed ranges */ /* Changed ranges */
struct ulist range_changed; struct ulist range_changed;
......
...@@ -2957,8 +2957,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ...@@ -2957,8 +2957,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
return ret; return ret;
} }
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{ {
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL; struct extent_state *cached_state = NULL;
...@@ -2990,6 +2991,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ...@@ -2990,6 +2991,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex; goto out_only_mutex;
} }
ret = file_modified(file);
if (ret)
goto out_only_mutex;
lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len, lockend = round_down(offset + len,
btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
...@@ -3430,7 +3435,7 @@ static long btrfs_fallocate(struct file *file, int mode, ...@@ -3430,7 +3435,7 @@ static long btrfs_fallocate(struct file *file, int mode,
return -EOPNOTSUPP; return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE) if (mode & FALLOC_FL_PUNCH_HOLE)
return btrfs_punch_hole(inode, offset, len); return btrfs_punch_hole(file, offset, len);
/* /*
* Only trigger disk allocation, don't trigger qgroup reserve * Only trigger disk allocation, don't trigger qgroup reserve
...@@ -3452,6 +3457,10 @@ static long btrfs_fallocate(struct file *file, int mode, ...@@ -3452,6 +3457,10 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out; goto out;
} }
ret = file_modified(file);
if (ret)
goto out;
/* /*
* TODO: Move these two operations after we have checked * TODO: Move these two operations after we have checked
* accurate reserved space, or fallocate can still fail but * accurate reserved space, or fallocate can still fail but
......
...@@ -1128,7 +1128,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ...@@ -1128,7 +1128,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
int ret = 0; int ret = 0;
if (btrfs_is_free_space_inode(inode)) { if (btrfs_is_free_space_inode(inode)) {
WARN_ON_ONCE(1);
ret = -EINVAL; ret = -EINVAL;
goto out_unlock; goto out_unlock;
} }
...@@ -4488,6 +4487,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) ...@@ -4488,6 +4487,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid); dest->root_key.objectid);
return -EPERM; return -EPERM;
} }
if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
root->root_key.objectid);
return -EPERM;
}
root_flags = btrfs_root_flags(&dest->root_item); root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item, btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD); root_flags | BTRFS_ROOT_SUBVOL_DEAD);
...@@ -11107,8 +11113,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ...@@ -11107,8 +11113,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
* set. We use this counter to prevent snapshots. We must increment it * set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent * before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents. * snapshot to run after we've already checked the extents.
*
* It is possible that subvolume is marked for deletion but still not
* removed yet. To prevent this race, we check the root status before
* activating the swapfile.
*/ */
spin_lock(&root->root_item_lock);
if (btrfs_root_dead(root)) {
spin_unlock(&root->root_item_lock);
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
root->root_key.objectid);
return -EPERM;
}
atomic_inc(&root->nr_swapfiles); atomic_inc(&root->nr_swapfiles);
spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
......
...@@ -1239,7 +1239,7 @@ static u32 get_extent_max_capacity(const struct extent_map *em) ...@@ -1239,7 +1239,7 @@ static u32 get_extent_max_capacity(const struct extent_map *em)
} }
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
bool locked) u32 extent_thresh, u64 newer_than, bool locked)
{ {
struct extent_map *next; struct extent_map *next;
bool ret = false; bool ret = false;
...@@ -1249,11 +1249,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, ...@@ -1249,11 +1249,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
return false; return false;
/* /*
* We want to check if the next extent can be merged with the current * Here we need to pass @newer_then when checking the next extent, or
* one, which can be an extent created in a past generation, so we pass * we will hit a case we mark current extent for defrag, but the next
* a minimum generation of 0 to defrag_lookup_extent(). * one will not be a target.
* This will just cause extra IO without really reducing the fragments.
*/ */
next = defrag_lookup_extent(inode, em->start + em->len, 0, locked); next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
/* No more em or hole */ /* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
goto out; goto out;
...@@ -1265,6 +1266,13 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, ...@@ -1265,6 +1266,13 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
*/ */
if (next->len >= get_extent_max_capacity(em)) if (next->len >= get_extent_max_capacity(em))
goto out; goto out;
/* Skip older extent */
if (next->generation < newer_than)
goto out;
/* Also check extent size */
if (next->len >= extent_thresh)
goto out;
ret = true; ret = true;
out: out:
free_extent_map(next); free_extent_map(next);
...@@ -1470,7 +1478,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, ...@@ -1470,7 +1478,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto next; goto next;
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
locked); extent_thresh, newer_than, locked);
if (!next_mergeable) { if (!next_mergeable) {
struct defrag_target_range *last; struct defrag_target_range *last;
......
...@@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path) ...@@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path)
path_put(&path); path_put(&path);
} }
static int btrfs_rm_dev_item(struct btrfs_device *device) static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{ {
struct btrfs_root *root = device->fs_info->chunk_root; struct btrfs_root *root = device->fs_info->chunk_root;
int ret; int ret;
struct btrfs_path *path; struct btrfs_path *path;
struct btrfs_key key; struct btrfs_key key;
struct btrfs_trans_handle *trans;
path = btrfs_alloc_path(); path = btrfs_alloc_path();
if (!path) if (!path)
return -ENOMEM; return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY; key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid; key.offset = device->devid;
...@@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) ...@@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
if (ret) { if (ret) {
if (ret > 0) if (ret > 0)
ret = -ENOENT; ret = -ENOENT;
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out; goto out;
} }
ret = btrfs_del_item(trans, root, path); ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
}
out: out:
btrfs_free_path(path); btrfs_free_path(path);
if (!ret)
ret = btrfs_commit_transaction(trans);
return ret; return ret;
} }
...@@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ...@@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args, struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode) struct block_device **bdev, fmode_t *mode)
{ {
struct btrfs_trans_handle *trans;
struct btrfs_device *device; struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
...@@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ...@@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret) if (ret)
goto out; return ret;
device = btrfs_find_device(fs_info->fs_devices, args); device = btrfs_find_device(fs_info->fs_devices, args);
if (!device) { if (!device) {
...@@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ...@@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else else
ret = -ENOENT; ret = -ENOENT;
goto out; return ret;
} }
if (btrfs_pinned_by_swapfile(fs_info, device)) { if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info, btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile", "cannot remove device %s (devid %llu) due to active swapfile",
rcu_str_deref(device->name), device->devid); rcu_str_deref(device->name), device->devid);
ret = -ETXTBSY; return -ETXTBSY;
goto out;
} }
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
ret = BTRFS_ERROR_DEV_TGT_REPLACE; return BTRFS_ERROR_DEV_TGT_REPLACE;
goto out;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
fs_info->fs_devices->rw_devices == 1) { fs_info->fs_devices->rw_devices == 1)
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; return BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto out;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex); mutex_lock(&fs_info->chunk_mutex);
...@@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ...@@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
if (ret) if (ret)
goto error_undo; goto error_undo;
/* trans = btrfs_start_transaction(fs_info->chunk_root, 0);
* TODO: the superblock still includes this device in its num_devices if (IS_ERR(trans)) {
* counter although write_all_supers() is not locked out. This ret = PTR_ERR(trans);
* could give a filesystem state which requires a degraded mount.
*/
ret = btrfs_rm_dev_item(device);
if (ret)
goto error_undo; goto error_undo;
}
ret = btrfs_rm_dev_item(trans, device);
if (ret) {
/* Any error in dev item removal is critical */
btrfs_crit(fs_info,
"failed to remove device item for devid %llu: %d",
device->devid, ret);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device); btrfs_scrub_cancel_dev(device);
...@@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ...@@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
free_fs_devices(cur_devices); free_fs_devices(cur_devices);
} }
out: ret = btrfs_commit_transaction(trans);
return ret; return ret;
error_undo: error_undo:
...@@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ...@@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
device->fs_devices->rw_devices++; device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex);
} }
goto out; return ret;
} }
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
......
...@@ -1801,7 +1801,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, ...@@ -1801,7 +1801,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
map = em->map_lookup; map = em->map_lookup;
/* We only support single profile for now */ /* We only support single profile for now */
ASSERT(map->num_stripes == 1);
device = map->stripes[0].dev; device = map->stripes[0].dev;
free_extent_map(em); free_extent_map(em);
...@@ -1976,18 +1975,16 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) ...@@ -1976,18 +1975,16 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{ {
struct btrfs_fs_info *fs_info = fs_devices->fs_info;
struct btrfs_device *device; struct btrfs_device *device;
bool ret = false; bool ret = false;
if (!btrfs_is_zoned(fs_devices->fs_info)) if (!btrfs_is_zoned(fs_info))
return true; return true;
/* Non-single profiles are not supported yet */
ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0);
/* Check if there is a device with active zones left */ /* Check if there is a device with active zones left */
mutex_lock(&fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) { list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
struct btrfs_zoned_device_info *zinfo = device->zone_info; struct btrfs_zoned_device_info *zinfo = device->zone_info;
if (!device->bdev) if (!device->bdev)
...@@ -1999,7 +1996,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) ...@@ -1999,7 +1996,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
break; break;
} }
} }
mutex_unlock(&fs_devices->device_list_mutex); mutex_unlock(&fs_info->chunk_mutex);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment