Commit 1263a7bf authored by Linus Torvalds

Merge tag 'for-6.11-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - followup fix for direct io and fsync under some conditions, reported
   by QEMU users

 - fix a potential leak when disabling quotas while some extent tracking
   work can still happen

 - in zoned mode handle unexpected change of zone write pointer in
   RAID1-like block groups, turn the zones to read-only

* tag 'for-6.11-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix race between direct IO write and fsync when using same fd
  btrfs: zoned: handle broken write pointer on zones
  btrfs: qgroup: don't use extent changeset when not needed
parents d8abb73f cd9253c2
......@@ -459,7 +459,6 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
......
......@@ -864,13 +864,6 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
} else {
struct btrfs_file_private stack_private = { 0 };
struct btrfs_file_private *private;
const bool have_private = (file->private_data != NULL);
if (!have_private)
file->private_data = &stack_private;
/*
* If we have a synchronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
......@@ -879,13 +872,10 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
private = file->private_data;
private->fsync_skip_inode_lock = true;
ASSERT(current->journal_info == NULL);
current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
ret = iomap_dio_complete(dio);
private->fsync_skip_inode_lock = false;
if (!have_private)
file->private_data = NULL;
current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
......
......@@ -1603,7 +1603,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
......@@ -1613,7 +1612,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
bool skip_ilock = false;
if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
skip_ilock = true;
current->journal_info = NULL;
lockdep_assert_held(&inode->vfs_inode.i_rwsem);
}
trace_btrfs_sync_file(file, datasync);
......
......@@ -4346,10 +4346,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
extent_changeset_init(&changeset);
return clear_record_extent_bits(&inode->io_tree, start,
start + len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
EXTENT_QGROUP_RESERVED, NULL);
}
/* In release case, we shouldn't have @reserved */
......
......@@ -27,6 +27,12 @@ struct btrfs_root_item;
struct btrfs_root;
struct btrfs_path;
/*
* Signal that a direct IO write is in progress, to avoid deadlock for sync
* direct IO writes when fsync is called during the direct IO write path.
*/
#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
/* Radix-tree tag for roots that are part of the transaction. */
#define BTRFS_ROOT_TRANS_TAG 0
......
......@@ -1406,6 +1406,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
return -EINVAL;
}
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
......@@ -1432,7 +1434,6 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
}
bg->alloc_offset = zone_info[0].alloc_offset;
bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
return 0;
}
......@@ -1450,6 +1451,9 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
return -EINVAL;
}
/* In case a device is missing we have a cap of 0, so don't use it. */
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
for (i = 0; i < map->num_stripes; i++) {
if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
zone_info[i].alloc_offset == WP_CONVENTIONAL)
......@@ -1471,9 +1475,6 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
if (test_bit(0, active))
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
/* In case a device is missing we have a cap of 0, so don't use it. */
bg->zone_capacity = min_not_zero(zone_info[0].capacity,
zone_info[1].capacity);
}
if (zone_info[0].alloc_offset != WP_MISSING_DEV)
......@@ -1563,6 +1564,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
u64 profile;
if (!btrfs_is_zoned(fs_info))
return 0;
......@@ -1623,7 +1625,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
}
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
switch (profile) {
case 0: /* single */
ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
......@@ -1650,6 +1653,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
profile != BTRFS_BLOCK_GROUP_RAID10) {
/*
* Detected broken write pointer. Make this block group
* unallocatable by setting the allocation pointer at the end of
* allocatable region. Relocating this block group will fix the
* mismatch.
*
* Currently, we cannot handle RAID0 or RAID10 case like this
* because we don't have a proper zone_capacity value. But,
* reading from this block group won't work anyway by a missing
* stripe.
*/
cache->alloc_offset = cache->zone_capacity;
ret = 0;
}
out:
/* Reject non SINGLE data profiles without RST */
if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment