Commit e4fc196f authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-6.11-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - fix regression in extent map rework when handling insertion of
   overlapping compressed extent

 - fix unexpected file length when appending to a file using direct io
   and buffer not faulted in

 - in zoned mode, fix accounting of unusable space when flipping
   read-only block group back to read-write

 - fix page locking when COWing an inline range, assertion failure found
   by syzbot

 - fix calculation of space info in debugging print

 - tree-checker, add validation of data reference item

 - fix a few -Wmaybe-uninitialized build warnings

* tag 'for-6.11-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: initialize location to fix -Wmaybe-uninitialized in btrfs_lookup_dentry()
  btrfs: fix corruption after buffer fault in during direct IO append write
  btrfs: zoned: fix zone_unusable accounting on making block group read-write again
  btrfs: do not subtract delalloc from avail bytes
  btrfs: make cow_file_range_inline() honor locked_page on error
  btrfs: fix corrupt read due to bad offset of a compressed extent map
  btrfs: tree-checker: validate dref root and objectid
parents e254e0c5 b8e947e9
......@@ -1223,8 +1223,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->space_info->total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
block_group->space_info->bytes_zone_unusable -=
block_group->zone_unusable;
btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info,
-block_group->zone_unusable);
block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
......@@ -1396,7 +1396,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes to readonly */
sinfo->bytes_readonly += cache->zone_unusable;
sinfo->bytes_zone_unusable -= cache->zone_unusable;
btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
-cache->zone_unusable);
cache->zone_unusable = 0;
}
cache->ro++;
......@@ -3056,9 +3057,11 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
cache->zone_unusable =
(cache->alloc_offset - cache->used) +
(cache->alloc_offset - cache->used - cache->pinned -
cache->reserved) +
(cache->length - cache->zone_capacity);
sinfo->bytes_zone_unusable += cache->zone_unusable;
btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
cache->zone_unusable);
sinfo->bytes_readonly -= cache->zone_unusable;
}
num_bytes = cache->length - cache->reserved -
......
......@@ -459,6 +459,7 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
......
......@@ -856,21 +856,37 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* So here we disable page faults in the iov_iter and then retry if we
* got -EFAULT, faulting in the pages before the retry.
*/
again:
from->nofault = true;
dio = btrfs_dio_write(iocb, from, written);
from->nofault = false;
/*
* iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
* iocb, and that needs to lock the inode. So unlock it before calling
* iomap_dio_complete() to avoid a deadlock.
*/
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
if (IS_ERR_OR_NULL(dio))
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
else
} else {
struct btrfs_file_private stack_private = { 0 };
struct btrfs_file_private *private;
const bool have_private = (file->private_data != NULL);
if (!have_private)
file->private_data = &stack_private;
/*
* If we have a synchronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
* deadlock on the inode lock - we are already holding it and we
* can't call it after unlocking because we may need to complete
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
private = file->private_data;
private->fsync_skip_inode_lock = true;
ret = iomap_dio_complete(dio);
private->fsync_skip_inode_lock = false;
if (!have_private)
file->private_data = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
if (ret > 0)
......@@ -897,10 +913,12 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
} else {
fault_in_iov_iter_readable(from, left);
prev_left = left;
goto relock;
goto again;
}
}
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
/*
* If 'ret' is -ENOTBLK or we have not written all data, then it means
* we must fallback to buffered IO.
......
......@@ -2793,7 +2793,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
readonly = true;
} else if (btrfs_is_zoned(fs_info)) {
/* Need reset before reusing in a zoned block group */
space_info->bytes_zone_unusable += len;
btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
len);
readonly = true;
}
spin_unlock(&cache->lock);
......
......@@ -664,7 +664,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
start_diff = start - em->start;
em->start = start;
em->len = end - start;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE && !extent_map_is_compressed(em))
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
em->offset += start_diff;
return add_extent_mapping(inode, em, 0);
}
......
......@@ -1603,6 +1603,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
......@@ -1612,6 +1613,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
trace_btrfs_sync_file(file, datasync);
......@@ -1639,7 +1641,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
if (skip_ilock)
down_write(&inode->i_mmap_lock);
else
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
......@@ -1663,7 +1668,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (skip_ilock)
up_write(&inode->i_mmap_lock);
else
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
......@@ -1788,7 +1796,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (skip_ilock)
up_write(&inode->i_mmap_lock);
else
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
......
......@@ -2723,8 +2723,10 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
* If the block group is read-only, we should account freed space into
* bytes_readonly.
*/
if (!block_group->ro)
if (!block_group->ro) {
block_group->zone_unusable += to_unusable;
WARN_ON(block_group->zone_unusable > block_group->length);
}
spin_unlock(&ctl->tree_lock);
if (!used) {
spin_lock(&block_group->lock);
......
......@@ -714,8 +714,9 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offse
return ret;
}
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
u64 end,
static noinline int cow_file_range_inline(struct btrfs_inode *inode,
struct page *locked_page,
u64 offset, u64 end,
size_t compressed_size,
int compress_type,
struct folio *compressed_folio,
......@@ -739,7 +740,10 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
return ret;
}
extent_clear_unlock_delalloc(inode, offset, end, NULL, &cached,
if (ret == 0)
locked_page = NULL;
extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached,
clear_flags,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
......@@ -1043,10 +1047,10 @@ static void compress_file_range(struct btrfs_work *work)
* extent for the subpage case.
*/
if (total_in < actual_end)
ret = cow_file_range_inline(inode, start, end, 0,
ret = cow_file_range_inline(inode, NULL, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
else
ret = cow_file_range_inline(inode, start, end, total_compressed,
ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
compress_type, folios[0], false);
if (ret <= 0) {
if (ret < 0)
......@@ -1359,7 +1363,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (!no_inline) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, start, end, 0,
ret = cow_file_range_inline(inode, locked_page, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
if (ret <= 0) {
/*
......@@ -5660,7 +5664,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location;
struct btrfs_key location = { 0 };
u8 di_type = 0;
int ret = 0;
......
......@@ -316,7 +316,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
found->bytes_used += block_group->used;
found->disk_used += block_group->used * factor;
found->bytes_readonly += block_group->bytes_super;
found->bytes_zone_unusable += block_group->zone_unusable;
btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable);
if (block_group->length > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
......@@ -583,8 +583,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
spin_lock(&cache->lock);
avail = cache->length - cache->used - cache->pinned -
cache->reserved - cache->delalloc_bytes -
cache->bytes_super - cache->zone_unusable;
cache->reserved - cache->bytes_super - cache->zone_unusable;
btrfs_info(fs_info,
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s",
cache->start, cache->length, cache->used, cache->pinned,
......
......@@ -249,6 +249,7 @@ btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable");
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
......
......@@ -900,6 +900,102 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
return ret;
}
/*
* Test a regression for compressed extent map adjustment when we attempt to
* add an extent map that is partially overlapped by another existing extent
* map. The resulting extent map offset was left unchanged despite having
* incremented its start offset.
*/
static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
int ret;
int ret2;
em = alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
}
/* Compressed extent for the file range [120K, 128K). */
em->start = SZ_1K * 120;
em->len = SZ_8K;
em->disk_num_bytes = SZ_4K;
em->ram_bytes = SZ_8K;
em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
free_extent_map(em);
if (ret < 0) {
test_err("couldn't add extent map for range [120K, 128K)");
goto out;
}
em = alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
goto out;
}
/*
* Compressed extent for the file range [108K, 144K), which overlaps
* with the [120K, 128K) we previously inserted.
*/
em->start = SZ_1K * 108;
em->len = SZ_1K * 36;
em->disk_num_bytes = SZ_4K;
em->ram_bytes = SZ_1K * 36;
em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
/*
* Try to add the extent map but with a search range of [140K, 144K),
* this should succeed and adjust the extent map to the range
* [128K, 144K), with a length of 16K and an offset of 20K.
*
* This simulates a scenario where in the subvolume tree of an inode we
* have a compressed file extent item for the range [108K, 144K) and we
* have an overlapping compressed extent map for the range [120K, 128K),
* which was created by an encoded write, but its ordered extent was not
* yet completed, so the subvolume tree doesn't have yet the file extent
* item for that range - we only have the extent map in the inode's
* extent map tree.
*/
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K);
write_unlock(&em_tree->lock);
free_extent_map(em);
if (ret < 0) {
test_err("couldn't add extent map for range [108K, 144K)");
goto out;
}
if (em->start != SZ_128K) {
test_err("unexpected extent map start %llu (should be 128K)", em->start);
ret = -EINVAL;
goto out;
}
if (em->len != SZ_16K) {
test_err("unexpected extent map length %llu (should be 16K)", em->len);
ret = -EINVAL;
goto out;
}
if (em->offset != SZ_1K * 20) {
test_err("unexpected extent map offset %llu (should be 20K)", em->offset);
ret = -EINVAL;
goto out;
}
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
ret = ret2;
return ret;
}
struct rmap_test_vector {
u64 raid_type;
u64 physical_start;
......@@ -1076,6 +1172,9 @@ int btrfs_test_extent_map(void)
if (ret)
goto out;
ret = test_case_7(fs_info, BTRFS_I(inode));
if (ret)
goto out;
ret = test_case_8(fs_info, BTRFS_I(inode));
if (ret)
goto out;
......
......@@ -1289,6 +1289,19 @@ static void extent_err(const struct extent_buffer *eb, int slot,
va_end(args);
}
static bool is_valid_dref_root(u64 rootid)
{
/*
* The following tree root objectids are allowed to have a data backref:
* - subvolume trees
* - data reloc tree
* - tree root
* For v1 space cache
*/
return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
rootid == BTRFS_ROOT_TREE_OBJECTID;
}
static int check_extent_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot,
struct btrfs_key *prev_key)
......@@ -1441,6 +1454,8 @@ static int check_extent_item(struct extent_buffer *leaf,
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
u64 seq;
u64 dref_root;
u64 dref_objectid;
u64 dref_offset;
u64 inline_offset;
u8 inline_type;
......@@ -1484,11 +1499,26 @@ static int check_extent_item(struct extent_buffer *leaf,
*/
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
dref_root = btrfs_extent_data_ref_root(leaf, dref);
dref_objectid = btrfs_extent_data_ref_objectid(leaf, dref);
dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
seq = hash_extent_data_ref(
btrfs_extent_data_ref_root(leaf, dref),
btrfs_extent_data_ref_objectid(leaf, dref),
btrfs_extent_data_ref_offset(leaf, dref));
if (unlikely(!is_valid_dref_root(dref_root))) {
extent_err(leaf, slot,
"invalid data ref root value %llu",
dref_root);
return -EUCLEAN;
}
if (unlikely(dref_objectid < BTRFS_FIRST_FREE_OBJECTID ||
dref_objectid > BTRFS_LAST_FREE_OBJECTID)) {
extent_err(leaf, slot,
"invalid data ref objectid value %llu",
dref_root);
return -EUCLEAN;
}
if (unlikely(!IS_ALIGNED(dref_offset,
fs_info->sectorsize))) {
extent_err(leaf, slot,
......@@ -1627,6 +1657,8 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
return -EUCLEAN;
}
for (; ptr < end; ptr += sizeof(*dref)) {
u64 root;
u64 objectid;
u64 offset;
/*
......@@ -1634,7 +1666,22 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
* overflow from the leaf due to hash collisions.
*/
dref = (struct btrfs_extent_data_ref *)ptr;
root = btrfs_extent_data_ref_root(leaf, dref);
objectid = btrfs_extent_data_ref_objectid(leaf, dref);
offset = btrfs_extent_data_ref_offset(leaf, dref);
if (unlikely(!is_valid_dref_root(root))) {
extent_err(leaf, slot,
"invalid extent data backref root value %llu",
root);
return -EUCLEAN;
}
if (unlikely(objectid < BTRFS_FIRST_FREE_OBJECTID ||
objectid > BTRFS_LAST_FREE_OBJECTID)) {
extent_err(leaf, slot,
"invalid extent data backref objectid value %llu",
root);
return -EUCLEAN;
}
if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid extent data backref offset, have %llu expect aligned to %u",
......
......@@ -2383,6 +2383,14 @@ DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned,
TP_ARGS(fs_info, sinfo, old, diff)
);
DEFINE_EVENT(btrfs__space_info_update, update_bytes_zone_unusable,
TP_PROTO(const struct btrfs_fs_info *fs_info,
const struct btrfs_space_info *sinfo, u64 old, s64 diff),
TP_ARGS(fs_info, sinfo, old, diff)
);
DECLARE_EVENT_CLASS(btrfs_raid56_bio,
TP_PROTO(const struct btrfs_raid_bio *rbio,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment