Commit 1f3a3e2a authored by Linus Torvalds

Merge tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "A few regular fixes and one fix for space reservation regression since
  6.7 that users have been reporting:

   - fix over-reservation of metadata chunks due to not keeping proper
     balance between global block reserve and delayed refs reserve; in
     practice this leaves behind empty metadata block groups, the
     workaround is to reclaim them by using the '-musage=1' balance
     filter

   - other space reservation fixes:
      - do not delete unused block group if it may be used soon
      - do not reserve space for checksums for NOCOW files

   - fix extent map assertion failure when writing out free space inode

   - reject encoded write if inode has nodatasum flag set

   - fix chunk map leak when loading block group zone info"

* tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: don't refill whole delayed refs block reserve when starting transaction
  btrfs: zoned: fix chunk map leak when loading block group zone info
  btrfs: reject encoded write if inode has nodatasum flag set
  btrfs: don't reserve space for checksums when writing to nocow files
  btrfs: add new unused block groups to the list of unused block groups
  btrfs: do not delete unused block group if it may be used soon
  btrfs: add and use helper to check if block group is used
  btrfs: don't drop extent_map for free space inode on write error
parents 91f842ff 2f6397e4
@@ -1455,6 +1455,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
+ LIST_HEAD(retry_list);
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
@@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
+ u64 used;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->reserved || block_group->pinned ||
-     block_group->used || block_group->ro ||
+ if (btrfs_is_block_group_used(block_group) || block_group->ro ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
@@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
+ /*
+  * The block group may be unused but there may be space reserved
+  * accounting with the existence of that block group, that is,
+  * space_info->bytes_may_use was incremented by a task but no
+  * space was yet allocated from the block group by the task.
+  * That space may or may not be allocated, as we are generally
+  * pessimistic about space reservation for metadata as well as
+  * for data when using compression (as we reserve space based on
+  * the worst case, when data can't be compressed, and before
+  * actually attempting compression, before starting writeback).
+  *
+  * So check if the total space of the space_info minus the size
+  * of this block group is less than the used space of the
+  * space_info - if that's the case, then it means we have tasks
+  * that might be relying on the block group in order to allocate
+  * extents, and add back the block group to the unused list when
+  * we finish, so that we retry later in case no tasks ended up
+  * needing to allocate extents from the block group.
+  */
+ used = btrfs_space_info_used(space_info, true);
+ if (space_info->total_bytes - block_group->length < used) {
+ /*
+  * Add a reference for the list, compensate for the ref
+  * drop under the "next" label for the
+  * fs_info->unused_bgs list.
+  */
+ btrfs_get_block_group(block_group);
+ list_add_tail(&block_group->bg_list, &retry_list);
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
@@ -1650,12 +1691,16 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -2684,6 +2729,37 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+ /*
+  * If the block group is still unused, add it to the list of
+  * unused block groups. The block group may have been created in
+  * order to satisfy a space reservation, in which case the
+  * extent allocation only happens later. But often we don't
+  * actually need to allocate space that we previously reserved,
+  * so the block group may become unused for a long time. For
+  * example for metadata we generally reserve space for a worst
+  * possible scenario, but then don't end up allocating all that
+  * space or none at all (due to no need to COW, extent buffers
+  * were already COWed in the current transaction and still
+  * unwritten, tree heights lower than the maximum possible
+  * height, etc). For data we generally reserve the exact amount
+  * of space we are going to allocate later, the exception is
+  * when using compression, as we must reserve space based on the
+  * uncompressed data size, because the compression is only done
+  * when writeback is triggered and we don't know how much space
+  * we are actually going to need, so we reserve the uncompressed
+  * size because the data may be uncompressible in the worst case.
+  */
+ if (ret == 0) {
+ bool used;
+ spin_lock(&block_group->lock);
+ used = btrfs_is_block_group_used(block_group);
+ spin_unlock(&block_group->lock);
+ if (!used)
+ btrfs_mark_bg_unused(block_group);
+ }
}
btrfs_trans_release_chunk_metadata(trans);
}
......
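For context, the heuristic added to btrfs_delete_unused_bgs() above can be read as a standalone predicate. A minimal sketch, assuming the helpers and fields shown in the hunk above (the wrapper function itself is hypothetical and not part of the commit):

/*
 * Illustrative sketch: decide whether an unused block group should be kept
 * (requeued on retry_list) because already-reserved space may still need it.
 */
static bool example_bg_may_be_needed_soon(struct btrfs_space_info *space_info,
                                          struct btrfs_block_group *bg)
{
        u64 used;

        /* Both locks are held at the point where the real check runs. */
        lockdep_assert_held(&space_info->lock);
        lockdep_assert_held(&bg->lock);

        /* Sum of used, reserved, pinned, etc., including bytes_may_use. */
        used = btrfs_space_info_used(space_info, true);

        /*
         * If removing this block group's capacity would leave less room than
         * what is already accounted as used or reserved, keep the block group.
         */
        return space_info->total_bytes - bg->length < used;
}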
@@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
return (block_group->start + block_group->length);
}
+ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
+ {
+ lockdep_assert_held(&bg->lock);
+ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+ }
static inline bool btrfs_is_block_group_data_only(
struct btrfs_block_group *block_group)
{
......
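The new helper asserts that bg->lock is held, so callers follow a lock/check/unlock pattern like the one added to btrfs_create_pending_block_groups() above. A minimal usage sketch, for illustration only (the wrapper function is hypothetical, not part of the commit):

static bool example_bg_is_empty(struct btrfs_block_group *bg)
{
        bool used;

        /* btrfs_is_block_group_used() must be called with bg->lock held. */
        spin_lock(&bg->lock);
        used = btrfs_is_block_group_used(bg);
        spin_unlock(&bg->lock);

        return !used;
}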
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 reserve_size = 0;
u64 qgroup_rsv_size = 0;
- u64 csum_leaves;
unsigned outstanding_extents;
lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
outstanding_extents);
reserve_size += btrfs_calc_metadata_size(fs_info, 1);
}
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
-     inode->csum_bytes);
- reserve_size += btrfs_calc_insert_metadata_size(fs_info,
-     csum_leaves);
+ if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
+ u64 csum_leaves;
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+ reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
+ }
/*
* For qgroup rsv, the calculation is very simple:
* account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
}
- static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+ static void calc_inode_reservations(struct btrfs_inode *inode,
u64 num_bytes, u64 disk_num_bytes,
u64 *meta_reserve, u64 *qgroup_reserve)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 nr_extents = count_max_extents(fs_info, num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+ u64 csum_leaves;
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ csum_leaves = 0;
+ else
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
nr_extents + csum_leaves);
@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ calc_inode_reservations(inode, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
noflush);
@@ -359,6 +366,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
nr_extents = count_max_extents(fs_info, num_bytes);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, nr_extents);
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -393,6 +401,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&inode->lock);
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
inode->csum_bytes -= num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
......
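Taken together, the delalloc-space.c changes mean the per-write metadata reservation no longer includes checksum leaves for NODATASUM (NOCOW) inodes. A rough sketch of the resulting calculation, using only the helpers visible in the hunks above; the wrapper function itself is hypothetical and ignores the qgroup reservation:

static u64 example_delalloc_meta_reserve(struct btrfs_inode *inode,
                                         u64 num_bytes, u64 disk_num_bytes)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u64 nr_extents = count_max_extents(fs_info, num_bytes);
        u64 csum_leaves = 0;

        /* No checksum items are created for NODATASUM inodes. */
        if (!(inode->flags & BTRFS_INODE_NODATASUM))
                csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);

        /* Extent and csum item insertions, plus one inode item update. */
        return btrfs_calc_insert_metadata_size(fs_info, nr_extents + csum_leaves) +
               btrfs_calc_metadata_size(fs_info, 1);
}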
@@ -3184,8 +3184,23 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop extent maps for the part of the extent we didn't write. */
- btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+ /*
+  * Drop extent maps for the part of the extent we didn't write.
+  *
+  * We have an exception here for the free_space_inode, this is
+  * because when we do btrfs_get_extent() on the free space inode
+  * we will search the commit root. If this is a new block group
+  * we won't find anything, and we will trip over the assert in
+  * writepage where we do ASSERT(em->block_start !=
+  * EXTENT_MAP_HOLE).
+  *
+  * Theoretically we could also skip this for any NOCOW extent as
+  * we don't mess with the extent map tree in the NOCOW case, but
+  * for now simply skip this if we are the free space inode.
+  */
+ if (!btrfs_is_free_space_inode(inode))
+ btrfs_drop_extent_map_range(inode, unwritten_start,
+     end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -10273,6 +10288,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
return -EINVAL;
+ /*
+  * Compressed extents should always have checksums, so error out if we
+  * have a NOCOW file or the inode was created while mounted with NODATASUM.
+  */
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ return -EINVAL;
orig_count = iov_iter_count(from);
/* The extent size must be sane. */
......
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
u64 num_bytes,
u64 *delayed_refs_bytes)
{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
- u64 extra_delayed_refs_bytes = 0;
- u64 bytes;
+ u64 bytes = num_bytes + *delayed_refs_bytes;
int ret;
- /*
-  * If there's a gap between the size of the delayed refs reserve and
-  * its reserved space, than some tasks have added delayed refs or bumped
-  * its size otherwise (due to block group creation or removal, or block
-  * group item update). Also try to allocate that gap in order to prevent
-  * using (and possibly abusing) the global reserve when committing the
-  * transaction.
-  */
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-     !btrfs_block_rsv_full(delayed_refs_rsv)) {
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
- extra_delayed_refs_bytes = delayed_refs_rsv->size -
-     delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
- }
- bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
/*
* We want to reserve all the bytes we may need all at once, so we only
* do 1 enospc flushing cycle per transaction start.
*/
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0) {
- if (extra_delayed_refs_bytes > 0)
- btrfs_migrate_to_delayed_refs_rsv(fs_info,
-     extra_delayed_refs_bytes);
- return 0;
- }
- if (extra_delayed_refs_bytes > 0) {
- bytes -= extra_delayed_refs_bytes;
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0)
- return 0;
- }
/*
* If we are an emergency flush, which can steal from the global block
* reserve, then attempt to not reserve space for the delayed refs, as
* we will consume space for them from the global block reserve.
*/
- if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
bytes -= *delayed_refs_bytes;
*delayed_refs_bytes = 0;
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
......
@@ -1670,6 +1670,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
bitmap_free(active);
kfree(zone_info);
+ btrfs_free_chunk_map(map);
return ret;
}
......