Commit f02bf857 authored by Linus Torvalds

Merge tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs zoned mode fixes from David Sterba:

 - fix deadlock when allocating system chunk

 - fix wrong mutex unlock on an error path

 - fix extent map splitting for append operation

 - update and fix message reporting unusable chunk space

 - don't block when background zone reclaim runs with balance in
   parallel

* tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
  btrfs: don't block if we can't acquire the reclaim lock
  btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
  btrfs: rework chunk allocation to avoid exhaustion of the system chunk array
  btrfs: fix deadlock with concurrent chunk allocations involving system chunks
  btrfs: zoned: print unusable percentage when reclaiming block groups
  btrfs: zoned: fix types for u64 division in btrfs_reclaim_bgs_work
parents 7fef2edf ea32af47
This diff is collapsed.
......@@ -97,6 +97,7 @@ struct btrfs_block_group {
unsigned int removed:1;
unsigned int to_copy:1;
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
int disk_cache_state;
......@@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 bytes_used, u64 type,
u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
......
......@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return 0;
}
/*
 * Wrapper around btrfs_alloc_tree_block() that allocates a block owned by
 * @root (owner id is root->root_key.objectid) and, for the extent, chunk,
 * device and free space trees, switches off trans->can_flush_pending_bgs for
 * the duration of the allocation.
 *
 * All remaining arguments are forwarded unchanged to btrfs_alloc_tree_block().
 * The value returned is whatever btrfs_alloc_tree_block() returned.
 */
static struct extent_buffer *alloc_tree_block_no_bg_flush(
		struct btrfs_trans_handle *trans, struct btrfs_root *root,
		u64 parent_start, const struct btrfs_disk_key *disk_key,
		int level, u64 hint, u64 empty_size,
		enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct extent_buffer *eb;
	bool defer_bg_flush;

	/*
	 * Finishing the creation of pending block groups
	 * (btrfs_create_pending_block_groups()) modifies the extent, chunk,
	 * device and free space trees, and it can be triggered when a new
	 * chunk gets allocated, which in turn can happen while allocating a
	 * tree block.
	 *
	 * When the block being allocated is for one of those very trees (COW
	 * of a node/leaf, split of a node/leaf, or insertion of a new root
	 * node), the caller already holds a lock on an extent buffer that
	 * btrfs_create_pending_block_groups() may later try to COW, so
	 * letting the flush run here could deadlock against ourselves.
	 * Postpone it in that case.
	 */
	defer_bg_flush = root == info->extent_root ||
			 root == info->chunk_root ||
			 root == info->dev_root ||
			 root == info->free_space_root;
	if (defer_bg_flush)
		trans->can_flush_pending_bgs = false;

	eb = btrfs_alloc_tree_block(trans, root, parent_start,
				    root->root_key.objectid, disk_key, level,
				    hint, empty_size, nest);

	/* Flushing is switched back on regardless of which tree was used. */
	trans->can_flush_pending_bgs = true;

	return eb;
}
/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
......@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
level, search_start, empty_size, nest);
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
......@@ -2458,9 +2416,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
root->node->start, 0,
BTRFS_NESTING_NEW_ROOT);
c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&lower_key, level, root->node->start, 0,
BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
......@@ -2589,8 +2547,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
c->start, 0, BTRFS_NESTING_SPLIT);
split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&disk_key, level, c->start, 0,
BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
......@@ -3381,10 +3340,10 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
* use BTRFS_NESTING_NEW_ROOT.
*/
right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
l->start, 0, num_doubles ?
BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&disk_key, 0, l->start, 0,
num_doubles ? BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
......
......@@ -2271,13 +2271,127 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
/*
* Split the extent_map found at [start, start + len] into two or three
* extent_maps.
*
* @inode: inode whose extent tree (inode->extent_tree) holds the extent_map
* @start: start of the range used to look up the extent_map
* @len:   length of the range; the extent_map found must have exactly this
*         length (asserted)
* @pre:   length of the piece to carve off the front, 0 for none
* @post:  length of the piece to carve off the end, 0 for none
*
* The resulting layout is [pre][middle][post] when both are non-zero,
* [pre][rest] when only @pre is set and [rest][post] when only @post is set.
* The original extent_map is replaced in the tree by the first piece and the
* remaining pieces are added after it.
*
* Returns 0 on success (also when @pre and @post are both 0, in which case
* nothing is done), -ENOMEM if a new extent_map cannot be allocated and -EIO
* if no extent_map is found for the range.
*
* This function is intended to be used only for extract_ordered_extent().
*/
static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
u64 pre, u64 post)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
struct extent_map *split_pre = NULL;
struct extent_map *split_mid = NULL;
struct extent_map *split_post = NULL;
int ret = 0;
int modified;
unsigned long flags;
/* Sanity check */
if (pre == 0 && post == 0)
return 0;
/*
* Allocate every extent_map that will be needed before taking any lock.
* split_pre is always used (it becomes the first piece); split_mid is only
* needed when there is a front piece, split_post only when there is a tail
* piece.
*/
split_pre = alloc_extent_map();
if (pre)
split_mid = alloc_extent_map();
if (post)
split_post = alloc_extent_map();
if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
ret = -ENOMEM;
goto out;
}
/* The pieces carved off must leave a non-empty remainder. */
ASSERT(pre + post < len);
/*
* Lock the range in the inode's io_tree and take the extent map tree lock
* for writing; both are released at out_unlock.
*/
lock_extent(&inode->io_tree, start, start + len - 1);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
ret = -EIO;
goto out_unlock;
}
ASSERT(em->len == len);
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
/*
* Snapshot the flags for the new extent_maps. EXTENT_FLAG_PINNED is cleared
* on the old extent_map itself, after the snapshot was taken, so the
* snapshot still carries it if it was set. EXTENT_FLAG_LOGGING is cleared in
* the snapshot only.
* NOTE(review): presumably PINNED is cleared on the old map so that
* replace_extent_mapping() accepts it - confirm against extent_map.c.
*/
flags = em->flags;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &flags);
/*
* Remember whether the old extent_map is linked on a list (presumably the
* tree's modified extents list - confirm); the same value is passed to
* replace_extent_mapping() and add_extent_mapping() for the new pieces.
*/
modified = !list_empty(&em->list);
/* First, replace the em with a new extent_map starting from em->start */
split_pre->start = em->start;
split_pre->len = (pre ? pre : em->len - post);
split_pre->orig_start = split_pre->start;
split_pre->block_start = em->block_start;
/* block_len, orig_block_len and ram_bytes all equal len for every new piece. */
split_pre->block_len = split_pre->len;
split_pre->orig_block_len = split_pre->block_len;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
replace_extent_mapping(em_tree, em, split_pre, modified);
/*
* Now we only have an extent_map at:
* [em->start, em->start + pre] if pre != 0
* [em->start, em->start + em->len - post] if pre == 0
*/
if (pre) {
/* Insert the middle extent_map */
split_mid->start = em->start + pre;
split_mid->len = em->len - pre - post;
split_mid->orig_start = split_mid->start;
split_mid->block_start = em->block_start + pre;
split_mid->block_len = split_mid->len;
split_mid->orig_block_len = split_mid->block_len;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
add_extent_mapping(em_tree, split_mid, modified);
}
if (post) {
/* Insert the tail extent_map covering the last @post of the old range */
split_post->start = em->start + em->len - post;
split_post->len = post;
split_post->orig_start = split_post->start;
split_post->block_start = em->block_start + em->len - post;
split_post->block_len = split_post->len;
split_post->orig_block_len = split_post->block_len;
split_post->ram_bytes = split_post->len;
split_post->flags = flags;
split_post->compress_type = em->compress_type;
split_post->generation = em->generation;
add_extent_mapping(em_tree, split_post, modified);
}
/* Once for us */
free_extent_map(em);
/* Once for the tree */
free_extent_map(em);
out_unlock:
write_unlock(&em_tree->lock);
unlock_extent(&inode->io_tree, start, start + len - 1);
out:
/*
* Drop our references on the new extent_maps. Some of these pointers can
* still be NULL here (unused piece or failed allocation), so
* free_extent_map() is expected to accept NULL - confirm.
*/
free_extent_map(split_pre);
free_extent_map(split_mid);
free_extent_map(split_post);
return ret;
}
static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
struct bio *bio, loff_t file_offset)
{
struct btrfs_ordered_extent *ordered;
struct extent_map *em = NULL, *em_new = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 file_len;
u64 len = bio->bi_iter.bi_size;
u64 end = start + len;
u64 ordered_end;
......@@ -2317,41 +2431,16 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
goto out;
}
file_len = ordered->num_bytes;
pre = start - ordered->disk_bytenr;
post = ordered_end - end;
ret = btrfs_split_ordered_extent(ordered, pre, post);
if (ret)
goto out;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
if (!em) {
read_unlock(&em_tree->lock);
ret = -EIO;
goto out;
}
read_unlock(&em_tree->lock);
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
/*
* We cannot reuse em_new here but have to create a new one, as
* unpin_extent_cache() expects the start of the extent map to be the
* logical offset of the file, which does not hold true anymore after
* splitting.
*/
em_new = create_io_em(inode, em->start + pre, len,
em->start + pre, em->block_start + pre, len,
len, len, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_REGULAR);
if (IS_ERR(em_new)) {
ret = PTR_ERR(em_new);
goto out;
}
free_extent_map(em_new);
ret = split_zoned_em(inode, file_offset, file_len, pre, post);
out:
free_extent_map(em);
btrfs_put_ordered_extent(ordered);
return errno_to_blk_status(ret);
......
......@@ -254,23 +254,21 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
}
/*
* To be called after all the new block groups attached to the transaction
* handle have been created (btrfs_create_pending_block_groups()).
* To be called after doing the chunk btree updates right after allocating a new
* chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
* chunk after all chunk btree updates and after finishing the second phase of
* chunk allocation (btrfs_create_pending_block_groups()) in case some block
* group had its chunk item insertion delayed to the second phase.
*/
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
	struct btrfs_transaction *txn = trans->transaction;
	struct btrfs_fs_info *info = trans->fs_info;

	/* This handle holds no chunk reservation, so there is nothing to undo. */
	if (trans->chunk_bytes_reserved == 0)
		return;

	/* Warn (once) if the handle still has block groups on its new_bgs list. */
	WARN_ON_ONCE(!list_empty(&trans->new_bgs));

	/* Hand the handle's reservation back to the chunk block reserve. */
	btrfs_block_rsv_release(info, &info->chunk_block_rsv,
				trans->chunk_bytes_reserved, NULL);

	/*
	 * Take the same amount off the transaction-wide counter and wake up
	 * anyone waiting on chunk_reserve_wait.
	 */
	atomic64_sub(trans->chunk_bytes_reserved, &txn->chunk_bytes_reserved);
	cond_wake_up(&txn->chunk_reserve_wait);

	trans->chunk_bytes_reserved = 0;
}
......@@ -386,8 +384,6 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
spin_lock_init(&cur_trans->dropped_roots_lock);
INIT_LIST_HEAD(&cur_trans->releasing_ebs);
spin_lock_init(&cur_trans->releasing_ebs_lock);
atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
init_waitqueue_head(&cur_trans->chunk_reserve_wait);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
......@@ -701,7 +697,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
h->fs_info = root->fs_info;
h->type = type;
h->can_flush_pending_bgs = true;
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
......
......@@ -96,13 +96,6 @@ struct btrfs_transaction {
spinlock_t releasing_ebs_lock;
struct list_head releasing_ebs;
/*
* The number of bytes currently reserved, by all transaction handles
* attached to this transaction, for metadata extents of the chunk tree.
*/
atomic64_t chunk_bytes_reserved;
wait_queue_head_t chunk_reserve_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
......@@ -139,7 +132,7 @@ struct btrfs_trans_handle {
short aborted;
bool adding_csums;
bool allocating_chunk;
bool can_flush_pending_bgs;
bool removing_chunk;
bool reloc_reserved;
bool in_fsync;
struct btrfs_root *root;
......
......@@ -3173,7 +3173,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->tree_root->log_mutex);
goto out;
}
}
......
This diff is collapsed.
......@@ -450,7 +450,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num);
......@@ -509,6 +510,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment