Commit 82708bb1 authored by Linus Torvalds

Merge tag 'for-5.19-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - zoned relocation fixes:
      - fix critical section end for extent writeback; this could lead
        to out-of-order writes
      - prevent writing to previous data relocation block group if space
        gets low

 - reflink fixes:
      - fix race between reflinking and ordered extent completion
      - proper error handling when block reserve migration fails
      - add missing inode iversion/mtime/ctime updates on each iteration
        when replacing extents

 - fix deadlock when running fsync/fiemap/commit at the same time

 - fix a false-positive KCSAN data race report regarding pid tracking
   for read locks

 - minor documentation update and link to the new documentation site

* tag 'for-5.19-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  Documentation: update btrfs list of features and link to readthedocs.io
  btrfs: fix deadlock with fsync+fiemap+transaction commit
  btrfs: don't set lock_owner when locking extent buffer for reading
  btrfs: zoned: fix critical section of relocation inode writeback
  btrfs: zoned: prevent allocation from previous data relocation BG
  btrfs: do not BUG_ON() on failure to migrate space when replacing extents
  btrfs: add missing inode updates on each iteration when replacing extents
  btrfs: fix race between reflinking and ordered extent completion
parents c898c67d 037e1274
......@@ -19,13 +19,23 @@ The main Btrfs features include:
* Subvolumes (separate internal filesystem roots)
* Object level mirroring and striping
* Checksums on data and metadata (multiple algorithms available)
* Compression
* Compression (multiple algorithms available)
* Reflink, deduplication
* Scrub (on-line checksum verification)
* Hierarchical quota groups (subvolume and snapshot support)
* Integrated multiple device support, with several raid algorithms
* Offline filesystem check
* Efficient incremental backup and FS mirroring
* Efficient incremental backup and FS mirroring (send/receive)
* Trim/discard
* Online filesystem defragmentation
* Swapfile support
* Zoned mode
* Read/write metadata verification
* Online resize (shrink, grow)
For more information please refer to the wiki
For more information please refer to the documentation site or wiki
https://btrfs.readthedocs.io
https://btrfs.wiki.kernel.org
......
......@@ -104,6 +104,7 @@ struct btrfs_block_group {
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
unsigned int zone_is_active:1;
unsigned int zoned_data_reloc_ongoing:1;
int disk_cache_state;
......
......@@ -1330,6 +1330,8 @@ struct btrfs_replace_extent_info {
* existing extent into a file range.
*/
bool is_new_extent;
/* Indicate if we should update the inode's mtime and ctime. */
bool update_times;
/* Meaningful only if is_new_extent is true. */
int qgroup_reserved;
/*
......
......@@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
if (block_group->ro) {
if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
ret = 1;
goto out;
}
......@@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
if (ret && ffe_ctl->for_data_reloc)
if (ret && ffe_ctl->for_data_reloc &&
fs_info->data_reloc_bg == block_group->start) {
/*
* Do not allow further allocations from this block group.
* Unlike marking the block group read-only (->ro), setting the
* ->zoned_data_reloc_ongoing flag still allows nocow
* writers to come in. See btrfs_inc_nocow_writers().
*
* We need to disable allocations here to avoid allocating a
* regular (non-relocation) data extent. With a mix of relocation
* extents and regular extents, we could dispatch WRITE commands
* (for relocation extents) and ZONE APPEND commands (for
* regular extents) to the same zone at the same time, which
* easily breaks the write pointer.
*/
block_group->zoned_data_reloc_ongoing = 1;
fs_info->data_reloc_bg = 0;
}
spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
......
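As a rough, purely user-space sketch of the constraint described in the comment above (invented names and a toy model, not btrfs or kernel code): a sequential zone only accepts writes at its current write pointer, so a pre-allocated regular WRITE can no longer land correctly once a concurrent ZONE APPEND has advanced that pointer.

/* User-space sketch of a sequential zone's write pointer; illustrative only. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct zone {
	uint64_t wp;	/* current write pointer, relative to zone start */
};

/* ZONE APPEND: the device chooses the location, always the write pointer. */
static uint64_t zone_append(struct zone *z, uint64_t len)
{
	uint64_t pos = z->wp;

	z->wp += len;
	return pos;	/* the caller only learns afterwards where the data landed */
}

/* Regular WRITE: the caller chooses the location, which must equal the wp. */
static bool zone_write(struct zone *z, uint64_t pos, uint64_t len)
{
	if (pos != z->wp)
		return false;	/* a real device rejects this with an I/O error */
	z->wp += len;
	return true;
}

int main(void)
{
	struct zone z = { .wp = 0 };

	zone_write(&z, 0, 4096);	/* relocation data, pre-allocated at 0: ok */
	zone_append(&z, 4096);		/* regular extent appended, wp is now 8192 */

	/* The next pre-allocated relocation WRITE targets 4096 but wp is 8192. */
	if (!zone_write(&z, 4096, 4096))
		printf("write pointer violation: device expects writes at %llu\n",
		       (unsigned long long)z.wp);
	return 0;
}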
......@@ -5241,13 +5241,14 @@ int extent_writepages(struct address_space *mapping,
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
ASSERT(ret <= 0);
if (ret < 0) {
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
end_write_bio(&epd, ret);
return ret;
}
flush_write_bio(&epd);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
}
......
......@@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret != BTRFS_NO_LOG_SYNC) {
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
goto out;
}
/* We successfully logged the inode, attempt to sync the log. */
if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
ret = btrfs_end_transaction(trans);
goto out;
}
}
if (!full_sync) {
ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_commit_transaction(trans);
} else {
}
/*
* At this point we need to commit the transaction because either
* btrfs_need_log_full_commit() was triggered or some other error occurred.
*
* If we didn't do a full sync we have to stop the trans handle, wait on
* the ordered extents, start it again and commit the transaction. If
* we attempted to wait on the ordered extents here we could deadlock with
* something like fallocate(), which may be holding the extent lock and
* trying to start a transaction while some other task is committing the
* transaction and we (fsync) still hold our transaction handle open.
*/
if (!full_sync) {
ret = btrfs_end_transaction(trans);
if (ret)
goto out;
ret = btrfs_wait_ordered_range(inode, start, len);
if (ret)
goto out;
/*
* This is safe to use here because we're only interested in
* making sure the transaction that had the ordered extents is
* committed. We aren't waiting on anything past this point,
* we're purely getting the transaction and committing it.
*/
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
/*
* We committed the transaction and there's no currently
* running transaction, this means everything we care
* about made it to disk and we are done.
*/
if (ret == -ENOENT)
ret = 0;
goto out;
}
}
ret = btrfs_commit_transaction(trans);
out:
ASSERT(list_empty(&ctx.list));
err = file_check_and_advance_wb_err(file);
......@@ -2719,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, false);
BUG_ON(ret);
if (WARN_ON(ret))
goto out_trans;
trans->block_rsv = rsv;
cur_offset = start;
......@@ -2803,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
extent_info->file_offset += replace_len;
}
/*
* We are about to release our handle on the transaction, balance the
* dirty pages of the btree inode, flush delayed items, and then get a
* new transaction handle, which may now point to a new transaction in
* case someone else committed the transaction we used to replace/drop
* file extent items. So bump the inode's iversion and update mtime and
* ctime, except if we are called from a dedupe context. This is because
* a power failure/crash may happen after the transaction is committed
* and before we finish replacing/dropping all the file extent items we
* need.
*/
inode_inc_iversion(&inode->vfs_inode);
if (!extent_info || extent_info->update_times) {
inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
}
ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
......@@ -2819,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
if (WARN_ON(ret))
break;
trans->block_rsv = rsv;
cur_offset = drop_args.drop_end;
......
......@@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes);
} else {
BUG_ON(root == fs_info->tree_root);
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
......@@ -9897,6 +9899,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
extent_info.file_offset = file_offset;
extent_info.extent_buf = (char *)&stack_fi;
extent_info.is_new_extent = true;
extent_info.update_times = true;
extent_info.qgroup_reserved = qgroup_released;
extent_info.insertions = 0;
......
......@@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
start_ns = ktime_get_ns();
down_read_nested(&eb->lock, nest);
eb->lock_owner = current->pid;
trace_btrfs_tree_read_lock(eb, start_ns);
}
......@@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (down_read_trylock(&eb->lock)) {
eb->lock_owner = current->pid;
trace_btrfs_try_tree_read_lock(eb);
return 1;
}
......@@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_read_unlock(eb);
eb->lock_owner = 0;
up_read(&eb->lock);
}
......
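For background on why the lock_owner stores above are dropped for read locks: a reader-writer lock admits any number of readers at once, so every reader writing its own pid into one shared field is a plain data race and the recorded value is meaningless anyway. Below is a minimal user-space analogue of that racy pattern (pthread-based sketch, not btrfs code).

/* Minimal user-space analogue of the flagged pattern; illustrative only. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long owner;	/* racy: written by every concurrent reader */

static void *reader(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&lock);
	owner = (unsigned long)pthread_self();	/* data race between readers */
	pthread_rwlock_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	pthread_create(&t[0], NULL, reader, NULL);
	pthread_create(&t[1], NULL, reader, NULL);
	pthread_join(t[0], NULL);
	pthread_join(t[1], NULL);
	printf("last recorded owner: %lu\n", owner);	/* whichever reader wrote last */
	return 0;
}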
......@@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
int ret;
const u64 len = olen_aligned;
u64 last_dest_end = destoff;
u64 prev_extent_end = off;
ret = -ENOMEM;
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
......@@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
key.offset = off;
while (1) {
u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
u64 extent_gen;
int type;
......@@ -431,14 +431,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
* The first search might have left us at an extent item that
* ends before our target range's start, can happen if we have
* holes and NO_HOLES feature enabled.
*
* Subsequent searches may leave us on a file range we have
* processed before - this happens due to a race with ordered
* extent completion for a file range that is outside our source
* range, but that range was part of a file extent item that
* also covered a leading part of our source range.
*/
if (key.offset + datal <= off) {
if (key.offset + datal <= prev_extent_end) {
path->slots[0]++;
goto process_slot;
} else if (key.offset >= off + len) {
break;
}
next_key_min_offset = key.offset + datal;
prev_extent_end = key.offset + datal;
size = btrfs_item_size(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
......@@ -489,6 +496,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.is_new_extent = false;
clone_info.update_times = !no_time_update;
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
drop_start, new_key.offset + datal - 1,
&clone_info, &trans);
......@@ -550,7 +558,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
break;
btrfs_release_path(path);
key.offset = next_key_min_offset;
key.offset = prev_extent_end;
if (fatal_signal_pending(current)) {
ret = -EINTR;
......
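To make the prev_extent_end change above concrete, here is a small user-space model of the skip check with invented offsets: the source range starts at 64K, the original extent item covered [0, 128K), and a completed ordered extent for [0, 64K) split it so that a later search lands again on its [64K, 128K) tail. Only the comparison is modeled here, not the btrfs code itself.

/* User-space model of the skip check; offsets are invented for illustration. */
#include <stdint.h>
#include <stdio.h>

struct item {
	uint64_t offset;	/* file offset the extent item starts at */
	uint64_t datal;		/* number of bytes it covers */
};

int main(void)
{
	const uint64_t off = 64 * 1024;			/* start of the source range */
	const uint64_t prev_extent_end = 128 * 1024;	/* end of the last cloned range */

	/* Trimmed tail of the original item, found again by the next search. */
	struct item found = { .offset = 64 * 1024, .datal = 64 * 1024 };

	/* Old check: only compares against the source start, so it reprocesses. */
	printf("old check skips: %d\n", found.offset + found.datal <= off);

	/* New check: compares against what was already processed, so it skips. */
	printf("new check skips: %d\n", found.offset + found.datal <= prev_extent_end);
	return 0;
}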
......@@ -2139,3 +2139,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
factor = div64_u64(used * 100, total);
return factor >= fs_info->bg_reclaim_threshold;
}
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
u64 length)
{
struct btrfs_block_group *block_group;
if (!btrfs_is_zoned(fs_info))
return;
block_group = btrfs_lookup_block_group(fs_info, logical);
/* It should be called on a previous data relocation block group. */
ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
spin_lock(&block_group->lock);
if (!block_group->zoned_data_reloc_ongoing)
goto out;
/* All relocation extents are written. */
if (block_group->start + block_group->alloc_offset == logical + length) {
/* Now, release this block group for further allocations. */
block_group->zoned_data_reloc_ongoing = 0;
}
out:
spin_unlock(&block_group->lock);
btrfs_put_block_group(block_group);
}
......@@ -77,6 +77,8 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
......@@ -243,6 +245,9 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
return false;
}
static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
u64 logical, u64 length) { }
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
......