Commit 3ee65c0f authored by Linus Torvalds

Merge tag 'for-5.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "A few more fixes for various problems that have user visible effects
  or seem to be urgent:

   - fix corruption when combining DIO and non-blocking io_uring over
     multiple extents (seen on MariaDB)

   - fix relocation crash due to premature return from commit

   - fix quota deadlock between rescan and qgroup removal

   - fix item data bounds checks in tree-checker (found on a fuzzed
     image)

   - fix fsync of prealloc extents after EOF

   - add missing run of delayed items after unlink during log replay

   - don't start relocation until snapshot drop is finished

   - fix reversed condition for subpage writers locking

   - fix warning on page error"

* tag 'for-5.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fallback to blocking mode when doing async dio over multiple extents
  btrfs: add missing run of delayed items after unlink during log replay
  btrfs: qgroup: fix deadlock between rescan worker and remove qgroup
  btrfs: fix relocation crash due to premature return from btrfs_commit_transaction()
  btrfs: do not start relocation until in progress drops are done
  btrfs: tree-checker: use u64 for item data end to avoid overflow
  btrfs: do not WARN_ON() if we have PageError set
  btrfs: fix lost prealloc extents beyond eof after full fsync
  btrfs: subpage: fix a wrong check on subpage->writers
parents f81664f7 ca93e44b
......@@ -602,6 +602,9 @@ enum {
/* Indicate that we want the transaction kthread to commit right now. */
BTRFS_FS_COMMIT_TRANS,
/* Indicate we have half completed snapshot deletions pending. */
BTRFS_FS_UNFINISHED_DROPS,
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
......@@ -1106,8 +1109,15 @@ enum {
BTRFS_ROOT_QGROUP_FLUSHING,
/* We started the orphan cleanup for this root. */
BTRFS_ROOT_ORPHAN_CLEANUP,
/* This root has a drop operation that was started previously. */
BTRFS_ROOT_UNFINISHED_DROP,
};
/*
 * Clear BTRFS_FS_UNFINISHED_DROPS in fs_info->flags and wake up any task
 * that is waiting on that bit.
 */
static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
	unsigned long *fs_flags = &fs_info->flags;

	clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, fs_flags);
}
/*
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
* code. For detail check comment in fs/btrfs/qgroup.c.
......
......@@ -3813,6 +3813,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
/* Kick the cleaner thread so it'll start deleting snapshots. */
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
clear_oneshot:
btrfs_clear_oneshot_options(fs_info);
return 0;
......@@ -4538,6 +4542,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
kthread_park(fs_info->cleaner_kthread);
/*
* If we had UNFINISHED_DROPS we could still be processing them, so
* clear that bit and wake up relocation so it can stop.
*/
btrfs_wake_unfinished_drop(fs_info);
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
......
......@@ -5622,6 +5622,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
int ret;
int level;
bool root_dropped = false;
bool unfinished_drop = false;
btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
......@@ -5664,6 +5665,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
* already dropped.
*/
set_bit(BTRFS_ROOT_DELETING, &root->state);
unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
......@@ -5838,6 +5841,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
kfree(wc);
btrfs_free_path(path);
out:
/*
* We were an unfinished drop root, check to see if there are any
* pending, and if not clear and wake up any waiters.
*/
if (!err && unfinished_drop)
btrfs_maybe_wake_unfinished_drop(fs_info);
/*
* So if we need to stop dropping the snapshot for whatever reason we
* need to make sure to add it back to the dead root list so that we
......
......@@ -6841,14 +6841,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
{
struct btrfs_fs_info *fs_info = eb->fs_info;
/*
* If we are using the commit root we could potentially clear a page
* Uptodate while we're using the extent buffer that we've previously
* looked up. We don't want to complain in this case, as the page was
* valid before, we just didn't write it out. Instead we want to catch
* the case where we didn't actually read the block properly, which
* would have !PageUptodate && !PageError, as we clear PageError before
* reading.
*/
if (fs_info->sectorsize < PAGE_SIZE) {
bool uptodate;
bool uptodate, error;
uptodate = btrfs_subpage_test_uptodate(fs_info, page,
eb->start, eb->len);
WARN_ON(!uptodate);
error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
WARN_ON(!uptodate && !error);
} else {
WARN_ON(!PageUptodate(page));
WARN_ON(!PageUptodate(page) && !PageError(page));
}
}
......
......@@ -7600,6 +7600,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
}
len = min(len, em->len - (start - em->start));
/*
* If we have a NOWAIT request and the range contains multiple extents
* (or a mix of extents and holes), then we return -EAGAIN to make the
* caller fallback to a context where it can do a blocking (without
* NOWAIT) request. This way we avoid doing partial IO and returning
* success to the caller, which is not optimal for writes and for reads
* it can result in unexpected behaviour for an application.
*
* When doing a read, because we use IOMAP_DIO_PARTIAL when calling
* iomap_dio_rw(), we can end up returning less data than what the caller
* asked for, resulting in an unexpected, and incorrect, short read.
* That is, the caller asked to read N bytes and we return less than that,
* which is wrong unless we are crossing EOF. This happens if we get a
* page fault error when trying to fault in pages for the buffer that is
* associated to the struct iov_iter passed to iomap_dio_rw(), and we
* have previously submitted bios for other extents in the range, in
* which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
* those bios have completed by the time we get the page fault error,
* which we return back to our caller - we should only return EIOCBQUEUED
* after we have submitted bios for all the extents in the range.
*/
if ((flags & IOMAP_NOWAIT) && len < length) {
free_extent_map(em);
ret = -EAGAIN;
goto unlock_err;
}
if (write) {
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
start, len);
......
......@@ -1196,6 +1196,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
if (!fs_info->quota_root)
goto out;
/*
* Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
* complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
* to lock that mutex while holding a transaction handle and the rescan
* worker needs to commit a transaction.
*/
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* Request qgroup rescan worker to complete and wait for it. This wait
* must be done before transaction start for quota disable since it may
......@@ -1203,7 +1211,6 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
*/
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* 1 For the root item
......
......@@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
int rw = 0;
int err = 0;
/*
* This only gets set if we had a half-deleted snapshot on mount. We
* cannot allow relocation to start while we're still trying to clean up
* these pending deletions.
*/
ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
if (ret)
return ret;
/* We may have been woken up by close_ctree, so bail if we're closing. */
if (btrfs_fs_closing(fs_info))
return -EINTR;
bg = btrfs_lookup_block_group(fs_info, group_start);
if (!bg)
return -ENOENT;
......
......@@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
struct btrfs_key drop_key;
btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
/*
* If we have a non-zero drop_progress then we know we
* made it partly through deleting this snapshot, and
* thus we need to make sure we block any balance from
* happening until this snapshot is completely dropped.
*/
if (drop_key.objectid != 0 || drop_key.type != 0 ||
drop_key.offset != 0) {
set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
}
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
......
......@@ -736,7 +736,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
* Since we own the page lock, no one else could touch subpage::writers
* and we are safe to do several atomic operations without spinlock.
*/
if (atomic_read(&subpage->writers))
if (atomic_read(&subpage->writers) == 0)
/* No writers, locked by plain lock_page() */
return unlock_page(page);
......
......@@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
/*
* Block until @commit has reached at least @min_state.
*
* When @min_state is TRANS_STATE_COMPLETED or later, the loop additionally
* looks at the first entry of fs_info->trans_list after each wait and, as
* long as that entry's transid is not newer than the transid we started
* with, waits on that transaction too.
*/
static noinline void wait_for_commit(struct btrfs_transaction *commit,
const enum btrfs_trans_state min_state)
{
/*
* NOTE(review): this page renders the diff without +/- markers; the single
* wait below looks like the pre-change body that the loop replaces.
* Confirm against the raw diff.
*/
wait_event(commit->commit_wait, commit->state >= min_state);
struct btrfs_fs_info *fs_info = commit->fs_info;
/* Saved up front because @commit is reassigned inside the loop. */
u64 transid = commit->transid;
/* Set once @commit points at a transaction we took our own reference on. */
bool put = false;
while (1) {
wait_event(commit->commit_wait, commit->state >= min_state);
/* Drop the reference taken at the end of the previous iteration. */
if (put)
btrfs_put_transaction(commit);
/* Waits for a state below COMPLETED skip the list scan entirely. */
if (min_state < TRANS_STATE_COMPLETED)
break;
/*
* A transaction isn't really completed until all of the
* previous transactions are completed, but with fsync we can
* end up with SUPER_COMMITTED transactions before a COMPLETED
* transaction. Wait for those.
*/
spin_lock(&fs_info->trans_lock);
commit = list_first_entry_or_null(&fs_info->trans_list,
struct btrfs_transaction,
list);
/* Stop when the list is empty or its first entry is newer than ours. */
if (!commit || commit->transid > transid) {
spin_unlock(&fs_info->trans_lock);
break;
}
/*
* Take a reference while still holding trans_lock; it is dropped
* after the next wait_event() at the top of the loop.
*/
refcount_inc(&commit->use_count);
put = true;
spin_unlock(&fs_info->trans_lock);
}
}
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
......@@ -1319,6 +1349,32 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
return 0;
}
/*
 * Called after a root that had a pending (unfinished) drop is done: if no
 * other such root remains on the dead roots list, clear the fs-wide bit and
 * wake up whoever is waiting on it.
 */
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
	bool drops_pending = false;

	/*
	 * Roots with a drop in progress are queued at the head of dead_roots,
	 * so inspecting the first entry is enough: if it does not carry
	 * UNFINISHED_DROP, nothing behind it does either.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&fs_info->dead_roots)) {
		struct btrfs_root *head;

		head = list_first_entry(&fs_info->dead_roots,
					struct btrfs_root, root_list);
		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &head->state))
			drops_pending = true;
	}
	spin_unlock(&fs_info->trans_lock);

	/* The wake up is done only after trans_lock has been released. */
	if (!drops_pending)
		btrfs_wake_unfinished_drop(fs_info);
}
/*
* dead roots are old snapshots that need to be deleted. This allocates
* a dirty root struct and adds it into the list of dead roots that need to
......@@ -1331,7 +1387,12 @@ void btrfs_add_dead_root(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (list_empty(&root->root_list)) {
btrfs_grab_root(root);
list_add_tail(&root->root_list, &fs_info->dead_roots);
/* We want to process the partially complete drops first. */
if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
list_add(&root->root_list, &fs_info->dead_roots);
else
list_add_tail(&root->root_list, &fs_info->dead_roots);
}
spin_unlock(&fs_info->trans_lock);
}
......
......@@ -216,6 +216,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
......
......@@ -1682,6 +1682,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
*/
for (slot = 0; slot < nritems; slot++) {
u32 item_end_expected;
u64 item_data_end;
int ret;
btrfs_item_key_to_cpu(leaf, &key, slot);
......@@ -1696,6 +1697,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
return -EUCLEAN;
}
item_data_end = (u64)btrfs_item_offset(leaf, slot) +
btrfs_item_size(leaf, slot);
/*
* Make sure the offset and ends are right, remember that the
* item data starts at the end of the leaf and grows towards the
......@@ -1706,11 +1709,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
else
item_end_expected = btrfs_item_offset(leaf,
slot - 1);
if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) {
if (unlikely(item_data_end != item_end_expected)) {
generic_err(leaf, slot,
"unexpected item end, have %u expect %u",
btrfs_item_data_end(leaf, slot),
item_end_expected);
"unexpected item end, have %llu expect %u",
item_data_end, item_end_expected);
return -EUCLEAN;
}
......@@ -1719,12 +1721,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
if (unlikely(btrfs_item_data_end(leaf, slot) >
BTRFS_LEAF_DATA_SIZE(fs_info))) {
if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) {
generic_err(leaf, slot,
"slot end outside of leaf, have %u expect range [0, %u]",
btrfs_item_data_end(leaf, slot),
BTRFS_LEAF_DATA_SIZE(fs_info));
"slot end outside of leaf, have %llu expect range [0, %u]",
item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
......
......@@ -1362,6 +1362,15 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
inode, name, namelen);
kfree(name);
iput(dir);
/*
* Whenever we need to check if a name exists or not, we
* check the subvolume tree. So after an unlink we must
* run delayed items, so that future checks for a name
* during log replay see that the name does not exist
* anymore.
*/
if (!ret)
ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
goto again;
......@@ -1614,6 +1623,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
*/
if (!ret && inode->i_nlink == 0)
inc_nlink(inode);
/*
* Whenever we need to check if a name exists or
* not, we check the subvolume tree. So after an
* unlink we must run delayed items, so that future
* checks for a name during log replay see that the
* name does not exist anymore.
*/
if (!ret)
ret = btrfs_run_delayed_items(trans);
}
if (ret < 0)
goto out;
......@@ -4635,7 +4653,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
* lose them after doing a fast fsync and replaying the log. We scan the
* lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
......@@ -5414,6 +5432,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
......@@ -5434,13 +5453,21 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
if (min_key->type > max_key->type)
break;
if (min_key->type == BTRFS_INODE_ITEM_KEY)
if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
if ((min_key->type == BTRFS_INODE_REF_KEY ||
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
inode->generation == trans->transid &&
!recursive_logging) {
} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
min_key->offset >= i_size) {
/*
* Extents at and beyond eof are logged with
* btrfs_log_prealloc_extents().
* Only regular files have BTRFS_EXTENT_DATA_KEY keys,
* and no keys greater than that, so bail out.
*/
break;
} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
inode->generation == trans->transid &&
!recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
......@@ -5471,10 +5498,8 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
goto next_key;
}
}
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
......@@ -5527,9 +5552,21 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
break;
}
}
if (ins_nr)
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret)
return ret;
}
if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
/*
* Release the path because otherwise we might attempt to double
* lock the same leaf with btrfs_log_prealloc_extents() below.
*/
btrfs_release_path(path);
ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
}
return ret;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment