Commit 81aa0968 authored by Linus Torvalds

Merge tag 'for-5.12-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "There are still regressions being found and fixed in the zoned mode
  and subpage code, the rest are fixes for bugs reported by users.

  Regressions:

   - subpage block support:
      - readahead works on the proper block size
      - fix last page zeroing

   - zoned mode:
      - linked list corruption for tree log

  Fixes:

   - qgroup leak after falloc failure

   - tree mod log and backref resolving:
      - extent buffer cloning race when resolving backrefs
      - pin deleted leaves with active tree mod log users

   - drop debugging flag from slab cache"

* tag 'for-5.12-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: always pin deleted leaves when there are active tree mod log users
  btrfs: fix race when cloning extent buffer during rewind of an old root
  btrfs: fix slab cache flags for free space tree bitmap
  btrfs: subpage: make readahead work properly
  btrfs: subpage: fix wild pointer access during metadata read failure
  btrfs: zoned: fix linked list corruption after log root tree allocation failure
  btrfs: fix qgroup data rsv leak caused by falloc failure
  btrfs: track qgroup released data in own variable in insert_prealloc_file_extent
  btrfs: fix wrong offset to zero out range beyond i_size
parents dc033799 485df755
...@@ -1365,7 +1365,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) ...@@ -1365,7 +1365,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
"failed to read tree block %llu from get_old_root", "failed to read tree block %llu from get_old_root",
logical); logical);
} else { } else {
btrfs_tree_read_lock(old);
eb = btrfs_clone_extent_buffer(old); eb = btrfs_clone_extent_buffer(old);
btrfs_tree_read_unlock(old);
free_extent_buffer(old); free_extent_buffer(old);
} }
} else if (old_root) { } else if (old_root) {
......
...@@ -3323,6 +3323,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, ...@@ -3323,6 +3323,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (last_ref && btrfs_header_generation(buf) == trans->transid) { if (last_ref && btrfs_header_generation(buf) == trans->transid) {
struct btrfs_block_group *cache; struct btrfs_block_group *cache;
bool must_pin = false;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start); ret = check_ref_cleanup(trans, buf->start);
...@@ -3340,7 +3341,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, ...@@ -3340,7 +3341,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
goto out; goto out;
} }
if (btrfs_is_zoned(fs_info)) { /*
* If this is a leaf and there are tree mod log users, we may
* have recorded mod log operations that point to this leaf.
* So we must make sure no one reuses this leaf's extent before
* mod log operations are applied to a node, otherwise after
* rewinding a node using the mod log operations we get an
* inconsistent btree, as the leaf's extent may now be used as
* a node or leaf for another different btree.
* We are safe from races here because at this point no other
* node or root points to this extent buffer, so if after this
* check a new tree mod log user joins, it will not be able to
* find a node pointing to this leaf and record operations that
* point to this leaf.
*/
if (btrfs_header_level(buf) == 0) {
read_lock(&fs_info->tree_mod_log_lock);
must_pin = !list_empty(&fs_info->tree_mod_seq_list);
read_unlock(&fs_info->tree_mod_log_lock);
}
if (must_pin || btrfs_is_zoned(fs_info)) {
btrfs_redirty_list_add(trans->transaction, buf); btrfs_redirty_list_add(trans->transaction, buf);
pin_down_extent(trans, cache, buf->start, buf->len, 1); pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache); btrfs_put_block_group(cache);
......
...@@ -2885,6 +2885,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) ...@@ -2885,6 +2885,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
btrfs_subpage_end_reader(fs_info, page, start, len); btrfs_subpage_end_reader(fs_info, page, start, len);
} }
/*
 * Find the extent buffer for a given bytenr.
 *
 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
 * in endio context.
 *
 * @fs_info: filesystem info; supplies sectorsize, sectorsize_bits and the
 *           buffer_radix tree used for the lookup
 * @page:    page the read completed on; only consulted when
 *           sectorsize == PAGE_SIZE
 * @bytenr:  byte number of the buffer; shifted right by sectorsize_bits to
 *           form the radix tree index in the subpage case
 *
 * Return: the extent buffer taken from page->private (regular sectorsize) or
 * from the buffer radix tree (subpage). Both paths ASSERT that a buffer was
 * found.
 *
 * NOTE(review): no reference is taken on the returned buffer here, and the
 * radix tree lookup result is used after rcu_read_unlock(); presumably the
 * in-flight read keeps the buffer alive - confirm against the caller.
 */
static struct extent_buffer *find_extent_buffer_readpage(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
struct extent_buffer *eb;
/*
 * For regular sectorsize, we can use page->private to grab extent
 * buffer
 */
if (fs_info->sectorsize == PAGE_SIZE) {
ASSERT(PagePrivate(page) && page->private);
return (struct extent_buffer *)page->private;
}
/*
 * For subpage case, we need to lookup buffer radix tree. The lookup is
 * done under rcu_read_lock() only; no other lock is taken here.
 */
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
bytenr >> fs_info->sectorsize_bits);
rcu_read_unlock();
ASSERT(eb);
return eb;
}
/* /*
* after a readpage IO is done, we need to: * after a readpage IO is done, we need to:
* clear the uptodate bits on error * clear the uptodate bits on error
...@@ -2996,7 +3025,7 @@ static void end_bio_extent_readpage(struct bio *bio) ...@@ -2996,7 +3025,7 @@ static void end_bio_extent_readpage(struct bio *bio)
} else { } else {
struct extent_buffer *eb; struct extent_buffer *eb;
eb = (struct extent_buffer *)page->private; eb = find_extent_buffer_readpage(fs_info, page, start);
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = mirror; eb->read_mirror = mirror;
atomic_dec(&eb->io_pages); atomic_dec(&eb->io_pages);
...@@ -3020,7 +3049,7 @@ static void end_bio_extent_readpage(struct bio *bio) ...@@ -3020,7 +3049,7 @@ static void end_bio_extent_readpage(struct bio *bio)
*/ */
if (page->index == end_index && i_size <= end) { if (page->index == end_index && i_size <= end) {
u32 zero_start = max(offset_in_page(i_size), u32 zero_start = max(offset_in_page(i_size),
offset_in_page(end)); offset_in_page(start));
zero_user_segment(page, zero_start, zero_user_segment(page, zero_start,
offset_in_page(end) + 1); offset_in_page(end) + 1);
......
...@@ -9008,7 +9008,7 @@ int __init btrfs_init_cachep(void) ...@@ -9008,7 +9008,7 @@ int __init btrfs_init_cachep(void)
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE,
SLAB_RED_ZONE, NULL); SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_bitmap_cachep) if (!btrfs_free_space_bitmap_cachep)
goto fail; goto fail;
...@@ -9877,6 +9877,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( ...@@ -9877,6 +9877,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_path *path; struct btrfs_path *path;
u64 start = ins->objectid; u64 start = ins->objectid;
u64 len = ins->offset; u64 len = ins->offset;
int qgroup_released;
int ret; int ret;
memset(&stack_fi, 0, sizeof(stack_fi)); memset(&stack_fi, 0, sizeof(stack_fi));
...@@ -9889,16 +9890,16 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( ...@@ -9889,16 +9890,16 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */ /* Encryption and other encoding is reserved and all 0 */
ret = btrfs_qgroup_release_data(inode, file_offset, len); qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
if (ret < 0) if (qgroup_released < 0)
return ERR_PTR(ret); return ERR_PTR(qgroup_released);
if (trans) { if (trans) {
ret = insert_reserved_file_extent(trans, inode, ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi, file_offset, &stack_fi,
true, ret); true, qgroup_released);
if (ret) if (ret)
return ERR_PTR(ret); goto free_qgroup;
return trans; return trans;
} }
...@@ -9909,21 +9910,35 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( ...@@ -9909,21 +9910,35 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
extent_info.file_offset = file_offset; extent_info.file_offset = file_offset;
extent_info.extent_buf = (char *)&stack_fi; extent_info.extent_buf = (char *)&stack_fi;
extent_info.is_new_extent = true; extent_info.is_new_extent = true;
extent_info.qgroup_reserved = ret; extent_info.qgroup_reserved = qgroup_released;
extent_info.insertions = 0; extent_info.insertions = 0;
path = btrfs_alloc_path(); path = btrfs_alloc_path();
if (!path) if (!path) {
return ERR_PTR(-ENOMEM); ret = -ENOMEM;
goto free_qgroup;
}
ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset, ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
file_offset + len - 1, &extent_info, file_offset + len - 1, &extent_info,
&trans); &trans);
btrfs_free_path(path); btrfs_free_path(path);
if (ret) if (ret)
return ERR_PTR(ret); goto free_qgroup;
return trans; return trans;
free_qgroup:
/*
* We have released qgroup data range at the beginning of the function,
* and normally qgroup_released bytes will be freed when committing
* transaction.
* But if we error out early, we have to free what we have released
* or we leak qgroup data reservation.
*/
btrfs_qgroup_free_refroot(inode->root->fs_info,
inode->root->root_key.objectid, qgroup_released,
BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(ret);
} }
static int __btrfs_prealloc_file_range(struct inode *inode, int mode, static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
......
...@@ -209,7 +209,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err) ...@@ -209,7 +209,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err)
/* find extent */ /* find extent */
spin_lock(&fs_info->reada_lock); spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree, re = radix_tree_lookup(&fs_info->reada_tree,
eb->start >> PAGE_SHIFT); eb->start >> fs_info->sectorsize_bits);
if (re) if (re)
re->refcnt++; re->refcnt++;
spin_unlock(&fs_info->reada_lock); spin_unlock(&fs_info->reada_lock);
...@@ -240,7 +240,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, ...@@ -240,7 +240,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
zone = NULL; zone = NULL;
spin_lock(&fs_info->reada_lock); spin_lock(&fs_info->reada_lock);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
logical >> PAGE_SHIFT, 1); logical >> fs_info->sectorsize_bits, 1);
if (ret == 1 && logical >= zone->start && logical <= zone->end) { if (ret == 1 && logical >= zone->start && logical <= zone->end) {
kref_get(&zone->refcnt); kref_get(&zone->refcnt);
spin_unlock(&fs_info->reada_lock); spin_unlock(&fs_info->reada_lock);
...@@ -283,13 +283,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, ...@@ -283,13 +283,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
spin_lock(&fs_info->reada_lock); spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones, ret = radix_tree_insert(&dev->reada_zones,
(unsigned long)(zone->end >> PAGE_SHIFT), (unsigned long)(zone->end >> fs_info->sectorsize_bits),
zone); zone);
if (ret == -EEXIST) { if (ret == -EEXIST) {
kfree(zone); kfree(zone);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
logical >> PAGE_SHIFT, 1); logical >> fs_info->sectorsize_bits, 1);
if (ret == 1 && logical >= zone->start && logical <= zone->end) if (ret == 1 && logical >= zone->start && logical <= zone->end)
kref_get(&zone->refcnt); kref_get(&zone->refcnt);
else else
...@@ -315,7 +315,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, ...@@ -315,7 +315,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
u64 length; u64 length;
int real_stripes; int real_stripes;
int nzones = 0; int nzones = 0;
unsigned long index = logical >> PAGE_SHIFT; unsigned long index = logical >> fs_info->sectorsize_bits;
int dev_replace_is_ongoing; int dev_replace_is_ongoing;
int have_zone = 0; int have_zone = 0;
...@@ -497,7 +497,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, ...@@ -497,7 +497,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
struct reada_extent *re) struct reada_extent *re)
{ {
int i; int i;
unsigned long index = re->logical >> PAGE_SHIFT; unsigned long index = re->logical >> fs_info->sectorsize_bits;
spin_lock(&fs_info->reada_lock); spin_lock(&fs_info->reada_lock);
if (--re->refcnt) { if (--re->refcnt) {
...@@ -538,11 +538,12 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, ...@@ -538,11 +538,12 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
static void reada_zone_release(struct kref *kref) static void reada_zone_release(struct kref *kref)
{ {
struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
struct btrfs_fs_info *fs_info = zone->device->fs_info;
lockdep_assert_held(&zone->device->fs_info->reada_lock); lockdep_assert_held(&fs_info->reada_lock);
radix_tree_delete(&zone->device->reada_zones, radix_tree_delete(&zone->device->reada_zones,
zone->end >> PAGE_SHIFT); zone->end >> fs_info->sectorsize_bits);
kfree(zone); kfree(zone);
} }
...@@ -593,7 +594,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, ...@@ -593,7 +594,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{ {
int i; int i;
unsigned long index = zone->end >> PAGE_SHIFT; unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;
for (i = 0; i < zone->ndevs; ++i) { for (i = 0; i < zone->ndevs; ++i) {
struct reada_zone *peer; struct reada_zone *peer;
...@@ -628,7 +629,7 @@ static int reada_pick_zone(struct btrfs_device *dev) ...@@ -628,7 +629,7 @@ static int reada_pick_zone(struct btrfs_device *dev)
(void **)&zone, index, 1); (void **)&zone, index, 1);
if (ret == 0) if (ret == 0)
break; break;
index = (zone->end >> PAGE_SHIFT) + 1; index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
if (zone->locked) { if (zone->locked) {
if (zone->elems > top_locked_elems) { if (zone->elems > top_locked_elems) {
top_locked_elems = zone->elems; top_locked_elems = zone->elems;
...@@ -709,7 +710,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev) ...@@ -709,7 +710,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
* plugging to speed things up * plugging to speed things up
*/ */
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
dev->reada_next >> PAGE_SHIFT, 1); dev->reada_next >> fs_info->sectorsize_bits, 1);
if (ret == 0 || re->logical > dev->reada_curr_zone->end) { if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
ret = reada_pick_zone(dev); ret = reada_pick_zone(dev);
if (!ret) { if (!ret) {
...@@ -718,7 +719,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev) ...@@ -718,7 +719,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
} }
re = NULL; re = NULL;
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
dev->reada_next >> PAGE_SHIFT, 1); dev->reada_next >> fs_info->sectorsize_bits, 1);
} }
if (ret == 0) { if (ret == 0) {
spin_unlock(&fs_info->reada_lock); spin_unlock(&fs_info->reada_lock);
...@@ -885,7 +886,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) ...@@ -885,7 +886,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
pr_cont(" curr off %llu", pr_cont(" curr off %llu",
device->reada_next - zone->start); device->reada_next - zone->start);
pr_cont("\n"); pr_cont("\n");
index = (zone->end >> PAGE_SHIFT) + 1; index = (zone->end >> fs_info->sectorsize_bits) + 1;
} }
cnt = 0; cnt = 0;
index = 0; index = 0;
...@@ -910,7 +911,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) ...@@ -910,7 +911,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
} }
} }
pr_cont("\n"); pr_cont("\n");
index = (re->logical >> PAGE_SHIFT) + 1; index = (re->logical >> fs_info->sectorsize_bits) + 1;
if (++cnt > 15) if (++cnt > 15)
break; break;
} }
...@@ -926,7 +927,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) ...@@ -926,7 +927,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
if (ret == 0) if (ret == 0)
break; break;
if (!re->scheduled) { if (!re->scheduled) {
index = (re->logical >> PAGE_SHIFT) + 1; index = (re->logical >> fs_info->sectorsize_bits) + 1;
continue; continue;
} }
pr_debug("re: logical %llu size %u list empty %d scheduled %d", pr_debug("re: logical %llu size %u list empty %d scheduled %d",
...@@ -942,7 +943,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) ...@@ -942,7 +943,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
} }
} }
pr_cont("\n"); pr_cont("\n");
index = (re->logical >> PAGE_SHIFT) + 1; index = (re->logical >> fs_info->sectorsize_bits) + 1;
} }
spin_unlock(&fs_info->reada_lock); spin_unlock(&fs_info->reada_lock);
} }
......
...@@ -3169,10 +3169,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ...@@ -3169,10 +3169,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex); mutex_lock(&log_root_tree->log_mutex);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
if (btrfs_is_zoned(fs_info)) { if (btrfs_is_zoned(fs_info)) {
if (!log_root_tree->node) { if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree); ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
...@@ -3183,6 +3179,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ...@@ -3183,6 +3179,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
} }
} }
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
/* /*
* Now we are safe to update the log_root_tree because we're under the * Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit * log_mutex, and we're a current writer so we're holding the commit
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment