Commit d4b450cd authored by Filipe Manana, committed by Chris Mason

Btrfs: fix race between transaction commit and empty block group removal

Committing a transaction can race with automatic removal of empty block
groups (cleaner kthread), leading to a BUG_ON() in the transaction
commit code while running btrfs_finish_extent_commit(). The following
sequence diagram shows how it can happen:

           CPU 1                                       CPU 2

btrfs_commit_transaction()
  fs_info->running_transaction = NULL
  btrfs_finish_extent_commit()
    find_first_extent_bit()
      -> found range for block group X
         in fs_info->freed_extents[]

                                               btrfs_delete_unused_bgs()
                                                 -> found block group X

                                                 Removed block group X's range
                                                 from fs_info->freed_extents[]

                                                 btrfs_remove_chunk()
                                                    btrfs_remove_block_group(bg X)

    unpin_extent_range(bg X range)
       btrfs_lookup_block_group(bg X)
          -> returns NULL
            -> BUG_ON()

The trace that results from the BUG_ON() is:

[48665.187808] ------------[ cut here ]------------
[48665.188032] kernel BUG at fs/btrfs/extent-tree.c:5675!
[48665.188032] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC
[48665.188032] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop parport_pc evdev microcode
[48665.197388] CPU: 2 PID: 31211 Comm: kworker/u32:16 Tainted: G        W      3.19.0-rc5-btrfs-next-4+ #1
[48665.197388] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[48665.197388] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs]
[48665.197388] task: ffff880222011810 ti: ffff8801b56a4000 task.ti: ffff8801b56a4000
[48665.197388] RIP: 0010:[<ffffffffa0350d05>]  [<ffffffffa0350d05>] unpin_extent_range+0x6a/0x1ba [btrfs]
[48665.197388] RSP: 0018:ffff8801b56a7b88  EFLAGS: 00010246
[48665.197388] RAX: 0000000000000000 RBX: ffff8802143a6000 RCX: ffff8802220120c8
[48665.197388] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8800a3c140b0
[48665.197388] RBP: ffff8801b56a7bd8 R08: 0000000000000003 R09: 0000000000000000
[48665.197388] R10: 0000000000000000 R11: 000000000000bbac R12: 0000000012e8e000
[48665.197388] R13: ffff8800a3c14000 R14: 0000000000000000 R15: 0000000000000000
[48665.197388] FS:  0000000000000000(0000) GS:ffff88023ec40000(0000) knlGS:0000000000000000
[48665.197388] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[48665.197388] CR2: 00007f065e42f270 CR3: 0000000206f70000 CR4: 00000000000006e0
[48665.197388] Stack:
[48665.197388]  ffff8801b56a7bd8 0000000012ea0000 01ff8800a3c14138 0000000012e9ffff
[48665.197388]  ffff880141df3dd8 ffff8802143a6000 ffff8800a3c14138 ffff880141df3df0
[48665.197388]  ffff880141df3dd8 0000000000000000 ffff8801b56a7c08 ffffffffa0354227
[48665.197388] Call Trace:
[48665.197388]  [<ffffffffa0354227>] btrfs_finish_extent_commit+0xb0/0xd9 [btrfs]
[48665.197388]  [<ffffffffa0366b4b>] btrfs_commit_transaction+0x791/0x92c [btrfs]
[48665.197388]  [<ffffffffa0352432>] flush_space+0x43d/0x452 [btrfs]
[48665.197388]  [<ffffffff814295c3>] ? _raw_spin_unlock+0x28/0x33
[48665.197388]  [<ffffffffa035255f>] btrfs_async_reclaim_metadata_space+0x118/0x164 [btrfs]
[48665.197388]  [<ffffffff81059917>] ? process_one_work+0x14b/0x3ab
[48665.197388]  [<ffffffff810599ac>] process_one_work+0x1e0/0x3ab
[48665.197388]  [<ffffffff81079fa9>] ? trace_hardirqs_off+0xd/0xf
[48665.197388]  [<ffffffff8105a55b>] worker_thread+0x210/0x2d0
[48665.197388]  [<ffffffff8105a34b>] ? rescuer_thread+0x2c3/0x2c3
[48665.197388]  [<ffffffff8105e5c0>] kthread+0xef/0xf7
[48665.197388]  [<ffffffff81429682>] ? _raw_spin_unlock_irq+0x2d/0x39
[48665.197388]  [<ffffffff8105e4d1>] ? __kthread_parkme+0xad/0xad
[48665.197388]  [<ffffffff81429dec>] ret_from_fork+0x7c/0xb0
[48665.197388]  [<ffffffff8105e4d1>] ? __kthread_parkme+0xad/0xad
[48665.197388] Code: 85 f6 74 14 49 8b 06 49 03 46 09 49 39 c4 72 1d 4c 89 f7 e8 83 ec ff ff 4c 89 e6 4c 89 ef e8 1e f1 ff ff 48 85 c0 49 89 c6 75 02 <0f> 0b 49 8b 1e 49 03 5e 09 48 8b
[48665.197388] RIP  [<ffffffffa0350d05>] unpin_extent_range+0x6a/0x1ba [btrfs]
[48665.197388]  RSP <ffff8801b56a7b88>
[48665.272246] ---[ end trace b9c6ab9957521376 ]---

Fix this by ensuring that unpinning the block group's range in
btrfs_finish_extent_commit() is done in a synchronized fashion
with removing the block group's range from freed_extents[]
in btrfs_delete_unused_bgs().

This race got introduced with the change:

    Btrfs: remove empty block groups automatically
    commit 47ab2a6c

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
parent e3540eab
...@@ -1744,6 +1744,7 @@ struct btrfs_fs_info { ...@@ -1744,6 +1744,7 @@ struct btrfs_fs_info {
spinlock_t unused_bgs_lock; spinlock_t unused_bgs_lock;
struct list_head unused_bgs; struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
/* For btrfs to record security options */ /* For btrfs to record security options */
struct security_mnt_opts security_opts; struct security_mnt_opts security_opts;
......
...@@ -2242,6 +2242,7 @@ int open_ctree(struct super_block *sb, ...@@ -2242,6 +2242,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->unused_bgs_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
rwlock_init(&fs_info->tree_mod_log_lock); rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->delalloc_root_mutex);
......
...@@ -5735,10 +5735,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ...@@ -5735,10 +5735,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
unpin = &fs_info->freed_extents[0]; unpin = &fs_info->freed_extents[0];
while (1) { while (1) {
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end, ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL); EXTENT_DIRTY, NULL);
if (ret) if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break; break;
}
if (btrfs_test_opt(root, DISCARD)) if (btrfs_test_opt(root, DISCARD))
ret = btrfs_discard_extent(root, start, ret = btrfs_discard_extent(root, start,
...@@ -5746,6 +5749,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ...@@ -5746,6 +5749,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
clear_extent_dirty(unpin, start, end, GFP_NOFS); clear_extent_dirty(unpin, start, end, GFP_NOFS);
unpin_extent_range(root, start, end, true); unpin_extent_range(root, start, end, true);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched(); cond_resched();
} }
...@@ -9561,18 +9565,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) ...@@ -9561,18 +9565,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/ */
start = block_group->key.objectid; start = block_group->key.objectid;
end = start + block_group->key.offset - 1; end = start + block_group->key.offset - 1;
/*
* Hold the unused_bg_unpin_mutex lock to avoid racing with
* btrfs_finish_extent_commit(). If we are at transaction N,
* another task might be running finish_extent_commit() for the
* previous transaction N - 1, and have seen a range belonging
* to the block group in freed_extents[] before we were able to
* clear the whole block group range from freed_extents[]. This
* means that task can lookup for the block group after we
* unpinned it from freed_extents[] and removed it, leading to
* a BUG_ON() at btrfs_unpin_extent_range().
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
EXTENT_DIRTY, GFP_NOFS); EXTENT_DIRTY, GFP_NOFS);
if (ret) { if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_set_block_group_rw(root, block_group); btrfs_set_block_group_rw(root, block_group);
goto end_trans; goto end_trans;
} }
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS); EXTENT_DIRTY, GFP_NOFS);
if (ret) { if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_set_block_group_rw(root, block_group); btrfs_set_block_group_rw(root, block_group);
goto end_trans; goto end_trans;
} }
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/* Reset pinned so btrfs_put_block_group doesn't complain */ /* Reset pinned so btrfs_put_block_group doesn't complain */
block_group->pinned = 0; block_group->pinned = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment