Commit 0e24f6d8 authored by Josef Bacik, committed by David Sterba

btrfs: do not infinite loop in data reclaim if we aborted

Error injection stress testing uncovered a busy loop in our data reclaim
code.  There are two cases here: one where we loop creating block groups
until space_info->full is set, and one in the main loop where we skip
erroring out any tickets if space_info->full == 0.  Unfortunately, if we
aborted the transaction then we will never allocate chunks or reclaim any
space and thus never get ->full, and you'll see stack traces like this:

  watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [kworker/u4:4:139]
  CPU: 0 PID: 139 Comm: kworker/u4:4 Tainted: G        W         5.13.0-rc1+ #328
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
  Workqueue: events_unbound btrfs_async_reclaim_data_space
  RIP: 0010:btrfs_join_transaction+0x12/0x20
  RSP: 0018:ffffb2b780b77de0 EFLAGS: 00000246
  RAX: ffffb2b781863d58 RBX: 0000000000000000 RCX: 0000000000000000
  RDX: 0000000000000801 RSI: ffff987952b57400 RDI: ffff987940aa3000
  RBP: ffff987954d55000 R08: 0000000000000001 R09: ffff98795539e8f0
  R10: 000000000000000f R11: 000000000000000f R12: ffffffffffffffff
  R13: ffff987952b574c8 R14: ffff987952b57400 R15: 0000000000000008
  FS:  0000000000000000(0000) GS:ffff9879bbc00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f0703da4000 CR3: 0000000113398004 CR4: 0000000000370ef0
  Call Trace:
   flush_space+0x4a8/0x660
   btrfs_async_reclaim_data_space+0x55/0x130
   process_one_work+0x1e9/0x380
   worker_thread+0x53/0x3e0
   ? process_one_work+0x380/0x380
   kthread+0x118/0x140
   ? __kthread_bind_mask+0x60/0x60
   ret_from_fork+0x1f/0x30

Fix this by checking to see if we have a btrfs fs error in either of the
reclaim loops, and if so fail the tickets and bail.  In addition to
this, fix maybe_fail_all_tickets() to not try to grant tickets if we've
aborted, simply fail everything.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 84961539
...@@ -885,6 +885,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ...@@ -885,6 +885,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
{ {
struct reserve_ticket *ticket; struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id; u64 tickets_id = space_info->tickets_id;
const bool aborted = BTRFS_FS_ERROR(fs_info);
trace_btrfs_fail_all_tickets(fs_info, space_info); trace_btrfs_fail_all_tickets(fs_info, space_info);
...@@ -898,16 +899,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ...@@ -898,16 +899,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
ticket = list_first_entry(&space_info->tickets, ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list); struct reserve_ticket, list);
if (ticket->steal && if (!aborted && ticket->steal &&
steal_from_global_rsv(fs_info, space_info, ticket)) steal_from_global_rsv(fs_info, space_info, ticket))
return true; return true;
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes", btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes); ticket->bytes);
remove_ticket(space_info, ticket); remove_ticket(space_info, ticket);
ticket->error = -ENOSPC; if (aborted)
ticket->error = -EIO;
else
ticket->error = -ENOSPC;
wake_up(&ticket->wait); wake_up(&ticket->wait);
/* /*
...@@ -916,7 +920,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ...@@ -916,7 +920,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
* here to see if we can make progress with the next ticket in * here to see if we can make progress with the next ticket in
* the list. * the list.
*/ */
btrfs_try_granting_tickets(fs_info, space_info); if (!aborted)
btrfs_try_granting_tickets(fs_info, space_info);
} }
return (tickets_id != space_info->tickets_id); return (tickets_id != space_info->tickets_id);
} }
...@@ -1172,6 +1177,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) ...@@ -1172,6 +1177,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
spin_unlock(&space_info->lock); spin_unlock(&space_info->lock);
return; return;
} }
/* Something happened, fail everything and bail. */
if (BTRFS_FS_ERROR(fs_info))
goto aborted_fs;
last_tickets_id = space_info->tickets_id; last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock); spin_unlock(&space_info->lock);
} }
...@@ -1202,9 +1211,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) ...@@ -1202,9 +1211,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
} else { } else {
flush_state = 0; flush_state = 0;
} }
/* Something happened, fail everything and bail. */
if (BTRFS_FS_ERROR(fs_info))
goto aborted_fs;
} }
spin_unlock(&space_info->lock); spin_unlock(&space_info->lock);
} }
return;
aborted_fs:
maybe_fail_all_tickets(fs_info, space_info);
space_info->flush = 0;
spin_unlock(&space_info->lock);
} }
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment