Commit 16d997b7 authored by NeilBrown's avatar NeilBrown Committed by Shaohua Li

md/raid5: simplify delaying of writes while metadata is updated.

If a device fails during a write, we must ensure the failure is
recorded in the metadata before the completion of the write is
acknowledged.

Commit c3cce6cd ("md/raid5: ensure device failure recorded before
write request returns.")  added code for this, but it was
unnecessarily complicated.  We already had similar functionality for
handling updates to the bad-block-list, thanks to Commit de393cde
("md: make it easier to wait for bad blocks to be acknowledged.")

So revert most of the former commit, and instead avoid collecting
completed writes if MD_CHANGE_PENDING is set.  raid5d() will then flush
the metadata and retry the stripe_head.
As this change can leave a stripe_head ready for handling immediately
after handle_active_stripes() returns, we change raid5_do_work() to
pause when MD_CHANGE_PENDING is set, so that it doesn't spin.

We check MD_CHANGE_PENDING *after* analyse_stripe() as it could be set
asynchronously.  After analyse_stripe(), we have collected stable data
about the state of devices, which will be used to make decisions.
Signed-off-by: default avatarNeilBrown <neilb@suse.com>
Signed-off-by: default avatarShaohua Li <shli@fb.com>
parent 49728050
...@@ -4691,7 +4691,8 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -4691,7 +4691,8 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish; goto finish;
if (s.handle_bad_blocks) { if (s.handle_bad_blocks ||
test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
goto finish; goto finish;
} }
...@@ -5021,15 +5022,8 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -5021,15 +5022,8 @@ static void handle_stripe(struct stripe_head *sh)
md_wakeup_thread(conf->mddev->thread); md_wakeup_thread(conf->mddev->thread);
} }
if (!bio_list_empty(&s.return_bi)) { if (!bio_list_empty(&s.return_bi))
if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->return_bi, &s.return_bi);
spin_unlock_irq(&conf->device_lock);
md_wakeup_thread(conf->mddev->thread);
} else
return_io(&s.return_bi); return_io(&s.return_bi);
}
clear_bit_unlock(STRIPE_ACTIVE, &sh->state); clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
} }
...@@ -6226,6 +6220,7 @@ static void raid5_do_work(struct work_struct *work) ...@@ -6226,6 +6220,7 @@ static void raid5_do_work(struct work_struct *work)
struct r5worker *worker = container_of(work, struct r5worker, work); struct r5worker *worker = container_of(work, struct r5worker, work);
struct r5worker_group *group = worker->group; struct r5worker_group *group = worker->group;
struct r5conf *conf = group->conf; struct r5conf *conf = group->conf;
struct mddev *mddev = conf->mddev;
int group_id = group - conf->worker_groups; int group_id = group - conf->worker_groups;
int handled; int handled;
struct blk_plug plug; struct blk_plug plug;
...@@ -6246,6 +6241,9 @@ static void raid5_do_work(struct work_struct *work) ...@@ -6246,6 +6241,9 @@ static void raid5_do_work(struct work_struct *work)
if (!batch_size && !released) if (!batch_size && !released)
break; break;
handled += batch_size; handled += batch_size;
wait_event_lock_irq(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
conf->device_lock);
} }
pr_debug("%d stripes handled\n", handled); pr_debug("%d stripes handled\n", handled);
...@@ -6273,18 +6271,6 @@ static void raid5d(struct md_thread *thread) ...@@ -6273,18 +6271,6 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev); md_check_recovery(mddev);
if (!bio_list_empty(&conf->return_bi) &&
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
struct bio_list tmp = BIO_EMPTY_LIST;
spin_lock_irq(&conf->device_lock);
if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
bio_list_merge(&tmp, &conf->return_bi);
bio_list_init(&conf->return_bi);
}
spin_unlock_irq(&conf->device_lock);
return_io(&tmp);
}
blk_start_plug(&plug); blk_start_plug(&plug);
handled = 0; handled = 0;
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
...@@ -6936,7 +6922,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) ...@@ -6936,7 +6922,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list); INIT_LIST_HEAD(&conf->bitmap_list);
bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes); init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0); atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0);
......
...@@ -638,9 +638,6 @@ struct r5conf { ...@@ -638,9 +638,6 @@ struct r5conf {
int skip_copy; /* Don't copy data from bio to stripe cache */ int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */ struct list_head *last_hold; /* detect hold_list promotions */
/* bios to have bi_end_io called after metadata is synced */
struct bio_list return_bi;
atomic_t reshape_stripes; /* stripes with pending writes for reshape */ atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have /* unfortunately we need two cache names as we temporarily have
* two caches. * two caches.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment