Commit c1dadcc1 authored by NeilBrown's avatar NeilBrown Committed by Greg Kroah-Hartman

md/raid5: fix interaction of 'replace' and 'recovery'.

commit f94c0b66 upstream.

If a device in a RAID4/5/6 is being replaced while another is being
recovered, then the writes to the replacement device currently don't
happen, resulting in corruption when the replacement completes and the
new drive takes over.

This is because the replacement writes are only triggered when
's.replacing' is set and not when the similar 's.sync' is set (which
is the case during resync and recovery - it means all devices need to
be read).

So schedule those writes when s.replacing is set as well.

In this case we cannot use "STRIPE_INSYNC" to record that the
replacement has happened as that is needed for recording that any
parity calculation is complete.  So introduce STRIPE_REPLACED to
record if the replacement has happened.

For safety we should also check that STRIPE_COMPUTE_RUN is not set.
This has a similar effect to the "s.locked == 0" test.  The latter
ensure that now IO has been flagged but not started.  The former
checks if any parity calculation has been flagged by not started.
We must wait for both of these to complete before triggering the
'replace'.

Add a similar test to the subsequent check for "are we finished yet".
This possibly isn't needed (is subsumed in the STRIPE_INSYNC test),
but it makes it more obvious that the REPLACE will happen before we
think we are finished.

Finally if a NeedReplace device is not UPTODATE then that is an
error.  We really must trigger a warning.

This bug was introduced in commit 9a3e1101
(md/raid5:  detect and handle replacements during recovery.)
which introduced replacement for raid5.
That was in 3.3-rc3, so any stable kernel since then would benefit
from this fix.
Reported-by: default avatarqindehua <13691222965@163.com>
Tested-by: default avatarqindehua <qindehua@163.com>
Signed-off-by: default avatarNeilBrown <neilb@suse.de>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 8afb90da
...@@ -3462,6 +3462,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -3462,6 +3462,7 @@ static void handle_stripe(struct stripe_head *sh)
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state); set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state);
clear_bit(STRIPE_REPLACED, &sh->state);
} }
spin_unlock(&sh->stripe_lock); spin_unlock(&sh->stripe_lock);
} }
...@@ -3607,19 +3608,23 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -3607,19 +3608,23 @@ static void handle_stripe(struct stripe_head *sh)
handle_parity_checks5(conf, sh, &s, disks); handle_parity_checks5(conf, sh, &s, disks);
} }
if (s.replacing && s.locked == 0 if ((s.replacing || s.syncing) && s.locked == 0
&& !test_bit(STRIPE_INSYNC, &sh->state)) { && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
&& !test_bit(STRIPE_REPLACED, &sh->state)) {
/* Write out to replacement devices where possible */ /* Write out to replacement devices where possible */
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->raid_disks; i++)
if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
test_bit(R5_NeedReplace, &sh->dev[i].flags)) { WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
set_bit(R5_WantReplace, &sh->dev[i].flags); set_bit(R5_WantReplace, &sh->dev[i].flags);
set_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(R5_LOCKED, &sh->dev[i].flags);
s.locked++; s.locked++;
} }
set_bit(STRIPE_INSYNC, &sh->state); if (s.replacing)
set_bit(STRIPE_INSYNC, &sh->state);
set_bit(STRIPE_REPLACED, &sh->state);
} }
if ((s.syncing || s.replacing) && s.locked == 0 && if ((s.syncing || s.replacing) && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_INSYNC, &sh->state)) { test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1); md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_SYNCING, &sh->state);
......
...@@ -306,6 +306,7 @@ enum { ...@@ -306,6 +306,7 @@ enum {
STRIPE_SYNC_REQUESTED, STRIPE_SYNC_REQUESTED,
STRIPE_SYNCING, STRIPE_SYNCING,
STRIPE_INSYNC, STRIPE_INSYNC,
STRIPE_REPLACED,
STRIPE_PREREAD_ACTIVE, STRIPE_PREREAD_ACTIVE,
STRIPE_DELAYED, STRIPE_DELAYED,
STRIPE_DEGRADED, STRIPE_DEGRADED,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment