Commit dc50fd2c authored by Neil Brown's avatar Neil Brown Committed by Linus Torvalds

[PATCH] md: Record location of incomplete resync at shutdown and restart from there.

Add a new field to the md superblock, in an unused area, to record how far
resync had progressed when the array was cleanly shut down while resync was
active.  Restart from this point.

The extra field is verified by having a second copy of the event counter.
If the second event counter is wrong, we ignore the extra field.

This patch is thanks to Angus Sawyer <angus.sawyer@dsl.pipex.com>.
parent 2aa80952
...@@ -578,10 +578,19 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -578,10 +578,19 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mddev->level = sb->level; mddev->level = sb->level;
mddev->layout = sb->layout; mddev->layout = sb->layout;
mddev->raid_disks = sb->raid_disks; mddev->raid_disks = sb->raid_disks;
mddev->state = sb->state;
mddev->size = sb->size; mddev->size = sb->size;
mddev->events = md_event(sb); mddev->events = md_event(sb);
if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;
else {
if (sb->events_hi == sb->cp_events_hi &&
sb->events_lo == sb->cp_events_lo) {
mddev->recovery_cp = sb->recovery_cp;
} else
mddev->recovery_cp = 0;
}
memcpy(mddev->uuid+0, &sb->set_uuid0, 4); memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
memcpy(mddev->uuid+4, &sb->set_uuid1, 4); memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
memcpy(mddev->uuid+8, &sb->set_uuid2, 4); memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
...@@ -657,10 +666,22 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -657,10 +666,22 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->md_minor = mddev->__minor; sb->md_minor = mddev->__minor;
sb->not_persistent = !mddev->persistent; sb->not_persistent = !mddev->persistent;
sb->utime = mddev->utime; sb->utime = mddev->utime;
sb->state = mddev->state; sb->state = 0;
sb->events_hi = (mddev->events>>32); sb->events_hi = (mddev->events>>32);
sb->events_lo = (u32)mddev->events; sb->events_lo = (u32)mddev->events;
if (mddev->in_sync)
{
sb->recovery_cp = mddev->recovery_cp;
sb->cp_events_hi = (mddev->events>>32);
sb->cp_events_lo = (u32)mddev->events;
if (mddev->recovery_cp == MaxSector) {
printk(KERN_INFO "md: marking sb clean...\n");
sb->state = (1<< MD_SB_CLEAN);
}
} else
sb->recovery_cp = 0;
sb->layout = mddev->layout; sb->layout = mddev->layout;
sb->chunk_size = mddev->chunk_size; sb->chunk_size = mddev->chunk_size;
...@@ -1198,7 +1219,7 @@ static int analyze_sbs(mddev_t * mddev) ...@@ -1198,7 +1219,7 @@ static int analyze_sbs(mddev_t * mddev)
goto abort; goto abort;
} }
if ((mddev->state != (1 << MD_SB_CLEAN)) && ((mddev->level == 1) || if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) ||
(mddev->level == 4) || (mddev->level == 5))) (mddev->level == 4) || (mddev->level == 5)))
printk(NOT_CLEAN_IGNORE, mdidx(mddev)); printk(NOT_CLEAN_IGNORE, mdidx(mddev));
...@@ -1469,13 +1490,11 @@ static int do_md_run(mddev_t * mddev) ...@@ -1469,13 +1490,11 @@ static int do_md_run(mddev_t * mddev)
mddev->pers = NULL; mddev->pers = NULL;
return -EINVAL; return -EINVAL;
} }
mddev->in_sync = (mddev->state & (1<<MD_SB_CLEAN));
/* if personality doesn't have "sync_request", then
* a dirty array doesn't mean anything
*/
if (mddev->pers->sync_request) if (mddev->pers->sync_request)
mddev->state &= ~(1 << MD_SB_CLEAN); mddev->in_sync = 0;
else
mddev->in_sync = 1;
md_update_sb(mddev); md_update_sb(mddev);
md_recover_arrays(); md_recover_arrays();
set_capacity(disk, md_size[mdidx(mddev)]<<1); set_capacity(disk, md_size[mdidx(mddev)]<<1);
...@@ -1502,6 +1521,8 @@ static int restart_array(mddev_t *mddev) ...@@ -1502,6 +1521,8 @@ static int restart_array(mddev_t *mddev)
if (!mddev->ro) if (!mddev->ro)
goto out; goto out;
mddev->in_sync = 0;
md_update_sb(mddev);
mddev->ro = 0; mddev->ro = 0;
set_disk_ro(disk, 0); set_disk_ro(disk, 0);
...@@ -1541,7 +1562,7 @@ static int do_md_stop(mddev_t * mddev, int ro) ...@@ -1541,7 +1562,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
if (mddev->pers) { if (mddev->pers) {
if (mddev->sync_thread) { if (mddev->sync_thread) {
if (mddev->recovery_running > 0) if (mddev->recovery_running > 0)
mddev->recovery_running = -EINTR; mddev->recovery_running = -1;
md_unregister_thread(mddev->sync_thread); md_unregister_thread(mddev->sync_thread);
mddev->sync_thread = NULL; mddev->sync_thread = NULL;
} }
...@@ -1567,14 +1588,8 @@ static int do_md_stop(mddev_t * mddev, int ro) ...@@ -1567,14 +1588,8 @@ static int do_md_stop(mddev_t * mddev, int ro)
mddev->ro = 0; mddev->ro = 0;
} }
if (mddev->raid_disks) { if (mddev->raid_disks) {
/* /* mark array as shutdown cleanly */
* mark it clean only if there was no resync mddev->in_sync = 1;
* interrupted.
*/
if (mddev->in_sync) {
printk(KERN_INFO "md: marking sb clean...\n");
mddev->state |= 1 << MD_SB_CLEAN;
}
md_update_sb(mddev); md_update_sb(mddev);
} }
if (ro) if (ro)
...@@ -1840,7 +1855,9 @@ static int get_array_info(mddev_t * mddev, void * arg) ...@@ -1840,7 +1855,9 @@ static int get_array_info(mddev_t * mddev, void * arg)
info.not_persistent= !mddev->persistent; info.not_persistent= !mddev->persistent;
info.utime = mddev->utime; info.utime = mddev->utime;
info.state = mddev->state; info.state = 0;
if (mddev->recovery_cp == MaxSector)
info.state = (1<<MD_SB_CLEAN);
info.active_disks = active; info.active_disks = active;
info.working_disks = working; info.working_disks = working;
info.failed_disks = failed; info.failed_disks = failed;
...@@ -2111,7 +2128,10 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) ...@@ -2111,7 +2128,10 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
/* don't set __minor, it is determined by which /dev/md* was /* don't set __minor, it is determined by which /dev/md* was
* openned * openned
*/ */
mddev->state = info->state; if (info->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;
else
mddev->recovery_cp = 0;
mddev->persistent = ! info->not_persistent; mddev->persistent = ! info->not_persistent;
mddev->layout = info->layout; mddev->layout = info->layout;
...@@ -2770,7 +2790,8 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) ...@@ -2770,7 +2790,8 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
atomic_sub(blocks, &mddev->recovery_active); atomic_sub(blocks, &mddev->recovery_active);
wake_up(&mddev->recovery_wait); wake_up(&mddev->recovery_wait);
if (!ok) { if (!ok) {
mddev->recovery_running = -EIO; mddev->recovery_error = -EIO;
mddev->recovery_running = -1;
md_recover_arrays(); md_recover_arrays();
// stop recovery, signal do_sync .... // stop recovery, signal do_sync ....
} }
...@@ -2841,7 +2862,7 @@ static void md_do_sync(void *data) ...@@ -2841,7 +2862,7 @@ static void md_do_sync(void *data)
is_mddev_idle(mddev); /* this also initializes IO event counters */ is_mddev_idle(mddev); /* this also initializes IO event counters */
for (m = 0; m < SYNC_MARKS; m++) { for (m = 0; m < SYNC_MARKS; m++) {
mark[m] = jiffies; mark[m] = jiffies;
mark_cnt[m] = 0; mark_cnt[m] = mddev->recovery_cp;
} }
last_mark = 0; last_mark = 0;
mddev->resync_mark = mark[last_mark]; mddev->resync_mark = mark[last_mark];
...@@ -2857,7 +2878,13 @@ static void md_do_sync(void *data) ...@@ -2857,7 +2878,13 @@ static void md_do_sync(void *data)
atomic_set(&mddev->recovery_active, 0); atomic_set(&mddev->recovery_active, 0);
init_waitqueue_head(&mddev->recovery_wait); init_waitqueue_head(&mddev->recovery_wait);
last_check = 0; last_check = 0;
for (j = 0; j < max_sectors;) {
mddev->recovery_error = 0;
if (mddev->recovery_cp)
printk(KERN_INFO "md: resuming recovery of md%d from checkpoint.\n", mdidx(mddev));
for (j = mddev->recovery_cp; j < max_sectors;) {
int sectors; int sectors;
sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
...@@ -2927,16 +2954,25 @@ static void md_do_sync(void *data) ...@@ -2927,16 +2954,25 @@ static void md_do_sync(void *data)
*/ */
out: out:
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
if (mddev->recovery_running < 0 &&
!mddev->recovery_error && mddev->curr_resync > 2)
{
/* interrupted but no write errors */
printk(KERN_INFO "md: checkpointing recovery of md%d.\n", mdidx(mddev));
mddev->recovery_cp = mddev->curr_resync;
}
/* tell personality that we are finished */ /* tell personality that we are finished */
mddev->pers->sync_request(mddev, max_sectors, 1); mddev->pers->sync_request(mddev, max_sectors, 1);
skip: skip:
mddev->curr_resync = 0; mddev->curr_resync = 0;
if (err) if (err)
mddev->recovery_running = err; mddev->recovery_running = -1;
if (mddev->recovery_running > 0) if (mddev->recovery_running > 0)
mddev->recovery_running = 0; mddev->recovery_running = 0;
if (mddev->recovery_running == 0) if (mddev->recovery_running == 0)
mddev->in_sync = 1; mddev->recovery_cp = MaxSector;
md_recover_arrays(); md_recover_arrays();
} }
...@@ -3017,14 +3053,16 @@ void md_do_recovery(void *data) ...@@ -3017,14 +3053,16 @@ void md_do_recovery(void *data)
ITERATE_RDEV(mddev,rdev,rtmp) ITERATE_RDEV(mddev,rdev,rtmp)
if (rdev->raid_disk < 0 if (rdev->raid_disk < 0
&& !rdev->faulty) { && !rdev->faulty) {
if (mddev->pers->hot_add_disk(mddev,rdev)) if (mddev->pers->hot_add_disk(mddev,rdev)) {
mddev->spares++; mddev->spares++;
mddev->recovery_cp = 0;
}
else else
break; break;
} }
} }
if (!mddev->spares && mddev->in_sync) { if (!mddev->spares && (mddev->recovery_cp == MaxSector )) {
/* nothing we can do ... */ /* nothing we can do ... */
goto unlock; goto unlock;
} }
......
...@@ -975,7 +975,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) ...@@ -975,7 +975,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
sector_t max_sector, nr_sectors; sector_t max_sector, nr_sectors;
int disk, partial; int disk, partial;
if (sector_nr == 0) if (!conf->r1buf_pool)
if (init_resync(conf)) if (init_resync(conf))
return -ENOMEM; return -ENOMEM;
...@@ -1149,7 +1149,7 @@ static int run(mddev_t *mddev) ...@@ -1149,7 +1149,7 @@ static int run(mddev_t *mddev)
conf->mddev = mddev; conf->mddev = mddev;
conf->device_lock = SPIN_LOCK_UNLOCKED; conf->device_lock = SPIN_LOCK_UNLOCKED;
if (conf->working_disks == 1) if (conf->working_disks == 1)
mddev->state |= (1 << MD_SB_CLEAN); mddev->recovery_cp = MaxSector;
conf->resync_lock = SPIN_LOCK_UNLOCKED; conf->resync_lock = SPIN_LOCK_UNLOCKED;
init_waitqueue_head(&conf->wait_idle); init_waitqueue_head(&conf->wait_idle);
......
...@@ -1471,7 +1471,7 @@ static int run (mddev_t *mddev) ...@@ -1471,7 +1471,7 @@ static int run (mddev_t *mddev)
} }
if (mddev->degraded == 1 && if (mddev->degraded == 1 &&
!(mddev->state & (1<<MD_SB_CLEAN))) { mddev->recovery_cp != MaxSector) {
printk(KERN_ERR "raid5: cannot start dirty degraded array for md%d\n", mdidx(mddev)); printk(KERN_ERR "raid5: cannot start dirty degraded array for md%d\n", mdidx(mddev));
goto abort; goto abort;
} }
......
...@@ -28,6 +28,8 @@ ...@@ -28,6 +28,8 @@
#define LEVEL_MULTIPATH (-4) #define LEVEL_MULTIPATH (-4)
#define LEVEL_LINEAR (-1) #define LEVEL_LINEAR (-1)
#define MaxSector (~(sector_t)0)
static inline int pers_to_level (int pers) static inline int pers_to_level (int pers)
{ {
switch (pers) { switch (pers) {
...@@ -198,7 +200,6 @@ struct mddev_s ...@@ -198,7 +200,6 @@ struct mddev_s
int level, layout; int level, layout;
int raid_disks; int raid_disks;
int max_disks; int max_disks;
unsigned long state;
sector_t size; /* used size of component devices */ sector_t size; /* used size of component devices */
__u64 events; __u64 events;
...@@ -215,6 +216,7 @@ struct mddev_s ...@@ -215,6 +216,7 @@ struct mddev_s
* it can only be set > 0 under reconfig_sem * it can only be set > 0 under reconfig_sem
*/ */
int recovery_running; int recovery_running;
int recovery_error; /* error from recovery write */
int in_sync; /* know to not need resync */ int in_sync; /* know to not need resync */
struct semaphore reconfig_sem; struct semaphore reconfig_sem;
atomic_t active; atomic_t active;
...@@ -226,6 +228,7 @@ struct mddev_s ...@@ -226,6 +228,7 @@ struct mddev_s
atomic_t recovery_active; /* blocks scheduled, but not written */ atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait; wait_queue_head_t recovery_wait;
sector_t recovery_cp;
request_queue_t queue; /* for plugging ... */ request_queue_t queue; /* for plugging ... */
......
...@@ -131,11 +131,16 @@ typedef struct mdp_superblock_s { ...@@ -131,11 +131,16 @@ typedef struct mdp_superblock_s {
#ifdef __BIG_ENDIAN #ifdef __BIG_ENDIAN
__u32 events_hi; /* 7 high-order of superblock update count */ __u32 events_hi; /* 7 high-order of superblock update count */
__u32 events_lo; /* 8 low-order of superblock update count */ __u32 events_lo; /* 8 low-order of superblock update count */
__u32 cp_events_hi; /* 9 high-order of checkpoint update count */
__u32 cp_events_lo; /* 10 low-order of checkpoint update count */
#else #else
__u32 events_lo; /* 7 low-order of superblock update count */ __u32 events_lo; /* 7 low-order of superblock update count */
__u32 events_hi; /* 8 high-order of superblock update count */ __u32 events_hi; /* 8 high-order of superblock update count */
__u32 cp_events_lo; /* 9 low-order of checkpoint update count */
__u32 cp_events_hi; /* 10 high-order of checkpoint update count */
#endif #endif
__u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; __u32 recovery_cp; /* 11 recovery checkpoint sector count */
__u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 12];
/* /*
* Personality information * Personality information
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment