Commit 8db87912, authored by Guoqing Jiang, committed by Shaohua Li

md-cluster: Use a small window for raid10 resync

Suspending the entire device for resync could take
too long. Resync in small chunks.

cluster's resync window is maintained in r10conf as
cluster_sync_low and cluster_sync_high, and processed
in raid10's sync_request(). If the current resync is
outside the cluster resync window:

1. Set cluster_sync_low to curr_resync_completed.
2. Set cluster_sync_high to cluster_sync_low + window size,
   where the window covers one stripe but is never smaller
   than CLUSTER_RESYNC_WINDOW_SECTORS.
3. Send a message to all nodes so they can add the range to
   their suspension lists.

Note:
Only the "near" raid10 layout is supported so far; resyncing
a "far" or "offset" raid10 array could run into trouble. So
raid10_run checks the layout of a clustered raid10 array and
refuses to run if the layout is not "near".

With the "near" layout we process one stripe at a time,
progressing monotonically through the address space.
So we can have a sliding window of whole stripes that
moves through the array, suspending IO on other nodes.
Both resync (which uses array addresses) and recovery
(which uses device addresses) can stay within this window.
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
parent cb8a7a7e
...@@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data) ...@@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
kfree(r10_bio); kfree(r10_bio);
} }
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */ /* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024) #define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */ /* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
/* /*
* When performing a resync, we need to read and compare, so * When performing a resync, we need to read and compare, so
...@@ -2840,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) ...@@ -2840,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
return r10bio; return r10bio;
} }
/*
 * Derive cluster_sync_high from the current cluster_sync_low so that
 * the range [cluster_sync_low, cluster_sync_high] can be handed to the
 * other nodes for their suspend lists.
 *
 * Only conf->cluster_sync_high is written; cluster_sync_low must have
 * been set by the caller beforehand.
 */
static void raid10_set_cluster_sync_high(struct r10conf *conf)
{
	int stripe_chunks;
	sector_t span;

	/*
	 * A "stripe" here means one pass across all member devices, so
	 * it holds raid_disks / near_copies chunks. Dividing by
	 * near_copies keeps the window from growing linearly with
	 * raid_disks when near_copies is close to raid_disks, which
	 * would suspend far more IO than necessary. When raid_disks is
	 * not a multiple of near_copies, one more chunk is added so the
	 * whole stripe is still covered.
	 */
	stripe_chunks = conf->geo.raid_disks / conf->geo.near_copies;
	if (conf->geo.raid_disks % conf->geo.near_copies != 0)
		stripe_chunks++;

	span = stripe_chunks * conf->mddev->chunk_sectors;

	/*
	 * Never use a window smaller than CLUSTER_RESYNC_WINDOW_SECTORS.
	 * NOTE(review): the original comment described this floor as a
	 * 32M window matching raid1's resync window, but the macros in
	 * this file evaluate to 16 * 1024 * 1024 bytes - confirm which
	 * size is intended.
	 */
	if (span < CLUSTER_RESYNC_WINDOW_SECTORS)
		span = CLUSTER_RESYNC_WINDOW_SECTORS;

	conf->cluster_sync_high = conf->cluster_sync_low + span;
}
/* /*
* perform a "sync" on one "block" * perform a "sync" on one "block"
* *
...@@ -2912,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2912,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sector = mddev->resync_max_sectors; max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) { if (sector_nr >= max_sector) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;
/* If we aborted, we need to abort the /* If we aborted, we need to abort the
* sync on the 'current' bitmap chucks (there can * sync on the 'current' bitmap chucks (there can
* be several when recovering multiple devices). * be several when recovering multiple devices).
...@@ -3266,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -3266,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* resync. Schedule a read for every block at this virt offset */ /* resync. Schedule a read for every block at this virt offset */
int count = 0; int count = 0;
bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); /*
* Since curr_resync_completed could probably not update in
* time, and we will set cluster_sync_low based on it.
* Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
* safety reason, which ensures curr_resync_completed is
* updated in bitmap_cond_end_sync.
*/
bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS >
conf->cluster_sync_high));
if (!bitmap_start_sync(mddev->bitmap, sector_nr, if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) && &sync_blocks, mddev->degraded) &&
...@@ -3400,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -3400,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
} while (++page_idx < RESYNC_PAGES); } while (++page_idx < RESYNC_PAGES);
r10_bio->sectors = nr_sectors; r10_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* It is resync not recovery */
if (conf->cluster_sync_high < sector_nr + nr_sectors) {
conf->cluster_sync_low = mddev->curr_resync_completed;
raid10_set_cluster_sync_high(conf);
/* Send resync message */
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
} else if (mddev_is_clustered(mddev)) {
/* This is recovery not resync */
sector_t sect_va1, sect_va2;
bool broadcast_msg = false;
for (i = 0; i < conf->geo.raid_disks; i++) {
/*
* sector_nr is a device address for recovery, so we
* need translate it to array address before compare
* with cluster_sync_high.
*/
sect_va1 = raid10_find_virt(conf, sector_nr, i);
if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
broadcast_msg = true;
/*
* curr_resync_completed is similar as
* sector_nr, so make the translation too.
*/
sect_va2 = raid10_find_virt(conf,
mddev->curr_resync_completed, i);
if (conf->cluster_sync_low == 0 ||
conf->cluster_sync_low > sect_va2)
conf->cluster_sync_low = sect_va2;
}
}
if (broadcast_msg) {
raid10_set_cluster_sync_high(conf);
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
}
while (biolist) { while (biolist) {
bio = biolist; bio = biolist;
biolist = biolist->bi_next; biolist = biolist->bi_next;
...@@ -3659,6 +3758,18 @@ static int raid10_run(struct mddev *mddev) ...@@ -3659,6 +3758,18 @@ static int raid10_run(struct mddev *mddev)
if (!conf) if (!conf)
goto out; goto out;
if (mddev_is_clustered(conf->mddev)) {
int fc, fo;
fc = (mddev->layout >> 8) & 255;
fo = mddev->layout & (1<<16);
if (fc > 1 || fo > 0) {
pr_err("only near layout is supported by clustered"
" raid10\n");
goto out;
}
}
mddev->thread = conf->thread; mddev->thread = conf->thread;
conf->thread = NULL; conf->thread = NULL;
......
...@@ -88,6 +88,12 @@ struct r10conf { ...@@ -88,6 +88,12 @@ struct r10conf {
* the new thread here until we fully activate the array. * the new thread here until we fully activate the array.
*/ */
struct md_thread *thread; struct md_thread *thread;
/*
* Keep track of cluster resync window to send to other nodes.
*/
sector_t cluster_sync_low;
sector_t cluster_sync_high;
}; };
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment