md-cluster: Use a small window for raid10 resync
Suspending the entire device for resync could take too long, so resync in small chunks instead. The cluster's resync window is maintained in r10conf as cluster_sync_low and cluster_sync_high, and is processed in raid10's sync_request(). If the current resync is outside the cluster resync window: 1. Set cluster_sync_low to curr_resync_completed. 2. Set cluster_sync_high to cluster_sync_low + stripe size. 3. Send a message to all nodes so they may add the range to their suspension list. Note: We only support "near" raid10 so far; resyncing a far or offset raid10 array could run into trouble. So raid10_run checks the layout of a clustered raid10 and refuses to run if the layout is not correct. With the "near" layout we process one stripe at a time, progressing monotonically through the address space. So we can have a sliding window of whole stripes which moves through the array, suspending IO on other nodes, and both resync (which uses array addresses) and recovery (which uses device addresses) can stay within this window. Signed-off-by: Guoqing Jiang <gqjiang@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
This commit is contained in:
parent
cb8a7a7e10
commit
8db87912c9
@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
|
||||
kfree(r10_bio);
|
||||
}
|
||||
|
||||
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)

/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
/*
 * Minimum size, in bytes, of the resync window used for a clustered
 * array: 32 * RESYNC_WINDOW = 32MiB.  raid10_set_cluster_sync_high()
 * documents this minimum as a 32M window aligned with raid1's resync
 * window, so the multiplier has to be 32 here (RESYNC_WINDOW is 1MiB).
 */
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
/* the same window expressed in 512-byte sectors */
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
|
||||
|
||||
/*
|
||||
* When performing a resync, we need to read and compare, so
|
||||
@ -2840,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
|
||||
return r10bio;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set cluster_sync_high since we need other nodes to add the
|
||||
* range [cluster_sync_low, cluster_sync_high] to suspend list.
|
||||
*/
|
||||
static void raid10_set_cluster_sync_high(struct r10conf *conf)
|
||||
{
|
||||
sector_t window_size;
|
||||
int extra_chunk, chunks;
|
||||
|
||||
/*
|
||||
* First, here we define "stripe" as a unit which across
|
||||
* all member devices one time, so we get chunks by use
|
||||
* raid_disks / near_copies. Otherwise, if near_copies is
|
||||
* close to raid_disks, then resync window could increases
|
||||
* linearly with the increase of raid_disks, which means
|
||||
* we will suspend a really large IO window while it is not
|
||||
* necessary. If raid_disks is not divisible by near_copies,
|
||||
* an extra chunk is needed to ensure the whole "stripe" is
|
||||
* covered.
|
||||
*/
|
||||
|
||||
chunks = conf->geo.raid_disks / conf->geo.near_copies;
|
||||
if (conf->geo.raid_disks % conf->geo.near_copies == 0)
|
||||
extra_chunk = 0;
|
||||
else
|
||||
extra_chunk = 1;
|
||||
window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
|
||||
|
||||
/*
|
||||
* At least use a 32M window to align with raid1's resync window
|
||||
*/
|
||||
window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
|
||||
CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
|
||||
|
||||
conf->cluster_sync_high = conf->cluster_sync_low + window_size;
|
||||
}
|
||||
|
||||
/*
|
||||
* perform a "sync" on one "block"
|
||||
*
|
||||
@ -2912,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
|
||||
max_sector = mddev->resync_max_sectors;
|
||||
if (sector_nr >= max_sector) {
|
||||
conf->cluster_sync_low = 0;
|
||||
conf->cluster_sync_high = 0;
|
||||
|
||||
/* If we aborted, we need to abort the
|
||||
* sync on the 'current' bitmap chunks (there can
|
||||
* be several when recovering multiple devices).
|
||||
@ -3266,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
/* resync. Schedule a read for every block at this virt offset */
|
||||
int count = 0;
|
||||
|
||||
bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
|
||||
/*
|
||||
* Since curr_resync_completed might not be updated in
|
||||
* time, and we will set cluster_sync_low based on it.
|
||||
* Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
|
||||
* safety reason, which ensures curr_resync_completed is
|
||||
* updated in bitmap_cond_end_sync.
|
||||
*/
|
||||
bitmap_cond_end_sync(mddev->bitmap, sector_nr,
|
||||
mddev_is_clustered(mddev) &&
|
||||
(sector_nr + 2 * RESYNC_SECTORS >
|
||||
conf->cluster_sync_high));
|
||||
|
||||
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
|
||||
&sync_blocks, mddev->degraded) &&
|
||||
@ -3400,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
} while (++page_idx < RESYNC_PAGES);
|
||||
r10_bio->sectors = nr_sectors;
|
||||
|
||||
if (mddev_is_clustered(mddev) &&
|
||||
test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
|
||||
/* It is resync not recovery */
|
||||
if (conf->cluster_sync_high < sector_nr + nr_sectors) {
|
||||
conf->cluster_sync_low = mddev->curr_resync_completed;
|
||||
raid10_set_cluster_sync_high(conf);
|
||||
/* Send resync message */
|
||||
md_cluster_ops->resync_info_update(mddev,
|
||||
conf->cluster_sync_low,
|
||||
conf->cluster_sync_high);
|
||||
}
|
||||
} else if (mddev_is_clustered(mddev)) {
|
||||
/* This is recovery not resync */
|
||||
sector_t sect_va1, sect_va2;
|
||||
bool broadcast_msg = false;
|
||||
|
||||
for (i = 0; i < conf->geo.raid_disks; i++) {
|
||||
/*
|
||||
* sector_nr is a device address for recovery, so we
|
||||
* need to translate it to an array address before comparing
|
||||
* with cluster_sync_high.
|
||||
*/
|
||||
sect_va1 = raid10_find_virt(conf, sector_nr, i);
|
||||
|
||||
if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
|
||||
broadcast_msg = true;
|
||||
/*
|
||||
* curr_resync_completed is similar to
|
||||
* sector_nr, so make the translation too.
|
||||
*/
|
||||
sect_va2 = raid10_find_virt(conf,
|
||||
mddev->curr_resync_completed, i);
|
||||
|
||||
if (conf->cluster_sync_low == 0 ||
|
||||
conf->cluster_sync_low > sect_va2)
|
||||
conf->cluster_sync_low = sect_va2;
|
||||
}
|
||||
}
|
||||
if (broadcast_msg) {
|
||||
raid10_set_cluster_sync_high(conf);
|
||||
md_cluster_ops->resync_info_update(mddev,
|
||||
conf->cluster_sync_low,
|
||||
conf->cluster_sync_high);
|
||||
}
|
||||
}
|
||||
|
||||
while (biolist) {
|
||||
bio = biolist;
|
||||
biolist = biolist->bi_next;
|
||||
@ -3659,6 +3758,18 @@ static int raid10_run(struct mddev *mddev)
|
||||
if (!conf)
|
||||
goto out;
|
||||
|
||||
if (mddev_is_clustered(conf->mddev)) {
|
||||
int fc, fo;
|
||||
|
||||
fc = (mddev->layout >> 8) & 255;
|
||||
fo = mddev->layout & (1<<16);
|
||||
if (fc > 1 || fo > 0) {
|
||||
pr_err("only near layout is supported by clustered"
|
||||
" raid10\n");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
mddev->thread = conf->thread;
|
||||
conf->thread = NULL;
|
||||
|
||||
|
@ -88,6 +88,12 @@ struct r10conf {
|
||||
* the new thread here until we fully activate the array.
|
||||
*/
|
||||
struct md_thread *thread;
|
||||
|
||||
/*
|
||||
* Keep track of cluster resync window to send to other nodes.
|
||||
*/
|
||||
sector_t cluster_sync_low;
|
||||
sector_t cluster_sync_high;
|
||||
};
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user