From 203d27b0226a05202438ddb39ef0ef1acb14a759 Mon Sep 17 00:00:00 2001
From: Jes Sorensen
Date: Tue, 20 Oct 2015 12:09:12 -0400
Subject: [PATCH 1/5] md/raid1: submit_bio_wait() returns 0 on success

This was introduced with commit 9e882242c6193ae6f416f2d8d8db0d9126bd996b,
which changed the return value of submit_bio_wait() to return != 0 on
error, but didn't update the caller accordingly.

Fixes: 9e882242c6 ("block: Add submit_bio_wait(), remove from md")
Cc: stable@vger.kernel.org (v3.10)
Reported-by: Bill Kuzeja
Signed-off-by: Jes Sorensen
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ddd8a5f572aa..cfca6edf7813 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2195,7 +2195,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 		bio_trim(wbio, sector - r1_bio->sector, sectors);
 		wbio->bi_iter.bi_sector += rdev->data_offset;
 		wbio->bi_bdev = rdev->bdev;
-		if (submit_bio_wait(WRITE, wbio) == 0)
+		if (submit_bio_wait(WRITE, wbio) < 0)
 			/* failure! */
 			ok = rdev_set_badblocks(rdev, sector,
 						sectors, 0)

From 681ab4696062f5aa939c9e04d058732306a97176 Mon Sep 17 00:00:00 2001
From: Jes Sorensen
Date: Tue, 20 Oct 2015 12:09:13 -0400
Subject: [PATCH 2/5] md/raid10: submit_bio_wait() returns 0 on success

This was introduced with commit 9e882242c6193ae6f416f2d8d8db0d9126bd996b,
which changed the return value of submit_bio_wait() to return != 0 on
error, but didn't update the caller accordingly.

Fixes: 9e882242c6 ("block: Add submit_bio_wait(), remove from md")
Cc: stable@vger.kernel.org (v3.10)
Reported-by: Bill Kuzeja
Signed-off-by: Jes Sorensen
Signed-off-by: NeilBrown
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9f69dc526f8c..a9ecec4e9a13 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2467,7 +2467,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 				   choose_data_offset(r10_bio, rdev) +
 				   (sector - r10_bio->sector));
 		wbio->bi_bdev = rdev->bdev;
-		if (submit_bio_wait(WRITE, wbio) == 0)
+		if (submit_bio_wait(WRITE, wbio) < 0)
 			/* Failure! */
 			ok = rdev_set_badblocks(rdev, sector,
 						sectors, 0)
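For reference, the convention the two patches above depend on, as a
minimal sketch; this is not part of the series, and sync_write_sector()
is an invented helper name:

#include <linux/bio.h>
#include <linux/errno.h>
#include <linux/fs.h>		/* for WRITE */

/*
 * submit_bio_wait() (the two-argument form of this era) returns 0 on
 * success and a negative errno on failure, so failure is "< 0"; the
 * old "== 0" test had the sense of the check inverted.
 */
static int sync_write_sector(struct bio *wbio)
{
	if (submit_bio_wait(WRITE, wbio) < 0)
		return -EIO;	/* caller should record a bad block */
	return 0;
}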
From bd8688a199b864944bf62eebed0ca13b46249453 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Sat, 24 Oct 2015 16:02:16 +1100
Subject: [PATCH 3/5] md/raid1: don't clear bitmap bit when bad-block-list
 write fails.

When a write fails and a bad-block-list is present, we can update the
bad-block-list instead of writing the data.  If this succeeds then it
is OK to clear the relevant bitmap-bit as no further 'sync' of the
block is needed.

However if writing the bad-block-list fails then we need to treat the
write as failed and particularly must not clear the bitmap bit.
Otherwise the device can be re-added (after any hardware connection
issues are resolved) and, because the relevant bit in the bitmap is
clear, that block will not be resynced.  This leads to data corruption.

We already delay the final bio_endio() on the write until the
bad-block-list is written so that when the write returns: either the
data is safe, the bad-block record is safe, or the fact that the
device is faulty is safe.  However we *don't* delay the clearing of
the bitmap, so the bitmap bit can be recorded as cleared before we
know if the bad-block-list was written safely.

So: delay that until the write really is safe.  i.e. move the call to
close_write() until just before calling bio_endio(), and recheck the
'is array degraded' status before making that call.

This bug goes back to v3.1 when bad-block-lists were introduced,
though it only affects arrays created with mdadm-3.3 or later as only
those have bad-block lists.

Backports will require at least
Commit: 55ce74d4bfe1 ("md/raid1: ensure device failure recorded before write request returns.")
as well.  I'll send that to 'stable' separately.

Note that of the two tests of R1BIO_WriteError that this patch adds,
the first is certain to fail and the second is certain to succeed.
However doing it this way makes the patch more obviously correct.  I
will tidy the code up in a future merge window.

Reported-and-tested-by: Nate Dailey
Cc: Jes Sorensen
Fixes: cd5ff9a16f08 ("md/raid1: Handle write errors by updating badblock log.")
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cfca6edf7813..d9d031ede4bf 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2258,15 +2258,16 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 			rdev_dec_pending(conf->mirrors[m].rdev,
 					 conf->mddev);
 		}
-	if (test_bit(R1BIO_WriteError, &r1_bio->state))
-		close_write(r1_bio);
 	if (fail) {
 		spin_lock_irq(&conf->device_lock);
 		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
 		spin_unlock_irq(&conf->device_lock);
 		md_wakeup_thread(conf->mddev->thread);
-	} else
+	} else {
+		if (test_bit(R1BIO_WriteError, &r1_bio->state))
+			close_write(r1_bio);
 		raid_end_bio_io(r1_bio);
+	}
 }
 
 static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
@@ -2385,6 +2386,10 @@ static void raid1d(struct md_thread *thread)
 			r1_bio = list_first_entry(&tmp, struct r1bio,
 						  retry_list);
 			list_del(&r1_bio->retry_list);
+			if (mddev->degraded)
+				set_bit(R1BIO_Degraded, &r1_bio->state);
+			if (test_bit(R1BIO_WriteError, &r1_bio->state))
+				close_write(r1_bio);
 			raid_end_bio_io(r1_bio);
 		}
 	}
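The ordering the patch establishes in handle_write_finished() can be
sketched as follows.  This is a simplified rendering, not the literal
raid1.c code: the per-mirror loop is elided, finish_write() is an
invented name, and 'fail' stands for "a bad-block-list update is still
outstanding":

static void finish_write(struct r1conf *conf, struct r1bio *r1_bio,
			 bool fail)
{
	if (fail) {
		/*
		 * The bad-block-list update is not yet known to be safe:
		 * park the r1bio on bio_end_io_list and let raid1d()
		 * complete it after the metadata write has finished (or
		 * the device has been marked Faulty).
		 */
		spin_lock_irq(&conf->device_lock);
		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
		spin_unlock_irq(&conf->device_lock);
		md_wakeup_thread(conf->mddev->thread);
	} else {
		/* Safe now: clearing the bitmap bit cannot lose a resync. */
		if (test_bit(R1BIO_WriteError, &r1_bio->state))
			close_write(r1_bio);
		raid_end_bio_io(r1_bio);
	}
}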
From c340702ca26a628832fade4f133d8160a55c29cc Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Sat, 24 Oct 2015 16:23:48 +1100
Subject: [PATCH 4/5] md/raid10: don't clear bitmap bit when bad-block-list
 write fails.

When a write fails and a bad-block-list is present, we can update the
bad-block-list instead of writing the data.  If this succeeds then it
is OK to clear the relevant bitmap-bit as no further 'sync' of the
block is needed.

However if writing the bad-block-list fails then we need to treat the
write as failed and particularly must not clear the bitmap bit.
Otherwise the device can be re-added (after any hardware connection
issues are resolved) and, because the relevant bit in the bitmap is
clear, that block will not be resynced.  This leads to data corruption.

We already delay the final bio_endio() on the write until the
bad-block-list is written so that when the write returns: either the
data is safe, the bad-block record is safe, or the fact that the
device is faulty is safe.  However we *don't* delay the clearing of
the bitmap, so the bitmap bit can be recorded as cleared before we
know if the bad-block-list was written safely.

So: delay that until the write really is safe.  i.e. move the call to
close_write() until just before calling bio_endio(), and recheck the
'is array degraded' status before making that call.

This bug goes back to v3.1 when bad-block-lists were introduced,
though it only affects arrays created with mdadm-3.3 or later as only
those have bad-block lists.

Backports will require at least
Commit: 95af587e95aa ("md/raid10: ensure device failure recorded before write request returns.")
as well.  I'll send that to 'stable' separately.

Note that of the two tests of R10BIO_WriteError that this patch adds,
the first is certain to fail and the second is certain to succeed.
However doing it this way makes the patch more obviously correct.  I
will tidy the code up in a future merge window.

Reported-by: Nate Dailey
Fixes: bd870a16c594 ("md/raid10: Handle write errors by updating badblock log.")
Signed-off-by: NeilBrown
---
 drivers/md/raid10.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a9ecec4e9a13..23de2144ee13 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2654,16 +2654,17 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 		}
-		if (test_bit(R10BIO_WriteError,
-			     &r10_bio->state))
-			close_write(r10_bio);
 		if (fail) {
 			spin_lock_irq(&conf->device_lock);
 			list_add(&r10_bio->retry_list,
 				 &conf->bio_end_io_list);
 			spin_unlock_irq(&conf->device_lock);
 			md_wakeup_thread(conf->mddev->thread);
-		} else
+		} else {
+			if (test_bit(R10BIO_WriteError,
+				     &r10_bio->state))
+				close_write(r10_bio);
 			raid_end_bio_io(r10_bio);
+		}
 	}
 }
@@ -2691,6 +2692,12 @@ static void raid10d(struct md_thread *thread)
 			r10_bio = list_first_entry(&tmp, struct r10bio,
 						   retry_list);
 			list_del(&r10_bio->retry_list);
+			if (mddev->degraded)
+				set_bit(R10BIO_Degraded, &r10_bio->state);
+
+			if (test_bit(R10BIO_WriteError,
+				     &r10_bio->state))
+				close_write(r10_bio);
 			raid_end_bio_io(r10_bio);
 		}
 	}
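On the drain side, the logic the raid10 patch adds to raid10d() amounts
to the following annotated fragment; it assumes, as in the patch, that
'tmp' has been spliced from conf->bio_end_io_list under
conf->device_lock and that 'mddev' is conf->mddev:

while (!list_empty(&tmp)) {
	struct r10bio *r10_bio = list_first_entry(&tmp, struct r10bio,
						  retry_list);
	list_del(&r10_bio->retry_list);

	/* The array may have degraded while the r10bio was parked;
	 * record that before completing so the bitmap bit survives. */
	if (mddev->degraded)
		set_bit(R10BIO_Degraded, &r10_bio->state);

	/* Only now, with the bad-block-list write resolved, is it
	 * safe to call close_write() and clear the bitmap bit. */
	if (test_bit(R10BIO_WriteError, &r10_bio->state))
		close_write(r10_bio);
	raid_end_bio_io(r10_bio);
}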
From 8bce6d35b308d73cdb2ee273c95d711a55be688c Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 22 Oct 2015 13:20:15 +1100
Subject: [PATCH 5/5] md/raid10: fix the 'new' raid10 layout to work
 correctly.

In Linux 3.9 we introduced a new 'far' layout for RAID10 which was
supposed to rotate the replicas differently and so provide better
resilience.  In particular it could survive more combinations of 2
drive failures.

Unfortunately, due to a coding error, this sometimes did what was
wanted, sometimes improved resilience less than we hoped, and
sometimes - in very unlikely circumstances - put multiple replicas on
the same device, so the redundancy was harmed.

No public user-space tool has created arrays using this layout, so it
is very unlikely that zero-redundancy arrays actually exist.  Probably
no arrays using any form of the new layout exist.  But we cannot be
certain.

So use another bit in the 'layout' number and introduce a bug-fixed
version of the layout.  Also, when assembling an array, if it has a
zero-redundancy layout, give a warning.

Reported-by: Heinz Mauelshagen
Signed-off-by: NeilBrown
---
 drivers/md/raid10.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 23de2144ee13..96f365968306 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -39,6 +39,7 @@
  *    far_copies (stored in second byte of layout)
  *    far_offset (stored in bit 16 of layout )
  *    use_far_sets (stored in bit 17 of layout )
+ *    use_far_sets_bugfixed (stored in bit 18 of layout )
  *
  * The data to be stored is divided into chunks using chunksize.  Each device
  * is divided into far_copies sections.  In each section, chunks are laid out
@@ -1497,6 +1498,8 @@ static void status(struct seq_file *seq, struct mddev *mddev)
 			seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
 		else
 			seq_printf(seq, " %d far-copies", conf->geo.far_copies);
+		if (conf->geo.far_set_size != conf->geo.raid_disks)
+			seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
 	}
 	seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
 					conf->geo.raid_disks - mddev->degraded);
@@ -3394,7 +3397,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 		disks = mddev->raid_disks + mddev->delta_disks;
 		break;
 	}
-	if (layout >> 18)
+	if (layout >> 19)
 		return -1;
 	if (chunk < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(chunk))
@@ -3406,7 +3409,22 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 	geo->near_copies = nc;
 	geo->far_copies = fc;
 	geo->far_offset = fo;
-	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
+	switch (layout >> 17) {
+	case 0:	/* original layout.  simple but not always optimal */
+		geo->far_set_size = disks;
+		break;
+	case 1: /* "improved" layout which was buggy.  Hopefully no-one is
+		 * actually using this, but leave code here just in case.*/
+		geo->far_set_size = disks/fc;
+		WARN(geo->far_set_size < fc,
+		     "This RAID10 layout does not provide data safety - please backup and create new array\n");
+		break;
+	case 2: /* "improved" layout fixed to match documentation */
+		geo->far_set_size = fc * nc;
+		break;
+	default: /* Not a valid layout */
+		return -1;
+	}
 	geo->chunk_mask = chunk - 1;
 	geo->chunk_shift = ffz(~chunk);
 	return nc*fc;
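To make the three layout encodings concrete, a standalone userspace
illustration of the far_set_size values setup_geo() now computes,
assuming a hypothetical 6-device array with far_copies = 2 and
near_copies = 1:

#include <stdio.h>

int main(void)
{
	int disks = 6, fc = 2, nc = 1;	/* assumed geometry */

	/* layout bits 18:17 select the variant (see setup_geo() above) */
	printf("0 (original):       far_set_size = %d\n", disks);	/* 6 */
	printf("1 (buggy far_sets): far_set_size = %d\n", disks / fc);	/* 3 */
	printf("2 (fixed far_sets): far_set_size = %d\n", fc * nc);	/* 2 */
	return 0;
}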