62f7b1989c
Currently md raid0/linear are not provided with any mechanism to validate if an array member got removed or failed. The driver keeps sending BIOs regardless of the state of array members, and kernel shows state 'clean' in the 'array_state' sysfs attribute. This leads to the following situation: if a raid0/linear array member is removed and the array is mounted, some user writing to this array won't realize that errors are happening unless they check dmesg or perform one fsync per written file. Despite udev signaling the member device is gone, 'mdadm' cannot issue the STOP_ARRAY ioctl successfully, given the array is mounted. In other words, no -EIO is returned and writes (except direct ones) appear normal. Meaning the user might think the wrote data is correctly stored in the array, but instead garbage was written given that raid0 does stripping (and so, it requires all its members to be working in order to not corrupt data). For md/linear, writes to the available members will work fine, but if the writes go to the missing member(s), it'll cause a file corruption situation, whereas the portion of the writes to the missing devices aren't written effectively. This patch changes this behavior: we check if the block device's gendisk is UP when submitting the BIO to the array member, and if it isn't, we flag the md device as MD_BROKEN and fail subsequent I/Os to that device; a read request to the array requiring data from a valid member is still completed. While flagging the device as MD_BROKEN, we also show a rate-limited warning in the kernel log. A new array state 'broken' was added too: it mimics the state 'clean' in every aspect, being useful only to distinguish if the array has some member missing. We rely on the MD_BROKEN flag to put the array in the 'broken' state. This state cannot be written in 'array_state' as it just shows one or more members of the array are missing but acts like 'clean', it wouldn't make sense to write it. With this patch, the filesystem reacts much faster to the event of missing array member: after some I/O errors, ext4 for instance aborts the journal and prevents corruption. Without this change, we're able to keep writing in the disk and after a machine reboot, e2fsck shows some severe fs errors that demand fixing. This patch was tested in ext4 and xfs filesystems, and requires a 'mdadm' counterpart to handle the 'broken' state. Cc: Song Liu <songliubraving@fb.com> Reviewed-by: NeilBrown <neilb@suse.de> Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com> Signed-off-by: Song Liu <songliubraving@fb.com>
346 lines
8.7 KiB
C
346 lines
8.7 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
linear.c : Multiple Devices driver for Linux
|
|
Copyright (C) 1994-96 Marc ZYNGIER
|
|
<zyngier@ufr-info-p7.ibp.fr> or
|
|
<maz@gloups.fdn.fr>
|
|
|
|
Linear mode management functions.
|
|
|
|
*/
|
|
|
|
#include <linux/blkdev.h>
|
|
#include <linux/raid/md_u.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/module.h>
|
|
#include <linux/slab.h>
|
|
#include <trace/events/block.h>
|
|
#include "md.h"
|
|
#include "md-linear.h"
|
|
|
|
/*
|
|
* find which device holds a particular offset
|
|
*/
|
|
static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
|
|
{
|
|
int lo, mid, hi;
|
|
struct linear_conf *conf;
|
|
|
|
lo = 0;
|
|
hi = mddev->raid_disks - 1;
|
|
conf = mddev->private;
|
|
|
|
/*
|
|
* Binary Search
|
|
*/
|
|
|
|
while (hi > lo) {
|
|
|
|
mid = (hi + lo) / 2;
|
|
if (sector < conf->disks[mid].end_sector)
|
|
hi = mid;
|
|
else
|
|
lo = mid + 1;
|
|
}
|
|
|
|
return conf->disks + lo;
|
|
}
|
|
|
|
/*
|
|
* In linear_congested() conf->raid_disks is used as a copy of
|
|
* mddev->raid_disks to iterate conf->disks[], because conf->raid_disks
|
|
* and conf->disks[] are created in linear_conf(), they are always
|
|
* consitent with each other, but mddev->raid_disks does not.
|
|
*/
|
|
static int linear_congested(struct mddev *mddev, int bits)
|
|
{
|
|
struct linear_conf *conf;
|
|
int i, ret = 0;
|
|
|
|
rcu_read_lock();
|
|
conf = rcu_dereference(mddev->private);
|
|
|
|
for (i = 0; i < conf->raid_disks && !ret ; i++) {
|
|
struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
|
|
ret |= bdi_congested(q->backing_dev_info, bits);
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
|
|
static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
|
|
{
|
|
struct linear_conf *conf;
|
|
sector_t array_sectors;
|
|
|
|
conf = mddev->private;
|
|
WARN_ONCE(sectors || raid_disks,
|
|
"%s does not support generic reshape\n", __func__);
|
|
array_sectors = conf->array_sectors;
|
|
|
|
return array_sectors;
|
|
}
|
|
|
|
static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
|
|
{
|
|
struct linear_conf *conf;
|
|
struct md_rdev *rdev;
|
|
int i, cnt;
|
|
bool discard_supported = false;
|
|
|
|
conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
|
|
if (!conf)
|
|
return NULL;
|
|
|
|
cnt = 0;
|
|
conf->array_sectors = 0;
|
|
|
|
rdev_for_each(rdev, mddev) {
|
|
int j = rdev->raid_disk;
|
|
struct dev_info *disk = conf->disks + j;
|
|
sector_t sectors;
|
|
|
|
if (j < 0 || j >= raid_disks || disk->rdev) {
|
|
pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
|
|
mdname(mddev));
|
|
goto out;
|
|
}
|
|
|
|
disk->rdev = rdev;
|
|
if (mddev->chunk_sectors) {
|
|
sectors = rdev->sectors;
|
|
sector_div(sectors, mddev->chunk_sectors);
|
|
rdev->sectors = sectors * mddev->chunk_sectors;
|
|
}
|
|
|
|
disk_stack_limits(mddev->gendisk, rdev->bdev,
|
|
rdev->data_offset << 9);
|
|
|
|
conf->array_sectors += rdev->sectors;
|
|
cnt++;
|
|
|
|
if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
|
|
discard_supported = true;
|
|
}
|
|
if (cnt != raid_disks) {
|
|
pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
|
|
mdname(mddev));
|
|
goto out;
|
|
}
|
|
|
|
if (!discard_supported)
|
|
blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue);
|
|
else
|
|
blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
|
|
|
|
/*
|
|
* Here we calculate the device offsets.
|
|
*/
|
|
conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
|
|
|
|
for (i = 1; i < raid_disks; i++)
|
|
conf->disks[i].end_sector =
|
|
conf->disks[i-1].end_sector +
|
|
conf->disks[i].rdev->sectors;
|
|
|
|
/*
|
|
* conf->raid_disks is copy of mddev->raid_disks. The reason to
|
|
* keep a copy of mddev->raid_disks in struct linear_conf is,
|
|
* mddev->raid_disks may not be consistent with pointers number of
|
|
* conf->disks[] when it is updated in linear_add() and used to
|
|
* iterate old conf->disks[] earray in linear_congested().
|
|
* Here conf->raid_disks is always consitent with number of
|
|
* pointers in conf->disks[] array, and mddev->private is updated
|
|
* with rcu_assign_pointer() in linear_addr(), such race can be
|
|
* avoided.
|
|
*/
|
|
conf->raid_disks = raid_disks;
|
|
|
|
return conf;
|
|
|
|
out:
|
|
kfree(conf);
|
|
return NULL;
|
|
}
|
|
|
|
static int linear_run (struct mddev *mddev)
|
|
{
|
|
struct linear_conf *conf;
|
|
int ret;
|
|
|
|
if (md_check_no_bitmap(mddev))
|
|
return -EINVAL;
|
|
conf = linear_conf(mddev, mddev->raid_disks);
|
|
|
|
if (!conf)
|
|
return 1;
|
|
mddev->private = conf;
|
|
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
|
|
|
|
ret = md_integrity_register(mddev);
|
|
if (ret) {
|
|
kfree(conf);
|
|
mddev->private = NULL;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
|
|
{
|
|
/* Adding a drive to a linear array allows the array to grow.
|
|
* It is permitted if the new drive has a matching superblock
|
|
* already on it, with raid_disk equal to raid_disks.
|
|
* It is achieved by creating a new linear_private_data structure
|
|
* and swapping it in in-place of the current one.
|
|
* The current one is never freed until the array is stopped.
|
|
* This avoids races.
|
|
*/
|
|
struct linear_conf *newconf, *oldconf;
|
|
|
|
if (rdev->saved_raid_disk != mddev->raid_disks)
|
|
return -EINVAL;
|
|
|
|
rdev->raid_disk = rdev->saved_raid_disk;
|
|
rdev->saved_raid_disk = -1;
|
|
|
|
newconf = linear_conf(mddev,mddev->raid_disks+1);
|
|
|
|
if (!newconf)
|
|
return -ENOMEM;
|
|
|
|
/* newconf->raid_disks already keeps a copy of * the increased
|
|
* value of mddev->raid_disks, WARN_ONCE() is just used to make
|
|
* sure of this. It is possible that oldconf is still referenced
|
|
* in linear_congested(), therefore kfree_rcu() is used to free
|
|
* oldconf until no one uses it anymore.
|
|
*/
|
|
mddev_suspend(mddev);
|
|
oldconf = rcu_dereference_protected(mddev->private,
|
|
lockdep_is_held(&mddev->reconfig_mutex));
|
|
mddev->raid_disks++;
|
|
WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
|
|
"copied raid_disks doesn't match mddev->raid_disks");
|
|
rcu_assign_pointer(mddev->private, newconf);
|
|
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
|
|
set_capacity(mddev->gendisk, mddev->array_sectors);
|
|
mddev_resume(mddev);
|
|
revalidate_disk(mddev->gendisk);
|
|
kfree_rcu(oldconf, rcu);
|
|
return 0;
|
|
}
|
|
|
|
static void linear_free(struct mddev *mddev, void *priv)
|
|
{
|
|
struct linear_conf *conf = priv;
|
|
|
|
kfree(conf);
|
|
}
|
|
|
|
static bool linear_make_request(struct mddev *mddev, struct bio *bio)
|
|
{
|
|
char b[BDEVNAME_SIZE];
|
|
struct dev_info *tmp_dev;
|
|
sector_t start_sector, end_sector, data_offset;
|
|
sector_t bio_sector = bio->bi_iter.bi_sector;
|
|
|
|
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
|
|
md_flush_request(mddev, bio);
|
|
return true;
|
|
}
|
|
|
|
tmp_dev = which_dev(mddev, bio_sector);
|
|
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
|
|
end_sector = tmp_dev->end_sector;
|
|
data_offset = tmp_dev->rdev->data_offset;
|
|
|
|
if (unlikely(bio_sector >= end_sector ||
|
|
bio_sector < start_sector))
|
|
goto out_of_bounds;
|
|
|
|
if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) {
|
|
bio_io_error(bio);
|
|
return true;
|
|
}
|
|
|
|
if (unlikely(bio_end_sector(bio) > end_sector)) {
|
|
/* This bio crosses a device boundary, so we have to split it */
|
|
struct bio *split = bio_split(bio, end_sector - bio_sector,
|
|
GFP_NOIO, &mddev->bio_set);
|
|
bio_chain(split, bio);
|
|
generic_make_request(bio);
|
|
bio = split;
|
|
}
|
|
|
|
bio_set_dev(bio, tmp_dev->rdev->bdev);
|
|
bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
|
|
start_sector + data_offset;
|
|
|
|
if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
|
|
!blk_queue_discard(bio->bi_disk->queue))) {
|
|
/* Just ignore it */
|
|
bio_endio(bio);
|
|
} else {
|
|
if (mddev->gendisk)
|
|
trace_block_bio_remap(bio->bi_disk->queue,
|
|
bio, disk_devt(mddev->gendisk),
|
|
bio_sector);
|
|
mddev_check_writesame(mddev, bio);
|
|
mddev_check_write_zeroes(mddev, bio);
|
|
generic_make_request(bio);
|
|
}
|
|
return true;
|
|
|
|
out_of_bounds:
|
|
pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
|
|
mdname(mddev),
|
|
(unsigned long long)bio->bi_iter.bi_sector,
|
|
bdevname(tmp_dev->rdev->bdev, b),
|
|
(unsigned long long)tmp_dev->rdev->sectors,
|
|
(unsigned long long)start_sector);
|
|
bio_io_error(bio);
|
|
return true;
|
|
}
|
|
|
|
static void linear_status (struct seq_file *seq, struct mddev *mddev)
|
|
{
|
|
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
|
|
}
|
|
|
|
static void linear_quiesce(struct mddev *mddev, int state)
|
|
{
|
|
}
|
|
|
|
static struct md_personality linear_personality =
|
|
{
|
|
.name = "linear",
|
|
.level = LEVEL_LINEAR,
|
|
.owner = THIS_MODULE,
|
|
.make_request = linear_make_request,
|
|
.run = linear_run,
|
|
.free = linear_free,
|
|
.status = linear_status,
|
|
.hot_add_disk = linear_add,
|
|
.size = linear_size,
|
|
.quiesce = linear_quiesce,
|
|
.congested = linear_congested,
|
|
};
|
|
|
|
static int __init linear_init (void)
|
|
{
|
|
return register_md_personality (&linear_personality);
|
|
}
|
|
|
|
static void linear_exit (void)
|
|
{
|
|
unregister_md_personality (&linear_personality);
|
|
}
|
|
|
|
module_init(linear_init);
|
|
module_exit(linear_exit);
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_DESCRIPTION("Linear device concatenation personality for MD");
|
|
MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
|
|
MODULE_ALIAS("md-linear");
|
|
MODULE_ALIAS("md-level--1");
|