From 8b48ec23cc51a4e7c8dbaef5f34ebe67e1a80934 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Sat, 13 Feb 2021 01:49:59 +0100 Subject: [PATCH 01/23] md: don't unregister sync_thread with reconfig_mutex held Unregister sync_thread doesn't need to hold reconfig_mutex since it doesn't reconfigure array. And it could cause deadlock problem for raid5 as follows: 1. process A tried to reap sync thread with reconfig_mutex held after echo idle to sync_action. 2. raid5 sync thread was blocked if there were too many active stripes. 3. SB_CHANGE_PENDING was set (because of write IO comes from upper layer) which causes the number of active stripes can't be decreased. 4. SB_CHANGE_PENDING can't be cleared since md_check_recovery was not able to hold reconfig_mutex. More details in the link: https://lore.kernel.org/linux-raid/5ed54ffc-ce82-bf66-4eff-390cb23bc1ac@molgen.mpg.de/T/#t And add one parameter to md_reap_sync_thread since it could be called by dm-raid which doesn't hold reconfig_mutex. Reported-and-tested-by: Donald Buczek Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/dm-raid.c | 2 +- drivers/md/md.c | 14 +++++++++----- drivers/md/md.h | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9526ccbedafb..5e41fbae3f6b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3725,7 +3725,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); + md_reap_sync_thread(mddev, false); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) return -EBUSY; diff --git a/drivers/md/md.c b/drivers/md/md.c index 707e802d0082..1958a14a0c79 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4844,7 +4844,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); + md_reap_sync_thread(mddev, true); } mddev_unlock(mddev); } @@ -6213,7 +6213,7 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); + md_reap_sync_thread(mddev, true); } del_timer_sync(&mddev->safemode_timer); @@ -9324,7 +9324,7 @@ void md_check_recovery(struct mddev *mddev) * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); + md_reap_sync_thread(mddev, true); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); @@ -9359,7 +9359,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { - md_reap_sync_thread(mddev); + md_reap_sync_thread(mddev, true); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid @@ -9432,14 +9432,18 @@ void md_check_recovery(struct mddev *mddev) } EXPORT_SYMBOL(md_check_recovery); -void md_reap_sync_thread(struct mddev *mddev) +void md_reap_sync_thread(struct mddev *mddev, bool reconfig_mutex_held) { struct md_rdev *rdev; sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; + if (reconfig_mutex_held) + mddev_unlock(mddev); /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); + if (reconfig_mutex_held) + mddev_lock_nointr(mddev); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { diff --git a/drivers/md/md.h b/drivers/md/md.h index cf2cbb17acbd..5f62c46ac2d3 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -719,7 +719,7 @@ extern struct md_thread *md_register_thread( extern void md_unregister_thread(struct md_thread **threadp); extern void md_wakeup_thread(struct md_thread *thread); extern void md_check_recovery(struct mddev *mddev); -extern void md_reap_sync_thread(struct mddev *mddev); +extern void md_reap_sync_thread(struct mddev *mddev, bool reconfig_mutex_held); extern int mddev_init_writes_pending(struct mddev *mddev); extern bool md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); From 1e267742283a4b5a8ca65755c44166be27e9aa0f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 29 Apr 2022 16:49:09 +0800 Subject: [PATCH 02/23] md: protect md_unregister_thread from reentrancy Generally, the md_unregister_thread is called with reconfig_mutex, but raid_message in dm-raid doesn't hold reconfig_mutex to unregister thread, so md_unregister_thread can be called simulitaneously from two call sites in theory. Then after previous commit which remove the protection of reconfig_mutex for md_unregister_thread completely, the potential issue could be worse than before. Let's take pers_lock at the beginning of function to ensure reentrancy. Reported-by: Donald Buczek Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1958a14a0c79..53787a32166d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7963,17 +7963,22 @@ EXPORT_SYMBOL(md_register_thread); void md_unregister_thread(struct md_thread **threadp) { - struct md_thread *thread = *threadp; - if (!thread) - return; - pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); - /* Locking ensures that mddev_unlock does not wake_up a + struct md_thread *thread; + + /* + * Locking ensures that mddev_unlock does not wake_up a * non-existent thread */ spin_lock(&pers_lock); + thread = *threadp; + if (!thread) { + spin_unlock(&pers_lock); + return; + } *threadp = NULL; spin_unlock(&pers_lock); + pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); kfree(thread); } From 913cce5a1e588e3470ea064fe4ea336037d3a454 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 May 2022 08:19:13 +0200 Subject: [PATCH 03/23] md: remove most calls to bdevname Use the %pg format specifier to save on stack consumption and code size. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md-linear.c | 5 +- drivers/md/md-multipath.c | 15 ++-- drivers/md/md.c | 152 ++++++++++++++++---------------------- drivers/md/raid0.c | 28 +++---- drivers/md/raid1.c | 24 +++--- drivers/md/raid10.c | 54 ++++++-------- drivers/md/raid5-cache.c | 5 +- drivers/md/raid5-ppl.c | 27 +++---- drivers/md/raid5.c | 37 ++++------ 9 files changed, 147 insertions(+), 200 deletions(-) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 138a3b25c5c8..6e7797b4e738 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -206,7 +206,6 @@ static void linear_free(struct mddev *mddev, void *priv) static bool linear_make_request(struct mddev *mddev, struct bio *bio) { - char b[BDEVNAME_SIZE]; struct dev_info *tmp_dev; sector_t start_sector, end_sector, data_offset; sector_t bio_sector = bio->bi_iter.bi_sector; @@ -256,10 +255,10 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) return true; out_of_bounds: - pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n", + pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n", mdname(mddev), (unsigned long long)bio->bi_iter.bi_sector, - bdevname(tmp_dev->rdev->bdev, b), + tmp_dev->rdev->bdev, (unsigned long long)tmp_dev->rdev->sectors, (unsigned long long)start_sector); bio_io_error(bio); diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 1c6dbf92c136..66edf5e72bd6 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -87,10 +87,9 @@ static void multipath_end_request(struct bio *bio) /* * oops, IO error: */ - char b[BDEVNAME_SIZE]; md_error (mp_bh->mddev, rdev); - pr_info("multipath: %s: rescheduling sector %llu\n", - bdevname(rdev->bdev,b), + pr_info("multipath: %pg: rescheduling sector %llu\n", + rdev->bdev, (unsigned long long)bio->bi_iter.bi_sector); multipath_reschedule_retry(mp_bh); } else @@ -154,7 +153,6 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev) static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) { struct mpconf *conf = mddev->private; - char b[BDEVNAME_SIZE]; if (conf->raid_disks - mddev->degraded <= 1) { /* @@ -177,9 +175,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) } set_bit(Faulty, &rdev->flags); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - pr_err("multipath: IO failure on %s, disabling IO path.\n" + pr_err("multipath: IO failure on %pg, disabling IO path.\n" "multipath: Operation continuing on %d IO paths.\n", - bdevname(rdev->bdev, b), + rdev->bdev, conf->raid_disks - mddev->degraded); } @@ -197,12 +195,11 @@ static void print_multipath_conf (struct mpconf *conf) conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; tmp = conf->multipaths + i; if (tmp->rdev) - pr_debug(" disk%d, o:%d, dev:%s\n", + pr_debug(" disk%d, o:%d, dev:%pg\n", i,!test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + tmp->rdev->bdev); } } diff --git a/drivers/md/md.c b/drivers/md/md.c index 53787a32166d..2303fbd1dcad 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1021,8 +1021,6 @@ EXPORT_SYMBOL_GPL(sync_page_io); static int read_disk_sb(struct md_rdev *rdev, int size) { - char b[BDEVNAME_SIZE]; - if (rdev->sb_loaded) return 0; @@ -1032,8 +1030,8 @@ static int read_disk_sb(struct md_rdev *rdev, int size) return 0; fail: - pr_err("md: disabled device %s, could not read superblock.\n", - bdevname(rdev->bdev,b)); + pr_err("md: disabled device %pg, could not read superblock.\n", + rdev->bdev); return -EINVAL; } @@ -1179,7 +1177,6 @@ EXPORT_SYMBOL(md_check_no_bitmap); */ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; int ret; bool spare_disk = true; @@ -1198,19 +1195,19 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor ret = -EINVAL; - bdevname(rdev->bdev, b); sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { - pr_warn("md: invalid raid superblock magic on %s\n", b); + pr_warn("md: invalid raid superblock magic on %pg\n", + rdev->bdev); goto abort; } if (sb->major_version != 0 || sb->minor_version < 90 || sb->minor_version > 91) { - pr_warn("Bad version number %d.%d on %s\n", - sb->major_version, sb->minor_version, b); + pr_warn("Bad version number %d.%d on %pg\n", + sb->major_version, sb->minor_version, rdev->bdev); goto abort; } @@ -1218,7 +1215,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor goto abort; if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { - pr_warn("md: invalid superblock checksum on %s\n", b); + pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); goto abort; } @@ -1250,13 +1247,13 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); if (!md_uuid_equal(refsb, sb)) { - pr_warn("md: %s has different UUID to %s\n", - b, bdevname(refdev->bdev,b2)); + pr_warn("md: %pg has different UUID to %pg\n", + rdev->bdev, refdev->bdev); goto abort; } if (!md_sb_equal(refsb, sb)) { - pr_warn("md: %s has same UUID but different superblock to %s\n", - b, bdevname(refdev->bdev, b2)); + pr_warn("md: %pg has same UUID but different superblock to %pg\n", + rdev->bdev, refdev->bdev); goto abort; } ev1 = md_event(sb); @@ -1620,7 +1617,6 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ int ret; sector_t sb_start; sector_t sectors; - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; bool spare_disk = true; @@ -1664,13 +1660,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { - pr_warn("md: invalid superblock checksum on %s\n", - bdevname(rdev->bdev,b)); + pr_warn("md: invalid superblock checksum on %pg\n", + rdev->bdev); return -EINVAL; } if (le64_to_cpu(sb->data_size) < 10) { - pr_warn("md: data_size too small on %s\n", - bdevname(rdev->bdev,b)); + pr_warn("md: data_size too small on %pg\n", + rdev->bdev); return -EINVAL; } if (sb->pad0 || @@ -1776,9 +1772,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sb->level != refsb->level || sb->layout != refsb->layout || sb->chunksize != refsb->chunksize) { - pr_warn("md: %s has strangely different superblock to %s\n", - bdevname(rdev->bdev,b), - bdevname(refdev->bdev,b2)); + pr_warn("md: %pg has strangely different superblock to %pg\n", + rdev->bdev, + refdev->bdev); return -EINVAL; } ev1 = le64_to_cpu(sb->events); @@ -2365,7 +2361,6 @@ EXPORT_SYMBOL(md_integrity_register); int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_mddev; - char name[BDEVNAME_SIZE]; if (!mddev->gendisk) return 0; @@ -2376,8 +2371,8 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) return 0; if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { - pr_err("%s: incompatible integrity profile for %s\n", - mdname(mddev), bdevname(rdev->bdev, name)); + pr_err("%s: incompatible integrity profile for %pg\n", + mdname(mddev), rdev->bdev); return -ENXIO; } @@ -2486,11 +2481,9 @@ static void rdev_delayed_delete(struct work_struct *ws) static void unbind_rdev_from_array(struct md_rdev *rdev) { - char b[BDEVNAME_SIZE]; - bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); - pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); + pr_debug("md: unbind<%pg>\n", rdev->bdev); mddev_destroy_serial_pool(rdev->mddev, rdev, false); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); @@ -2543,9 +2536,7 @@ void md_autodetect_dev(dev_t dev); static void export_rdev(struct md_rdev *rdev) { - char b[BDEVNAME_SIZE]; - - pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); + pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) @@ -2803,8 +2794,6 @@ repeat: rewrite: md_bitmap_update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { - char b[BDEVNAME_SIZE]; - if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ @@ -2812,8 +2801,8 @@ rewrite: md_super_write(mddev,rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); - pr_debug("md: (write) %s's sb offset: %llu\n", - bdevname(rdev->bdev, b), + pr_debug("md: (write) %pg's sb offset: %llu\n", + rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { @@ -2825,8 +2814,8 @@ rewrite: } } else - pr_debug("md: %s (skipping faulty)\n", - bdevname(rdev->bdev, b)); + pr_debug("md: %pg (skipping faulty)\n", + rdev->bdev); if (mddev->level == LEVEL_MULTIPATH) /* only need to write one superblock... */ @@ -3701,7 +3690,6 @@ EXPORT_SYMBOL_GPL(md_rdev_init); */ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { - char b[BDEVNAME_SIZE]; int err; struct md_rdev *rdev; sector_t size; @@ -3725,8 +3713,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; if (!size) { - pr_warn("md: %s has zero or unknown size, marking faulty!\n", - bdevname(rdev->bdev,b)); + pr_warn("md: %pg has zero or unknown size, marking faulty!\n", + rdev->bdev); err = -EINVAL; goto abort_free; } @@ -3735,14 +3723,14 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe err = super_types[super_format]. load_super(rdev, NULL, super_minor); if (err == -EINVAL) { - pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n", - bdevname(rdev->bdev,b), + pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", + rdev->bdev, super_format, super_minor); goto abort_free; } if (err < 0) { - pr_warn("md: could not read %s's sb, not importing!\n", - bdevname(rdev->bdev,b)); + pr_warn("md: could not read %pg's sb, not importing!\n", + rdev->bdev); goto abort_free; } } @@ -3765,7 +3753,6 @@ static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; - char b[BDEVNAME_SIZE]; freshest = NULL; rdev_for_each_safe(rdev, tmp, mddev) @@ -3777,8 +3764,8 @@ static int analyze_sbs(struct mddev *mddev) case 0: break; default: - pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n", - bdevname(rdev->bdev,b)); + pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", + rdev->bdev); md_kick_rdev_from_array(rdev); } @@ -3796,8 +3783,8 @@ static int analyze_sbs(struct mddev *mddev) if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { - pr_warn("md: %s: %s: only %d devices permitted\n", - mdname(mddev), bdevname(rdev->bdev, b), + pr_warn("md: %s: %pg: only %d devices permitted\n", + mdname(mddev), rdev->bdev, mddev->max_disks); md_kick_rdev_from_array(rdev); continue; @@ -3805,8 +3792,8 @@ static int analyze_sbs(struct mddev *mddev) if (rdev != freshest) { if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { - pr_warn("md: kicking non-fresh %s from array!\n", - bdevname(rdev->bdev,b)); + pr_warn("md: kicking non-fresh %pg from array!\n", + rdev->bdev); md_kick_rdev_from_array(rdev); continue; } @@ -5912,7 +5899,6 @@ int md_run(struct mddev *mddev) /* Warn if this is a potentially silly * configuration. */ - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; struct md_rdev *rdev2; int warned = 0; @@ -5921,10 +5907,10 @@ int md_run(struct mddev *mddev) if (rdev < rdev2 && rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { - pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n", + pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", mdname(mddev), - bdevname(rdev->bdev,b), - bdevname(rdev2->bdev,b2)); + rdev->bdev, + rdev2->bdev); warned = 1; } } @@ -6452,8 +6438,7 @@ static void autorun_array(struct mddev *mddev) pr_info("md: running: "); rdev_for_each(rdev, mddev) { - char b[BDEVNAME_SIZE]; - pr_cont("<%s>", bdevname(rdev->bdev,b)); + pr_cont("<%pg>", rdev->bdev); } pr_cont("\n"); @@ -6480,7 +6465,6 @@ static void autorun_devices(int part) { struct md_rdev *rdev0, *rdev, *tmp; struct mddev *mddev; - char b[BDEVNAME_SIZE]; pr_info("md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { @@ -6490,12 +6474,12 @@ static void autorun_devices(int part) rdev0 = list_entry(pending_raid_disks.next, struct md_rdev, same_set); - pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b)); + pr_debug("md: considering %pg ...\n", rdev0->bdev); INIT_LIST_HEAD(&candidates); rdev_for_each_list(rdev, tmp, &pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { - pr_debug("md: adding %s ...\n", - bdevname(rdev->bdev,b)); + pr_debug("md: adding %pg ...\n", + rdev->bdev); list_move(&rdev->same_set, &candidates); } /* @@ -6512,8 +6496,8 @@ static void autorun_devices(int part) unit = MINOR(dev); } if (rdev0->preferred_minor != unit) { - pr_warn("md: unit number in %s is bad: %d\n", - bdevname(rdev0->bdev, b), rdev0->preferred_minor); + pr_warn("md: unit number in %pg is bad: %d\n", + rdev0->bdev, rdev0->preferred_minor); break; } @@ -6526,8 +6510,8 @@ static void autorun_devices(int part) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { - pr_warn("md: %s already running, cannot run %s\n", - mdname(mddev), bdevname(rdev0->bdev,b)); + pr_warn("md: %s already running, cannot run %pg\n", + mdname(mddev), rdev0->bdev); mddev_unlock(mddev); } else { pr_debug("md: created %s\n", mdname(mddev)); @@ -6701,7 +6685,6 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) { - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; struct md_rdev *rdev; dev_t dev = MKDEV(info->major,info->minor); @@ -6731,9 +6714,9 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { - pr_warn("md: %s has different UUID to %s\n", - bdevname(rdev->bdev,b), - bdevname(rdev0->bdev,b2)); + pr_warn("md: %pg has different UUID to %pg\n", + rdev->bdev, + rdev0->bdev); export_rdev(rdev); return -EINVAL; } @@ -6908,7 +6891,6 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) static int hot_remove_disk(struct mddev *mddev, dev_t dev) { - char b[BDEVNAME_SIZE]; struct md_rdev *rdev; if (!mddev->pers) @@ -6943,14 +6925,13 @@ kick_rdev: return 0; busy: - pr_debug("md: cannot remove active disk %s from %s ...\n", - bdevname(rdev->bdev,b), mdname(mddev)); + pr_debug("md: cannot remove active disk %pg from %s ...\n", + rdev->bdev, mdname(mddev)); return -EBUSY; } static int hot_add_disk(struct mddev *mddev, dev_t dev) { - char b[BDEVNAME_SIZE]; int err; struct md_rdev *rdev; @@ -6983,8 +6964,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { - pr_warn("md: can not hot-add faulty %s disk to %s!\n", - bdevname(rdev->bdev,b), mdname(mddev)); + pr_warn("md: can not hot-add faulty %pg disk to %s!\n", + rdev->bdev, mdname(mddev)); err = -EINVAL; goto abort_export; } @@ -7011,8 +6992,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) * disable on the whole MD. */ if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) { - pr_info("%s: Disabling nowait because %s does not support nowait\n", - mdname(mddev), bdevname(rdev->bdev, b)); + pr_info("%s: Disabling nowait because %pg does not support nowait\n", + mdname(mddev), rdev->bdev); blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); } /* @@ -8017,10 +7998,8 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "unused devices: "); list_for_each_entry(rdev, &pending_raid_disks, same_set) { - char b[BDEVNAME_SIZE]; i++; - seq_printf(seq, "%s ", - bdevname(rdev->bdev,b)); + seq_printf(seq, "%pg ", rdev->bdev); } if (!i) seq_printf(seq, ""); @@ -8260,9 +8239,8 @@ static int md_seq_show(struct seq_file *seq, void *v) sectors = 0; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { - char b[BDEVNAME_SIZE]; - seq_printf(seq, " %s[%d]", - bdevname(rdev->bdev,b), rdev->desc_nr); + seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); + if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); if (test_bit(Journal, &rdev->flags)) @@ -9661,7 +9639,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) struct mdp_superblock_1 *sb = page_address(rdev->sb_page); struct md_rdev *rdev2, *tmp; int role, ret; - char b[BDEVNAME_SIZE]; /* * If size is changed in another node then we need to @@ -9685,7 +9662,8 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (test_bit(Candidate, &rdev2->flags)) { if (role == MD_DISK_ROLE_FAULTY) { - pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); + pr_info("md: Removing Candidate device %pg because add failed\n", + rdev2->bdev); md_kick_rdev_from_array(rdev2); continue; } @@ -9702,8 +9680,8 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) MD_FEATURE_RESHAPE_ACTIVE)) { rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); - pr_info("Activated spare: %s\n", - bdevname(rdev2->bdev,b)); + pr_info("Activated spare: %pg\n", + rdev2->bdev); /* wakeup mddev->thread here, so array could * perform resync with the new activated disk */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e11701e394ca..e3bc38b6c61f 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -37,7 +37,6 @@ static void dump_zones(struct mddev *mddev) int j, k; sector_t zone_size = 0; sector_t zone_start = 0; - char b[BDEVNAME_SIZE]; struct r0conf *conf = mddev->private; int raid_disks = conf->strip_zone[0].nb_dev; pr_debug("md: RAID0 configuration for %s - %d zone%s\n", @@ -48,9 +47,8 @@ static void dump_zones(struct mddev *mddev) int len = 0; for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - len += snprintf(line+len, 200-len, "%s%s", k?"/":"", - bdevname(conf->devlist[j*raid_disks - + k]->bdev, b)); + len += snprintf(line+len, 200-len, "%s%pg", k?"/":"", + conf->devlist[j * raid_disks + k]->bdev); pr_debug("md: zone%d=[%s]\n", j, line); zone_size = conf->strip_zone[j].zone_end - zone_start; @@ -69,8 +67,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev; struct strip_zone *zone; int cnt; - char b[BDEVNAME_SIZE]; - char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); unsigned blksize = 512; @@ -78,9 +74,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (!conf) return -ENOMEM; rdev_for_each(rdev1, mddev) { - pr_debug("md/raid0:%s: looking at %s\n", + pr_debug("md/raid0:%s: looking at %pg\n", mdname(mddev), - bdevname(rdev1->bdev, b)); + rdev1->bdev); c = 0; /* round size to chunk_size */ @@ -92,12 +88,12 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) rdev1->bdev->bd_disk->queue)); rdev_for_each(rdev2, mddev) { - pr_debug("md/raid0:%s: comparing %s(%llu)" - " with %s(%llu)\n", + pr_debug("md/raid0:%s: comparing %pg(%llu)" + " with %pg(%llu)\n", mdname(mddev), - bdevname(rdev1->bdev,b), + rdev1->bdev, (unsigned long long)rdev1->sectors, - bdevname(rdev2->bdev,b2), + rdev2->bdev, (unsigned long long)rdev2->sectors); if (rdev2 == rdev1) { pr_debug("md/raid0:%s: END\n", @@ -225,15 +221,15 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) for (j=0; jdevlist[j]; if (rdev->sectors <= zone->dev_start) { - pr_debug("md/raid0:%s: checking %s ... nope\n", + pr_debug("md/raid0:%s: checking %pg ... nope\n", mdname(mddev), - bdevname(rdev->bdev, b)); + rdev->bdev); continue; } - pr_debug("md/raid0:%s: checking %s ..." + pr_debug("md/raid0:%s: checking %pg ..." " contained as device %d\n", mdname(mddev), - bdevname(rdev->bdev, b), c); + rdev->bdev, c); dev[c] = rdev; c++; if (!smallest || rdev->sectors < smallest->sectors) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 99d5af1362d7..258d4eb2d63c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -402,10 +402,9 @@ static void raid1_end_read_request(struct bio *bio) /* * oops, read error: */ - char b[BDEVNAME_SIZE]; - pr_err_ratelimited("md/raid1:%s: %s: rescheduling sector %llu\n", + pr_err_ratelimited("md/raid1:%s: %pg: rescheduling sector %llu\n", mdname(conf->mddev), - bdevname(rdev->bdev, b), + rdev->bdev, (unsigned long long)r1_bio->sector); set_bit(R1BIO_ReadError, &r1_bio->state); reschedule_retry(r1_bio); @@ -1283,10 +1282,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, mirror = conf->mirrors + rdisk; if (r1bio_existed) - pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", + pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %pg\n", mdname(mddev), (unsigned long long)r1_bio->sector, - bdevname(mirror->rdev->bdev, b)); + mirror->rdev->bdev); if (test_bit(WriteMostly, &mirror->rdev->flags) && bitmap) { @@ -1659,7 +1658,6 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) */ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) { - char b[BDEVNAME_SIZE]; struct r1conf *conf = mddev->private; unsigned long flags; @@ -1686,9 +1684,9 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); - pr_crit("md/raid1:%s: Disk failure on %s, disabling device.\n" + pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n" "md/raid1:%s: Operation continuing on %d devices.\n", - mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), rdev->bdev, mdname(mddev), conf->raid_disks - mddev->degraded); } @@ -1706,13 +1704,12 @@ static void print_conf(struct r1conf *conf) rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev) - pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n", + pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), - bdevname(rdev->bdev,b)); + rdev->bdev); } rcu_read_unlock(); } @@ -2347,7 +2344,6 @@ static void fix_read_error(struct r1conf *conf, int read_disk, } d = start; while (d != read_disk) { - char b[BDEVNAME_SIZE]; if (d==0) d = conf->raid_disks * 2; d--; @@ -2360,11 +2356,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (r1_sync_page_io(rdev, sect, s, conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); - pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n", + pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n", mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), - bdevname(rdev->bdev, b)); + rdev->bdev); } rdev_dec_pending(rdev, mddev); } else diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index dfa576cdf11c..d589f823feb1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -397,10 +397,9 @@ static void raid10_end_read_request(struct bio *bio) /* * oops, read error - keep the refcount on the rdev */ - char b[BDEVNAME_SIZE]; - pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n", + pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n", mdname(conf->mddev), - bdevname(rdev->bdev, b), + rdev->bdev, (unsigned long long)r10_bio->sector); set_bit(R10BIO_ReadError, &r10_bio->state); reschedule_retry(r10_bio); @@ -1187,9 +1186,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, return; } if (err_rdev) - pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", + pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n", mdname(mddev), - bdevname(rdev->bdev, b), + rdev->bdev, (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, @@ -1987,7 +1986,6 @@ static int enough(struct r10conf *conf, int ignore) */ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) { - char b[BDEVNAME_SIZE]; struct r10conf *conf = mddev->private; unsigned long flags; @@ -2010,9 +2008,9 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); spin_unlock_irqrestore(&conf->device_lock, flags); - pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n" + pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n" "md/raid10:%s: Operation continuing on %d devices.\n", - mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), rdev->bdev, mdname(mddev), conf->geo.raid_disks - mddev->degraded); } @@ -2032,13 +2030,12 @@ static void print_conf(struct r10conf *conf) /* This is only called with ->reconfix_mutex held, so * rcu protection of rdev is not needed */ for (i = 0; i < conf->geo.raid_disks; i++) { - char b[BDEVNAME_SIZE]; rdev = conf->mirrors[i].rdev; if (rdev) - pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n", + pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), - bdevname(rdev->bdev,b)); + rdev->bdev); } } @@ -2691,14 +2688,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 check_decay_read_errors(mddev, rdev); atomic_inc(&rdev->read_errors); if (atomic_read(&rdev->read_errors) > max_read_errors) { - char b[BDEVNAME_SIZE]; - bdevname(rdev->bdev, b); - - pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n", - mdname(mddev), b, + pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", + mdname(mddev), rdev->bdev, atomic_read(&rdev->read_errors), max_read_errors); - pr_notice("md/raid10:%s: %s: Failing raid device\n", - mdname(mddev), b); + pr_notice("md/raid10:%s: %pg: Failing raid device\n", + mdname(mddev), rdev->bdev); md_error(mddev, rdev); r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; return; @@ -2768,8 +2762,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 /* write it back and re-read */ rcu_read_lock(); while (sl != r10_bio->read_slot) { - char b[BDEVNAME_SIZE]; - if (sl==0) sl = conf->copies; sl--; @@ -2788,24 +2780,22 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 s, conf->tmppage, WRITE) == 0) { /* Well, this device is dead */ - pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n", + pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n", mdname(mddev), s, (unsigned long long)( sect + choose_data_offset(r10_bio, rdev)), - bdevname(rdev->bdev, b)); - pr_notice("md/raid10:%s: %s: failing drive\n", + rdev->bdev); + pr_notice("md/raid10:%s: %pg: failing drive\n", mdname(mddev), - bdevname(rdev->bdev, b)); + rdev->bdev); } rdev_dec_pending(rdev, mddev); rcu_read_lock(); } sl = start; while (sl != r10_bio->read_slot) { - char b[BDEVNAME_SIZE]; - if (sl==0) sl = conf->copies; sl--; @@ -2825,23 +2815,23 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 READ)) { case 0: /* Well, this device is dead */ - pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n", + pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n", mdname(mddev), s, (unsigned long long)( sect + choose_data_offset(r10_bio, rdev)), - bdevname(rdev->bdev, b)); - pr_notice("md/raid10:%s: %s: failing drive\n", + rdev->bdev); + pr_notice("md/raid10:%s: %pg: failing drive\n", mdname(mddev), - bdevname(rdev->bdev, b)); + rdev->bdev); break; case 1: - pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n", + pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n", mdname(mddev), s, (unsigned long long)( sect + choose_data_offset(r10_bio, rdev)), - bdevname(rdev->bdev, b)); + rdev->bdev); atomic_add(s, &rdev->corrected_errors); } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 094a4042589e..83c184eddbda 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -3064,11 +3064,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log *log; - char b[BDEVNAME_SIZE]; int ret; - pr_debug("md/raid:%s: using device %s as journal\n", - mdname(conf->mddev), bdevname(rdev->bdev, b)); + pr_debug("md/raid:%s: using device %pg as journal\n", + mdname(conf->mddev), rdev->bdev); if (PAGE_SIZE != 4096) return -EINVAL; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 55d065a87b89..973e2e06f19c 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -798,7 +798,6 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, int data_disks; int i; int ret = 0; - char b[BDEVNAME_SIZE]; unsigned int pp_size = le32_to_cpu(e->pp_size); unsigned int data_size = le32_to_cpu(e->data_size); @@ -894,8 +893,8 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, break; } - pr_debug("%s:%*s reading data member disk %s sector %llu\n", - __func__, indent, "", bdevname(rdev->bdev, b), + pr_debug("%s:%*s reading data member disk %pg sector %llu\n", + __func__, indent, "", rdev->bdev, (unsigned long long)sector); if (!sync_page_io(rdev, sector, block_size, page2, REQ_OP_READ, 0, false)) { @@ -942,10 +941,10 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, conf->disks[sh.pd_idx].rdev, 1); BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); - pr_debug("%s:%*s write parity at sector %llu, disk %s\n", + pr_debug("%s:%*s write parity at sector %llu, disk %pg\n", __func__, indent, "", (unsigned long long)parity_sector, - bdevname(parity_rdev->bdev, b)); + parity_rdev->bdev); if (!sync_page_io(parity_rdev, parity_sector, block_size, page1, REQ_OP_WRITE, 0, false)) { pr_debug("%s:%*s parity write error!\n", __func__, @@ -1255,7 +1254,6 @@ void ppl_exit_log(struct r5conf *conf) static int ppl_validate_rdev(struct md_rdev *rdev) { - char b[BDEVNAME_SIZE]; int ppl_data_sectors; int ppl_size_new; @@ -1272,8 +1270,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev) RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private)); if (ppl_data_sectors <= 0) { - pr_warn("md/raid:%s: PPL space too small on %s\n", - mdname(rdev->mddev), bdevname(rdev->bdev, b)); + pr_warn("md/raid:%s: PPL space too small on %pg\n", + mdname(rdev->mddev), rdev->bdev); return -ENOSPC; } @@ -1283,16 +1281,16 @@ static int ppl_validate_rdev(struct md_rdev *rdev) rdev->ppl.sector + ppl_size_new > rdev->data_offset) || (rdev->ppl.sector >= rdev->data_offset && rdev->data_offset + rdev->sectors > rdev->ppl.sector)) { - pr_warn("md/raid:%s: PPL space overlaps with data on %s\n", - mdname(rdev->mddev), bdevname(rdev->bdev, b)); + pr_warn("md/raid:%s: PPL space overlaps with data on %pg\n", + mdname(rdev->mddev), rdev->bdev); return -EINVAL; } if (!rdev->mddev->external && ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) || (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) { - pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n", - mdname(rdev->mddev), bdevname(rdev->bdev, b)); + pr_warn("md/raid:%s: PPL space overlaps with superblock on %pg\n", + mdname(rdev->mddev), rdev->bdev); return -EINVAL; } @@ -1463,14 +1461,13 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) struct ppl_conf *ppl_conf = conf->log_private; struct ppl_log *log; int ret = 0; - char b[BDEVNAME_SIZE]; if (!rdev) return -EINVAL; - pr_debug("%s: disk: %d operation: %s dev: %s\n", + pr_debug("%s: disk: %d operation: %s dev: %pg\n", __func__, rdev->raid_disk, add ? "add" : "remove", - bdevname(rdev->bdev, b)); + rdev->bdev); if (rdev->raid_disk < 0) return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 39038fa8b1c8..5d09256d7f81 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2686,7 +2686,6 @@ static void raid5_end_read_request(struct bio * bi) struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; - char b[BDEVNAME_SIZE]; struct md_rdev *rdev = NULL; sector_t s; @@ -2723,10 +2722,10 @@ static void raid5_end_read_request(struct bio * bi) * any error */ pr_info_ratelimited( - "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", + "md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n", mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf), (unsigned long long)s, - bdevname(rdev->bdev, b)); + rdev->bdev); atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); @@ -2743,7 +2742,6 @@ static void raid5_end_read_request(struct bio * bi) if (atomic_read(&rdev->read_errors)) atomic_set(&rdev->read_errors, 0); } else { - const char *bdn = bdevname(rdev->bdev, b); int retry = 0; int set_bad = 0; @@ -2752,25 +2750,25 @@ static void raid5_end_read_request(struct bio * bi) atomic_inc(&rdev->read_errors); if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) pr_warn_ratelimited( - "md/raid:%s: read error on replacement device (sector %llu on %s).\n", + "md/raid:%s: read error on replacement device (sector %llu on %pg).\n", mdname(conf->mddev), (unsigned long long)s, - bdn); + rdev->bdev); else if (conf->mddev->degraded >= conf->max_degraded) { set_bad = 1; pr_warn_ratelimited( - "md/raid:%s: read error not correctable (sector %llu on %s).\n", + "md/raid:%s: read error not correctable (sector %llu on %pg).\n", mdname(conf->mddev), (unsigned long long)s, - bdn); + rdev->bdev); } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { /* Oh, no!!! */ set_bad = 1; pr_warn_ratelimited( - "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", + "md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n", mdname(conf->mddev), (unsigned long long)s, - bdn); + rdev->bdev); } else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) { if (!test_bit(Faulty, &rdev->flags)) { @@ -2778,8 +2776,8 @@ static void raid5_end_read_request(struct bio * bi) mdname(conf->mddev), atomic_read(&rdev->read_errors), conf->max_nr_stripes); - pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", - mdname(conf->mddev), bdn); + pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n", + mdname(conf->mddev), rdev->bdev); } } else retry = 1; @@ -2891,13 +2889,12 @@ static void raid5_end_write_request(struct bio *bi) static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) { - char b[BDEVNAME_SIZE]; struct r5conf *conf = mddev->private; unsigned long flags; pr_debug("raid456: error called\n"); - pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n", - mdname(mddev), bdevname(rdev->bdev, b)); + pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n", + mdname(mddev), rdev->bdev); spin_lock_irqsave(&conf->device_lock, flags); set_bit(Faulty, &rdev->flags); @@ -7359,9 +7356,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) } if (test_bit(In_sync, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - pr_info("md/raid:%s: device %s operational as raid disk %d\n", - mdname(mddev), bdevname(rdev->bdev, b), raid_disk); + pr_info("md/raid:%s: device %pg operational as raid disk %d\n", + mdname(mddev), rdev->bdev, raid_disk); } else if (rdev->saved_raid_disk != raid_disk) /* Cannot rely on bitmap to complete recovery */ conf->fullsync = 1; @@ -7877,12 +7873,11 @@ static void print_raid5_conf (struct r5conf *conf) rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; rdev = rcu_dereference(conf->disks[i].rdev); if (rdev) - pr_debug(" disk %d, o:%d, dev:%s\n", + pr_debug(" disk %d, o:%d, dev:%pg\n", i, !test_bit(Faulty, &rdev->flags), - bdevname(rdev->bdev, b)); + rdev->bdev); } rcu_read_unlock(); } From 0f2571ad7a30ff6b33cde142439f9378669f8b4f Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 12 May 2022 17:21:08 +0800 Subject: [PATCH 04/23] md: Don't set mddev private to NULL in raid0 pers->free In normal stop process, it does like this: do_md_stop | __md_stop (pers->free(); mddev->private=NULL) | md_free (free mddev) __md_stop sets mddev->private to NULL after pers->free. The raid device will be stopped and mddev memory is free. But in reshape, it doesn't free the mddev and mddev will still be used in new raid. In reshape, it first sets mddev->private to new_pers and then runs old_pers->free(). Now raid0 sets mddev->private to NULL in raid0_free. The new raid can't work anymore. It will panic when dereference mddev->private because of NULL pointer dereference. It can panic like this: [63010.814972] kernel BUG at drivers/md/raid10.c:928! [63010.819778] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [63010.825011] CPU: 3 PID: 44437 Comm: md0_resync Kdump: loaded Not tainted 5.14.0-86.el9.x86_64 #1 [63010.833789] Hardware name: Dell Inc. PowerEdge R6415/07YXFK, BIOS 1.15.0 09/11/2020 [63010.841440] RIP: 0010:raise_barrier+0x161/0x170 [raid10] [63010.865508] RSP: 0018:ffffc312408bbc10 EFLAGS: 00010246 [63010.870734] RAX: 0000000000000000 RBX: ffffa00bf7d39800 RCX: 0000000000000000 [63010.877866] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffa00bf7d39800 [63010.884999] RBP: 0000000000000000 R08: fffffa4945e74400 R09: 0000000000000000 [63010.892132] R10: ffffa00eed02f798 R11: 0000000000000000 R12: ffffa00bbc435200 [63010.899266] R13: ffffa00bf7d39800 R14: 0000000000000400 R15: 0000000000000003 [63010.906399] FS: 0000000000000000(0000) GS:ffffa00eed000000(0000) knlGS:0000000000000000 [63010.914485] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [63010.920229] CR2: 00007f5cfbe99828 CR3: 0000000105efe000 CR4: 00000000003506e0 [63010.927363] Call Trace: [63010.929822] ? bio_reset+0xe/0x40 [63010.933144] ? raid10_alloc_init_r10buf+0x60/0xa0 [raid10] [63010.938629] raid10_sync_request+0x756/0x1610 [raid10] [63010.943770] md_do_sync.cold+0x3e4/0x94c [63010.947698] md_thread+0xab/0x160 [63010.951024] ? md_write_inc+0x50/0x50 [63010.954688] kthread+0x149/0x170 [63010.957923] ? set_kthread_struct+0x40/0x40 [63010.962107] ret_from_fork+0x22/0x30 Removing the code that sets mddev->private to NULL in raid0 can fix problem. Fixes: 0c031fd37f69 (md: Move alloc/free acct bioset in to personality) Reported-by: Fine Fan Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/raid0.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e3bc38b6c61f..78addfe4a0c9 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -358,7 +358,6 @@ static void free_conf(struct mddev *mddev, struct r0conf *conf) kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); - mddev->private = NULL; } static void raid0_free(struct mddev *mddev, void *priv) From 42b805af102471f53e3c7867b8c2b502ea4eef7e Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 12 May 2022 17:21:09 +0800 Subject: [PATCH 05/23] md: fix double free of io_acct_set bioset Now io_acct_set is alloc and free in personality. Remove the codes that free io_acct_set in md_free and md_stop. Fixes: 0c031fd37f69 (md: Move alloc/free acct bioset in to personality) Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2303fbd1dcad..8273ac5eef06 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5585,8 +5585,6 @@ static void md_free(struct kobject *ko) bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); - if (mddev->level != 1 && mddev->level != 10) - bioset_exit(&mddev->io_acct_set); kfree(mddev); } @@ -6271,8 +6269,6 @@ void md_stop(struct mddev *mddev) __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); - if (mddev->level != 1 && mddev->level != 10) - bioset_exit(&mddev->io_acct_set); } EXPORT_SYMBOL_GPL(md_stop); From 622536443b6731ec82c563aae7807165adbe9178 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 24 May 2022 18:23:33 +0800 Subject: [PATCH 06/23] bcache: improve multithreaded bch_btree_check() Commit 8e7102273f59 ("bcache: make bch_btree_check() to be multithreaded") makes bch_btree_check() to be much faster when checking all btree nodes during cache device registration. But it isn't in ideal shap yet, still can be improved. This patch does the following thing to improve current parallel btree nodes check by multiple threads in bch_btree_check(), - Add read lock to root node while checking all the btree nodes with multiple threads. Although currently it is not mandatory but it is good to have a read lock in code logic. - Remove local variable 'char name[32]', and generate kernel thread name string directly when calling kthread_run(). - Allocate local variable "struct btree_check_state check_state" on the stack and avoid unnecessary dynamic memory allocation for it. - Reduce BCH_BTR_CHKTHREAD_MAX from 64 to 12 which is enough indeed. - Increase check_state->started to count created kernel thread after it succeeds to create. - When wait for all checking kernel threads to finish, use wait_event() to replace wait_event_interruptible(). With this change, the code is more clear, and some potential error conditions are avoided. Fixes: 8e7102273f59 ("bcache: make bch_btree_check() to be multithreaded") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220524102336.10684-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 58 ++++++++++++++++++--------------------- drivers/md/bcache/btree.h | 2 +- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ad9f16689419..2362bb8ef6d1 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2006,8 +2006,7 @@ int bch_btree_check(struct cache_set *c) int i; struct bkey *k = NULL; struct btree_iter iter; - struct btree_check_state *check_state; - char name[32]; + struct btree_check_state check_state; /* check and mark root node keys */ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) @@ -2018,63 +2017,58 @@ int bch_btree_check(struct cache_set *c) if (c->root->level == 0) return 0; - check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL); - if (!check_state) - return -ENOMEM; - - check_state->c = c; - check_state->total_threads = bch_btree_chkthread_nr(); - check_state->key_idx = 0; - spin_lock_init(&check_state->idx_lock); - atomic_set(&check_state->started, 0); - atomic_set(&check_state->enough, 0); - init_waitqueue_head(&check_state->wait); + check_state.c = c; + check_state.total_threads = bch_btree_chkthread_nr(); + check_state.key_idx = 0; + spin_lock_init(&check_state.idx_lock); + atomic_set(&check_state.started, 0); + atomic_set(&check_state.enough, 0); + init_waitqueue_head(&check_state.wait); + rw_lock(0, c->root, c->root->level); /* * Run multiple threads to check btree nodes in parallel, - * if check_state->enough is non-zero, it means current + * if check_state.enough is non-zero, it means current * running check threads are enough, unncessary to create * more. */ - for (i = 0; i < check_state->total_threads; i++) { - /* fetch latest check_state->enough earlier */ + for (i = 0; i < check_state.total_threads; i++) { + /* fetch latest check_state.enough earlier */ smp_mb__before_atomic(); - if (atomic_read(&check_state->enough)) + if (atomic_read(&check_state.enough)) break; - check_state->infos[i].result = 0; - check_state->infos[i].state = check_state; - snprintf(name, sizeof(name), "bch_btrchk[%u]", i); - atomic_inc(&check_state->started); + check_state.infos[i].result = 0; + check_state.infos[i].state = &check_state; - check_state->infos[i].thread = + check_state.infos[i].thread = kthread_run(bch_btree_check_thread, - &check_state->infos[i], - name); - if (IS_ERR(check_state->infos[i].thread)) { + &check_state.infos[i], + "bch_btrchk[%d]", i); + if (IS_ERR(check_state.infos[i].thread)) { pr_err("fails to run thread bch_btrchk[%d]\n", i); for (--i; i >= 0; i--) - kthread_stop(check_state->infos[i].thread); + kthread_stop(check_state.infos[i].thread); ret = -ENOMEM; goto out; } + atomic_inc(&check_state.started); } /* * Must wait for all threads to stop. */ - wait_event_interruptible(check_state->wait, - atomic_read(&check_state->started) == 0); + wait_event(check_state.wait, atomic_read(&check_state.started) == 0); - for (i = 0; i < check_state->total_threads; i++) { - if (check_state->infos[i].result) { - ret = check_state->infos[i].result; + for (i = 0; i < check_state.total_threads; i++) { + if (check_state.infos[i].result) { + ret = check_state.infos[i].result; goto out; } } out: - kfree(check_state); + rw_unlock(0, c->root); return ret; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 50482107134f..1b5fdbc0d83e 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -226,7 +226,7 @@ struct btree_check_info { int result; }; -#define BCH_BTR_CHKTHREAD_MAX 64 +#define BCH_BTR_CHKTHREAD_MAX 12 struct btree_check_state { struct cache_set *c; int total_threads; From 4dc34ae1b45fe26e772a44379f936c72623dd407 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 24 May 2022 18:23:34 +0800 Subject: [PATCH 07/23] bcache: improve multithreaded bch_sectors_dirty_init() Commit b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") makes bch_sectors_dirty_init() to be much faster when counting dirty sectors by iterating all dirty keys in the btree. But it isn't in ideal shape yet, still can be improved. This patch does the following changes to improve current parallel dirty keys iteration on the btree, - Add read lock to root node when multiple threads iterating the btree, to prevent the root node gets split by I/Os from other registered bcache devices. - Remove local variable "char name[32]" and generate kernel thread name string directly when calling kthread_run(). - Allocate "struct bch_dirty_init_state state" directly on stack and avoid the unnecessary dynamic memory allocation for it. - Decrease BCH_DIRTY_INIT_THRD_MAX from 64 to 12 which is enough indeed. - Increase &state->started to count created kernel thread after it succeeds to create. - When wait for all dirty key counting threads to finish, use wait_event() to replace wait_event_interruptible(). With the above changes, the code is more clear, and some potential error conditions are avoided. Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220524102336.10684-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 60 ++++++++++++++--------------------- drivers/md/bcache/writeback.h | 2 +- 2 files changed, 25 insertions(+), 37 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 9ee0005874cd..d24c09490f8e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -948,10 +948,10 @@ void bch_sectors_dirty_init(struct bcache_device *d) struct btree_iter iter; struct sectors_dirty_init op; struct cache_set *c = d->c; - struct bch_dirty_init_state *state; - char name[32]; + struct bch_dirty_init_state state; /* Just count root keys if no leaf node */ + rw_lock(0, c->root, c->root->level); if (c->root->level == 0) { bch_btree_op_init(&op.op, -1); op.inode = d->id; @@ -961,54 +961,42 @@ void bch_sectors_dirty_init(struct bcache_device *d) for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) sectors_dirty_init_fn(&op.op, c->root, k); + rw_unlock(0, c->root); return; } - state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL); - if (!state) { - pr_warn("sectors dirty init failed: cannot allocate memory\n"); - return; - } + state.c = c; + state.d = d; + state.total_threads = bch_btre_dirty_init_thread_nr(); + state.key_idx = 0; + spin_lock_init(&state.idx_lock); + atomic_set(&state.started, 0); + atomic_set(&state.enough, 0); + init_waitqueue_head(&state.wait); - state->c = c; - state->d = d; - state->total_threads = bch_btre_dirty_init_thread_nr(); - state->key_idx = 0; - spin_lock_init(&state->idx_lock); - atomic_set(&state->started, 0); - atomic_set(&state->enough, 0); - init_waitqueue_head(&state->wait); - - for (i = 0; i < state->total_threads; i++) { - /* Fetch latest state->enough earlier */ + for (i = 0; i < state.total_threads; i++) { + /* Fetch latest state.enough earlier */ smp_mb__before_atomic(); - if (atomic_read(&state->enough)) + if (atomic_read(&state.enough)) break; - state->infos[i].state = state; - atomic_inc(&state->started); - snprintf(name, sizeof(name), "bch_dirty_init[%d]", i); - - state->infos[i].thread = - kthread_run(bch_dirty_init_thread, - &state->infos[i], - name); - if (IS_ERR(state->infos[i].thread)) { + state.infos[i].state = &state; + state.infos[i].thread = + kthread_run(bch_dirty_init_thread, &state.infos[i], + "bch_dirtcnt[%d]", i); + if (IS_ERR(state.infos[i].thread)) { pr_err("fails to run thread bch_dirty_init[%d]\n", i); for (--i; i >= 0; i--) - kthread_stop(state->infos[i].thread); + kthread_stop(state.infos[i].thread); goto out; } + atomic_inc(&state.started); } - /* - * Must wait for all threads to stop. - */ - wait_event_interruptible(state->wait, - atomic_read(&state->started) == 0); - out: - kfree(state); + /* Must wait for all threads to stop. */ + wait_event(state.wait, atomic_read(&state.started) == 0); + rw_unlock(0, c->root); } void bch_cached_dev_writeback_init(struct cached_dev *dc) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 02b2f9df73f6..31df716951f6 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -20,7 +20,7 @@ #define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57 #define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64 -#define BCH_DIRTY_INIT_THRD_MAX 64 +#define BCH_DIRTY_INIT_THRD_MAX 12 /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up From 80db4e4707e78cb22287da7d058d7274bd4cb370 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 24 May 2022 18:23:35 +0800 Subject: [PATCH 08/23] bcache: remove incremental dirty sector counting for bch_sectors_dirty_init() After making bch_sectors_dirty_init() being multithreaded, the existing incremental dirty sector counting in bch_root_node_dirty_init() doesn't release btree occupation after iterating 500000 (INIT_KEYS_EACH_TIME) bkeys. Because a read lock is added on btree root node to prevent the btree to be split during the dirty sectors counting, other I/O requester has no chance to gain the write lock even restart bcache_btree(). That is to say, the incremental dirty sectors counting is incompatible to the multhreaded bch_sectors_dirty_init(). We have to choose one and drop another one. In my testing, with 512 bytes random writes, I generate 1.2T dirty data and a btree with 400K nodes. With single thread and incremental dirty sectors counting, it takes 30+ minites to register the backing device. And with multithreaded dirty sectors counting, the backing device registration can be accomplished within 2 minutes. The 30+ minutes V.S. 2- minutes difference makes me decide to keep multithreaded bch_sectors_dirty_init() and drop the incremental dirty sectors counting. This is what this patch does. But INIT_KEYS_EACH_TIME is kept, in sectors_dirty_init_fn() the CPU will be released by cond_resched() after every INIT_KEYS_EACH_TIME keys iterated. This is to avoid the watchdog reports a bogus soft lockup warning. Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220524102336.10684-4-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 39 +++++++++++------------------------ 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d24c09490f8e..75b71199800d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -805,13 +805,11 @@ static int bch_writeback_thread(void *arg) /* Init */ #define INIT_KEYS_EACH_TIME 500000 -#define INIT_KEYS_SLEEP_MS 100 struct sectors_dirty_init { struct btree_op op; unsigned int inode; size_t count; - struct bkey start; }; static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, @@ -827,11 +825,8 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, KEY_START(k), KEY_SIZE(k)); op->count++; - if (atomic_read(&b->c->search_inflight) && - !(op->count % INIT_KEYS_EACH_TIME)) { - bkey_copy_key(&op->start, k); - return -EAGAIN; - } + if (!(op->count % INIT_KEYS_EACH_TIME)) + cond_resched(); return MAP_CONTINUE; } @@ -846,24 +841,16 @@ static int bch_root_node_dirty_init(struct cache_set *c, bch_btree_op_init(&op.op, -1); op.inode = d->id; op.count = 0; - op.start = KEY(op.inode, 0, 0); - do { - ret = bcache_btree(map_keys_recurse, - k, - c->root, - &op.op, - &op.start, - sectors_dirty_init_fn, - 0); - if (ret == -EAGAIN) - schedule_timeout_interruptible( - msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); - else if (ret < 0) { - pr_warn("sectors dirty init failed, ret=%d!\n", ret); - break; - } - } while (ret == -EAGAIN); + ret = bcache_btree(map_keys_recurse, + k, + c->root, + &op.op, + &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, + 0); + if (ret < 0) + pr_warn("sectors dirty init failed, ret=%d!\n", ret); return ret; } @@ -907,7 +894,6 @@ static int bch_dirty_init_thread(void *arg) goto out; } skip_nr--; - cond_resched(); } if (p) { @@ -917,7 +903,6 @@ static int bch_dirty_init_thread(void *arg) p = NULL; prev_idx = cur_idx; - cond_resched(); } out: @@ -956,11 +941,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) bch_btree_op_init(&op.op, -1); op.inode = d->id; op.count = 0; - op.start = KEY(op.inode, 0, 0); for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) sectors_dirty_init_fn(&op.op, c->root, k); + rw_unlock(0, c->root); return; } From 32feee36c30ea06e38ccb8ae6e5c44c6eec790a6 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 24 May 2022 18:23:36 +0800 Subject: [PATCH 09/23] bcache: avoid journal no-space deadlock by reserving 1 journal bucket The journal no-space deadlock was reported time to time. Such deadlock can happen in the following situation. When all journal buckets are fully filled by active jset with heavy write I/O load, the cache set registration (after a reboot) will load all active jsets and inserting them into the btree again (which is called journal replay). If a journaled bkey is inserted into a btree node and results btree node split, new journal request might be triggered. For example, the btree grows one more level after the node split, then the root node record in cache device super block will be upgrade by bch_journal_meta() from bch_btree_set_root(). But there is no space in journal buckets, the journal replay has to wait for new journal bucket to be reclaimed after at least one journal bucket replayed. This is one example that how the journal no-space deadlock happens. The solution to avoid the deadlock is to reserve 1 journal bucket in run time, and only permit the reserved journal bucket to be used during cache set registration procedure for things like journal replay. Then the journal space will never be fully filled, there is no chance for journal no-space deadlock to happen anymore. This patch adds a new member "bool do_reserve" in struct journal, it is inititalized to 0 (false) when struct journal is allocated, and set to 1 (true) by bch_journal_space_reserve() when all initialization done in run_cache_set(). In the run time when journal_reclaim() tries to allocate a new journal bucket, free_journal_buckets() is called to check whether there are enough free journal buckets to use. If there is only 1 free journal bucket and journal->do_reserve is 1 (true), the last bucket is reserved and free_journal_buckets() will return 0 to indicate no free journal bucket. Then journal_reclaim() will give up, and try next time to see whetheer there is free journal bucket to allocate. By this method, there is always 1 jouranl bucket reserved in run time. During the cache set registration, journal->do_reserve is 0 (false), so the reserved journal bucket can be used to avoid the no-space deadlock. Reported-by: Nikhil Kshirsagar Signed-off-by: Coly Li Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220524102336.10684-5-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 31 ++++++++++++++++++++++++++----- drivers/md/bcache/journal.h | 2 ++ drivers/md/bcache/super.c | 1 + 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7c2ca52ca3e4..42407b44a940 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -405,6 +405,11 @@ err: return ret; } +void bch_journal_space_reserve(struct journal *j) +{ + j->do_reserve = true; +} + /* Journalling */ static void btree_flush_write(struct cache_set *c) @@ -621,12 +626,30 @@ static void do_journal_discard(struct cache *ca) } } +static unsigned int free_journal_buckets(struct cache_set *c) +{ + struct journal *j = &c->journal; + struct cache *ca = c->cache; + struct journal_device *ja = &c->cache->journal; + unsigned int n; + + /* In case njournal_buckets is not power of 2 */ + if (ja->cur_idx >= ja->discard_idx) + n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; + else + n = ja->discard_idx - ja->cur_idx; + + if (n > (1 + j->do_reserve)) + return n - (1 + j->do_reserve); + + return 0; +} + static void journal_reclaim(struct cache_set *c) { struct bkey *k = &c->journal.key; struct cache *ca = c->cache; uint64_t last_seq; - unsigned int next; struct journal_device *ja = &ca->journal; atomic_t p __maybe_unused; @@ -649,12 +672,10 @@ static void journal_reclaim(struct cache_set *c) if (c->journal.blocks_free) goto out; - next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; - /* No space available on this device */ - if (next == ja->discard_idx) + if (!free_journal_buckets(c)) goto out; - ja->cur_idx = next; + ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; k->ptr[0] = MAKE_PTR(0, bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ca->sb.nr_this_dev); diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index f2ea34d5f431..cd316b4a1e95 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -105,6 +105,7 @@ struct journal { spinlock_t lock; spinlock_t flush_write_lock; bool btree_flushing; + bool do_reserve; /* used when waiting because the journal was full */ struct closure_waitlist wait; struct closure io; @@ -182,5 +183,6 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list); void bch_journal_free(struct cache_set *c); int bch_journal_alloc(struct cache_set *c); +void bch_journal_space_reserve(struct journal *j); #endif /* _BCACHE_JOURNAL_H */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2f49e31142f6..3563d15dbaf2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2127,6 +2127,7 @@ static int run_cache_set(struct cache_set *c) flash_devs_run(c); + bch_journal_space_reserve(&c->journal); set_bit(CACHE_SET_RUNNING, &c->flags); return 0; err: From b9684a71fca793213378dd410cd11675d973eaa1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 May 2022 07:58:06 +0200 Subject: [PATCH 10/23] block, loop: support partitions without scanning Historically we did distinguish between a flag that surpressed partition scanning, and a combinations of the minors variable and another flag if any partitions were supported. This was generally confusing and doesn't make much sense, but some corner case uses of the loop driver actually do want to support manually added partitions on a device that does not actively scan for partitions. To make things worsee the loop driver also wants to dynamically toggle the scanning for partitions on a live gendisk, which makes the disk->flags updates non-atomic. Introduce a new GD_SUPPRESS_PART_SCAN bit in disk->state that disables just scanning for partitions, and toggle that instead of GENHD_FL_NO_PART in the loop driver. Fixes: 1ebe2e5f9d68 ("block: remove GENHD_FL_EXT_DEVT") Reported-by: Ming Lei Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220527055806.1972352-1-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 ++ drivers/block/loop.c | 8 ++++---- include/linux/blkdev.h | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 36532b931841..27205ae47d59 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -385,6 +385,8 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) return -EINVAL; + if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + return -EINVAL; if (disk->open_partitions) return -EBUSY; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 01b4e257016a..121f96dfc23e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1102,7 +1102,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; if (partscan) - lo->lo_disk->flags &= ~GENHD_FL_NO_PART; + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); loop_global_unlock(lo, is_loop); if (partscan) @@ -1198,7 +1198,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) */ lo->lo_flags = 0; if (!part_shift) - lo->lo_disk->flags |= GENHD_FL_NO_PART; + set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); mutex_lock(&lo->lo_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); @@ -1308,7 +1308,7 @@ out_unfreeze: if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_disk->flags &= ~GENHD_FL_NO_PART; + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); partscan = true; } out_unlock: @@ -2009,7 +2009,7 @@ static int loop_add(int i) * userspace tools. Parameters like this in general should be avoided. */ if (!part_shift) - disk->flags |= GENHD_FL_NO_PART; + set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index afad3d1d0dac..691b4c15b8ce 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -147,6 +147,7 @@ struct gendisk { #define GD_DEAD 2 #define GD_NATIVE_CAPACITY 3 #define GD_ADDED 4 +#define GD_SUPPRESS_PART_SCAN 5 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ From 7d6b902ea0e02b2a25c480edf471cbaa4ebe6b3c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 27 May 2022 23:28:16 +0800 Subject: [PATCH 11/23] bcache: memset on stack variables in bch_btree_check() and bch_sectors_dirty_init() The local variables check_state (in bch_btree_check()) and state (in bch_sectors_dirty_init()) should be fully filled by 0, because before allocating them on stack, they were dynamically allocated by kzalloc(). Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20220527152818.27545-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 1 + drivers/md/bcache/writeback.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 2362bb8ef6d1..e136d6edc1ed 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2017,6 +2017,7 @@ int bch_btree_check(struct cache_set *c) if (c->root->level == 0) return 0; + memset(&check_state, 0, sizeof(struct btree_check_state)); check_state.c = c; check_state.total_threads = bch_btree_chkthread_nr(); check_state.key_idx = 0; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 75b71199800d..d138a2d73240 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -950,6 +950,7 @@ void bch_sectors_dirty_init(struct bcache_device *d) return; } + memset(&state, 0, sizeof(struct bch_dirty_init_state)); state.c = c; state.d = d; state.total_threads = bch_btre_dirty_init_thread_nr(); From 40f567bbb3b0639d2ec7d1c6ad4b1b018f80cf19 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 27 May 2022 23:28:18 +0800 Subject: [PATCH 12/23] md: bcache: check the return value of kzalloc() in detached_dev_do_request() The function kzalloc() in detached_dev_do_request() can fail, so its return value should be checked. Fixes: bc082a55d25c ("bcache: fix inaccurate io state for detached bcache devices") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20220527152818.27545-4-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e27f67f06a42..566838b40f10 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1105,6 +1105,12 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio, * which would call closure_get(&dc->disk.cl) */ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); + if (!ddip) { + bio->bi_status = BLK_STS_RESOURCE; + bio->bi_end_io(bio); + return; + } + ddip->d = d; /* Count on the bcache device */ ddip->orig_bdev = orig_bdev; From 06c4da89c24e7023ea448cadf8e9daf06a0aae6e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 21 May 2022 15:37:44 +0800 Subject: [PATCH 13/23] nbd: call genl_unregister_family() first in nbd_cleanup() Otherwise there may be race between module removal and the handling of netlink command, which can lead to the oops as shown below: BUG: kernel NULL pointer dereference, address: 0000000000000098 Oops: 0002 [#1] SMP PTI CPU: 1 PID: 31299 Comm: nbd-client Tainted: G E 5.14.0-rc4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:down_write+0x1a/0x50 Call Trace: start_creating+0x89/0x130 debugfs_create_dir+0x1b/0x130 nbd_start_device+0x13d/0x390 [nbd] nbd_genl_connect+0x42f/0x748 [nbd] genl_family_rcv_msg_doit.isra.0+0xec/0x150 genl_rcv_msg+0xe5/0x1e0 netlink_rcv_skb+0x55/0x100 genl_rcv+0x29/0x40 netlink_unicast+0x1a8/0x250 netlink_sendmsg+0x21b/0x430 ____sys_sendmsg+0x2a4/0x2d0 ___sys_sendmsg+0x81/0xc0 __sys_sendmsg+0x62/0xb0 __x64_sys_sendmsg+0x1f/0x30 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Modules linked in: nbd(E-) Signed-off-by: Hou Tao Signed-off-by: Yu Kuai Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20220521073749.3146892-2-yukuai3@huawei.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index ac8b045c777c..a73e853f5833 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2528,6 +2528,12 @@ static void __exit nbd_cleanup(void) struct nbd_device *nbd; LIST_HEAD(del_list); + /* + * Unregister netlink interface prior to waiting + * for the completion of netlink commands. + */ + genl_unregister_family(&nbd_genl_family); + nbd_dbg_close(); mutex_lock(&nbd_index_mutex); @@ -2546,7 +2552,6 @@ static void __exit nbd_cleanup(void) destroy_workqueue(nbd_del_wq); idr_destroy(&nbd_index_idr); - genl_unregister_family(&nbd_genl_family); unregister_blkdev(NBD_MAJOR, "nbd"); } From c55b2b983b0fa012942c3eb16384b2b722caa810 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 21 May 2022 15:37:45 +0800 Subject: [PATCH 14/23] nbd: fix race between nbd_alloc_config() and module removal When nbd module is being removing, nbd_alloc_config() may be called concurrently by nbd_genl_connect(), although try_module_get() will return false, but nbd_alloc_config() doesn't handle it. The race may lead to the leak of nbd_config and its related resources (e.g, recv_workq) and oops in nbd_read_stat() due to the unload of nbd module as shown below: BUG: kernel NULL pointer dereference, address: 0000000000000040 Oops: 0000 [#1] SMP PTI CPU: 5 PID: 13840 Comm: kworker/u17:33 Not tainted 5.14.0+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Workqueue: knbd16-recv recv_work [nbd] RIP: 0010:nbd_read_stat.cold+0x130/0x1a4 [nbd] Call Trace: recv_work+0x3b/0xb0 [nbd] process_one_work+0x1ed/0x390 worker_thread+0x4a/0x3d0 kthread+0x12a/0x150 ret_from_fork+0x22/0x30 Fixing it by checking the return value of try_module_get() in nbd_alloc_config(). As nbd_alloc_config() may return ERR_PTR(-ENODEV), assign nbd->config only when nbd_alloc_config() succeeds to ensure the value of nbd->config is binary (valid or NULL). Also adding a debug message to check the reference counter of nbd_config during module removal. Signed-off-by: Hou Tao Signed-off-by: Yu Kuai Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20220521073749.3146892-3-yukuai3@huawei.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a73e853f5833..2ee1e376d5c4 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1518,15 +1518,20 @@ static struct nbd_config *nbd_alloc_config(void) { struct nbd_config *config; + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); + config = kzalloc(sizeof(struct nbd_config), GFP_NOFS); - if (!config) - return NULL; + if (!config) { + module_put(THIS_MODULE); + return ERR_PTR(-ENOMEM); + } + atomic_set(&config->recv_threads, 0); init_waitqueue_head(&config->recv_wq); init_waitqueue_head(&config->conn_wait); config->blksize_bits = NBD_DEF_BLKSIZE_BITS; atomic_set(&config->live_connections, 0); - try_module_get(THIS_MODULE); return config; } @@ -1553,12 +1558,13 @@ static int nbd_open(struct block_device *bdev, fmode_t mode) mutex_unlock(&nbd->config_lock); goto out; } - config = nbd->config = nbd_alloc_config(); - if (!config) { - ret = -ENOMEM; + config = nbd_alloc_config(); + if (IS_ERR(config)) { + ret = PTR_ERR(config); mutex_unlock(&nbd->config_lock); goto out; } + nbd->config = config; refcount_set(&nbd->config_refs, 1); refcount_inc(&nbd->refs); mutex_unlock(&nbd->config_lock); @@ -1964,13 +1970,14 @@ again: nbd_put(nbd); return -EINVAL; } - config = nbd->config = nbd_alloc_config(); - if (!nbd->config) { + config = nbd_alloc_config(); + if (IS_ERR(config)) { mutex_unlock(&nbd->config_lock); nbd_put(nbd); printk(KERN_ERR "nbd: couldn't allocate config\n"); - return -ENOMEM; + return PTR_ERR(config); } + nbd->config = config; refcount_set(&nbd->config_refs, 1); set_bit(NBD_RT_BOUND, &config->runtime_flags); @@ -2543,6 +2550,9 @@ static void __exit nbd_cleanup(void) while (!list_empty(&del_list)) { nbd = list_first_entry(&del_list, struct nbd_device, list); list_del_init(&nbd->list); + if (refcount_read(&nbd->config_refs)) + printk(KERN_ERR "nbd: possibly leaking nbd_config (ref %d)\n", + refcount_read(&nbd->config_refs)); if (refcount_read(&nbd->refs) != 1) printk(KERN_ERR "nbd: possibly leaking a device\n"); nbd_put(nbd); From 2895f1831e911ca87d4efdf43e35eb72a0c7e66e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 21 May 2022 15:37:46 +0800 Subject: [PATCH 15/23] nbd: don't clear 'NBD_CMD_INFLIGHT' flag if request is not completed Otherwise io will hung because request will only be completed if the cmd has the flag 'NBD_CMD_INFLIGHT'. Fixes: 07175cb1baf4 ("nbd: make sure request completion won't concurrent") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20220521073749.3146892-4-yukuai3@huawei.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 2ee1e376d5c4..a0d0910dae2a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -403,13 +403,14 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; - if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { mutex_unlock(&cmd->lock); return BLK_EH_DONE; } if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); goto done; } @@ -478,6 +479,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); sock_shutdown(nbd); nbd_config_put(nbd); @@ -745,7 +747,7 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); - if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { + if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)", tag, cmd->status, cmd->flags); ret = -ENOENT; @@ -854,8 +856,16 @@ static void recv_work(struct work_struct *work) } rq = blk_mq_rq_from_pdu(cmd); - if (likely(!blk_should_fake_timeout(rq->q))) - blk_mq_complete_request(rq); + if (likely(!blk_should_fake_timeout(rq->q))) { + bool complete; + + mutex_lock(&cmd->lock); + complete = __test_and_clear_bit(NBD_CMD_INFLIGHT, + &cmd->flags); + mutex_unlock(&cmd->lock); + if (complete) + blk_mq_complete_request(rq); + } percpu_ref_put(&q->q_usage_counter); } From 09dadb5985023e27d4740ebd17e6fea4640110e5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 21 May 2022 15:37:47 +0800 Subject: [PATCH 16/23] nbd: fix io hung while disconnecting device In our tests, "qemu-nbd" triggers a io hung: INFO: task qemu-nbd:11445 blocked for more than 368 seconds. Not tainted 5.18.0-rc3-next-20220422-00003-g2176915513ca #884 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:qemu-nbd state:D stack: 0 pid:11445 ppid: 1 flags:0x00000000 Call Trace: __schedule+0x480/0x1050 ? _raw_spin_lock_irqsave+0x3e/0xb0 schedule+0x9c/0x1b0 blk_mq_freeze_queue_wait+0x9d/0xf0 ? ipi_rseq+0x70/0x70 blk_mq_freeze_queue+0x2b/0x40 nbd_add_socket+0x6b/0x270 [nbd] nbd_ioctl+0x383/0x510 [nbd] blkdev_ioctl+0x18e/0x3e0 __x64_sys_ioctl+0xac/0x120 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fd8ff706577 RSP: 002b:00007fd8fcdfebf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000040000000 RCX: 00007fd8ff706577 RDX: 000000000000000d RSI: 000000000000ab00 RDI: 000000000000000f RBP: 000000000000000f R08: 000000000000fbe8 R09: 000055fe497c62b0 R10: 00000002aff20000 R11: 0000000000000246 R12: 000000000000006d R13: 0000000000000000 R14: 00007ffe82dc5e70 R15: 00007fd8fcdff9c0 "qemu-ndb -d" will call ioctl 'NBD_DISCONNECT' first, however, following message was found: block nbd0: Send disconnect failed -32 Which indicate that something is wrong with the server. Then, "qemu-nbd -d" will call ioctl 'NBD_CLEAR_SOCK', however ioctl can't clear requests after commit 2516ab1543fd("nbd: only clear the queue on device teardown"). And in the meantime, request can't complete through timeout because nbd_xmit_timeout() will always return 'BLK_EH_RESET_TIMER', which means such request will never be completed in this situation. Now that the flag 'NBD_CMD_INFLIGHT' can make sure requests won't complete multiple times, switch back to call nbd_clear_sock() in nbd_clear_sock_ioctl(), so that inflight requests can be cleared. Signed-off-by: Yu Kuai Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20220521073749.3146892-5-yukuai3@huawei.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a0d0910dae2a..ec736cc52134 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1429,7 +1429,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd) static void nbd_clear_sock_ioctl(struct nbd_device *nbd, struct block_device *bdev) { - sock_shutdown(nbd); + nbd_clear_sock(nbd); __invalidate_device(bdev, true); nbd_bdev_reset(nbd); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, From 858f1bf65d3d9c00b5e2d8ca87dc79ed88267c98 Mon Sep 17 00:00:00 2001 From: Zhang Wensheng Date: Sat, 21 May 2022 15:37:48 +0800 Subject: [PATCH 17/23] nbd: fix possible overflow on 'first_minor' in nbd_dev_add() When 'index' is a big numbers, it may become negative which forced to 'int'. then 'index << part_shift' might overflow to a positive value that is not greater than '0xfffff', then sysfs might complains about duplicate creation. Because of this, move the 'index' judgment to the front will fix it and be better. Fixes: b0d9111a2d53 ("nbd: use an idr to keep track of nbd devices") Fixes: 940c264984fd ("nbd: fix possible overflow for 'first_minor' in nbd_dev_add()") Signed-off-by: Zhang Wensheng Signed-off-by: Yu Kuai Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20220521073749.3146892-6-yukuai3@huawei.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index ec736cc52134..349bc3da878d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1814,17 +1814,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) refcount_set(&nbd->refs, 0); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; - - /* Too big first_minor can cause duplicate creation of - * sysfs files/links, since index << part_shift might overflow, or - * MKDEV() expect that the max bits of first_minor is 20. - */ disk->first_minor = index << part_shift; - if (disk->first_minor < index || disk->first_minor > MINORMASK) { - err = -EINVAL; - goto out_free_work; - } - disk->minors = 1 << part_shift; disk->fops = &nbd_fops; disk->private_data = nbd; @@ -1929,8 +1919,19 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - if (info->attrs[NBD_ATTR_INDEX]) + if (info->attrs[NBD_ATTR_INDEX]) { index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); + + /* + * Too big first_minor can cause duplicate creation of + * sysfs files/links, since index << part_shift might overflow, or + * MKDEV() expect that the max bits of first_minor is 20. + */ + if (index < 0 || index > MINORMASK >> part_shift) { + printk(KERN_ERR "nbd: illegal input index %d\n", index); + return -EINVAL; + } + } if (!info->attrs[NBD_ATTR_SOCKETS]) { printk(KERN_ERR "nbd: must specify at least one socket\n"); return -EINVAL; From 1243172d5894e2d8f277ee3c278180792de5c521 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 21 May 2022 15:37:49 +0800 Subject: [PATCH 18/23] nbd: use pr_err to output error message Instead of using the long printk(KERN_ERR "nbd: ...") to output error message, defining pr_fmt and using the short pr_err("") to do that. The replacemen is done by using the following command: sed -i 's/printk(KERN_ERR "nbd: /pr_err("/g' \ drivers/block/nbd.c This patch also rewrap to 80 columns where possible. Signed-off-by: Hou Tao Signed-off-by: Yu Kuai Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20220521073749.3146892-7-yukuai3@huawei.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 349bc3da878d..07f3c139a3d7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1928,16 +1928,16 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) * MKDEV() expect that the max bits of first_minor is 20. */ if (index < 0 || index > MINORMASK >> part_shift) { - printk(KERN_ERR "nbd: illegal input index %d\n", index); + pr_err("illegal input index %d\n", index); return -EINVAL; } } if (!info->attrs[NBD_ATTR_SOCKETS]) { - printk(KERN_ERR "nbd: must specify at least one socket\n"); + pr_err("must specify at least one socket\n"); return -EINVAL; } if (!info->attrs[NBD_ATTR_SIZE_BYTES]) { - printk(KERN_ERR "nbd: must specify a size in bytes for the device\n"); + pr_err("must specify a size in bytes for the device\n"); return -EINVAL; } again: @@ -1973,7 +1973,7 @@ again: nbd_put(nbd); if (index == -1) goto again; - printk(KERN_ERR "nbd: nbd%d already in use\n", index); + pr_err("nbd%d already in use\n", index); return -EBUSY; } if (WARN_ON(nbd->config)) { @@ -1985,7 +1985,7 @@ again: if (IS_ERR(config)) { mutex_unlock(&nbd->config_lock); nbd_put(nbd); - printk(KERN_ERR "nbd: couldn't allocate config\n"); + pr_err("couldn't allocate config\n"); return PTR_ERR(config); } nbd->config = config; @@ -2041,7 +2041,7 @@ again: struct nlattr *socks[NBD_SOCK_MAX+1]; if (nla_type(attr) != NBD_SOCK_ITEM) { - printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n"); + pr_err("socks must be embedded in a SOCK_ITEM attr\n"); ret = -EINVAL; goto out; } @@ -2050,7 +2050,7 @@ again: nbd_sock_policy, info->extack); if (ret != 0) { - printk(KERN_ERR "nbd: error processing sock list\n"); + pr_err("error processing sock list\n"); ret = -EINVAL; goto out; } @@ -2122,7 +2122,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) return -EPERM; if (!info->attrs[NBD_ATTR_INDEX]) { - printk(KERN_ERR "nbd: must specify an index to disconnect\n"); + pr_err("must specify an index to disconnect\n"); return -EINVAL; } index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); @@ -2130,14 +2130,12 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) nbd = idr_find(&nbd_index_idr, index); if (!nbd) { mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: couldn't find device at index %d\n", - index); + pr_err("couldn't find device at index %d\n", index); return -EINVAL; } if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: device at index %d is going down\n", - index); + pr_err("device at index %d is going down\n", index); return -EINVAL; } mutex_unlock(&nbd_index_mutex); @@ -2162,7 +2160,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) return -EPERM; if (!info->attrs[NBD_ATTR_INDEX]) { - printk(KERN_ERR "nbd: must specify a device to reconfigure\n"); + pr_err("must specify a device to reconfigure\n"); return -EINVAL; } index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); @@ -2170,8 +2168,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) nbd = idr_find(&nbd_index_idr, index); if (!nbd) { mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: couldn't find a device at index %d\n", - index); + pr_err("couldn't find a device at index %d\n", index); return -EINVAL; } if (nbd->backend) { @@ -2192,8 +2189,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) } if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: device at index %d is going down\n", - index); + pr_err("device at index %d is going down\n", index); return -EINVAL; } mutex_unlock(&nbd_index_mutex); @@ -2257,7 +2253,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) struct nlattr *socks[NBD_SOCK_MAX+1]; if (nla_type(attr) != NBD_SOCK_ITEM) { - printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n"); + pr_err("socks must be embedded in a SOCK_ITEM attr\n"); ret = -EINVAL; goto out; } @@ -2266,7 +2262,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) nbd_sock_policy, info->extack); if (ret != 0) { - printk(KERN_ERR "nbd: error processing sock list\n"); + pr_err("error processing sock list\n"); ret = -EINVAL; goto out; } @@ -2483,7 +2479,7 @@ static int __init nbd_init(void) BUILD_BUG_ON(sizeof(struct nbd_request) != 28); if (max_part < 0) { - printk(KERN_ERR "nbd: max_part must be >= 0\n"); + pr_err("max_part must be >= 0\n"); return -EINVAL; } @@ -2562,10 +2558,10 @@ static void __exit nbd_cleanup(void) nbd = list_first_entry(&del_list, struct nbd_device, list); list_del_init(&nbd->list); if (refcount_read(&nbd->config_refs)) - printk(KERN_ERR "nbd: possibly leaking nbd_config (ref %d)\n", + pr_err("possibly leaking nbd_config (ref %d)\n", refcount_read(&nbd->config_refs)); if (refcount_read(&nbd->refs) != 1) - printk(KERN_ERR "nbd: possibly leaking a device\n"); + pr_err("possibly leaking a device\n"); nbd_put(nbd); } From a1a2d8f0162b27e85e7ce0ae6a35c96a490e0559 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 28 May 2022 20:45:50 +0800 Subject: [PATCH 19/23] bcache: avoid unnecessary soft lockup in kworker update_writeback_rate() The kworker routine update_writeback_rate() is schedued to update the writeback rate in every 5 seconds by default. Before calling __update_writeback_rate() to do real job, semaphore dc->writeback_lock should be held by the kworker routine. At the same time, bcache writeback thread routine bch_writeback_thread() also needs to hold dc->writeback_lock before flushing dirty data back into the backing device. If the dirty data set is large, it might be very long time for bch_writeback_thread() to scan all dirty buckets and releases dc->writeback_lock. In such case update_writeback_rate() can be starved for long enough time so that kernel reports a soft lockup warn- ing started like: watchdog: BUG: soft lockup - CPU#246 stuck for 23s! [kworker/246:31:179713] Such soft lockup condition is unnecessary, because after the writeback thread finishes its job and releases dc->writeback_lock, the kworker update_writeback_rate() may continue to work and everything is fine indeed. This patch avoids the unnecessary soft lockup by the following method, - Add new member to struct cached_dev - dc->rate_update_retry (0 by default) - In update_writeback_rate() call down_read_trylock(&dc->writeback_lock) firstly, if it fails then lock contention happens. - If dc->rate_update_retry <= BCH_WBRATE_UPDATE_MAX_SKIPS (15), doesn't acquire the lock and reschedules the kworker for next try. - If dc->rate_update_retry > BCH_WBRATE_UPDATE_MAX_SKIPS, no retry anymore and call down_read(&dc->writeback_lock) to wait for the lock. By the above method, at worst case update_writeback_rate() may retry for 1+ minutes before blocking on dc->writeback_lock by calling down_read(). For a 4TB cache device with 1TB dirty data, 90%+ of the unnecessary soft lockup warning message can be avoided. When retrying to acquire dc->writeback_lock in update_writeback_rate(), of course the writeback rate cannot be updated. It is fair, because when the kworker is blocked on the lock contention of dc->writeback_lock, the writeback rate cannot be updated neither. This change follows Jens Axboe's suggestion to a more clear and simple version. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20220528124550.32834-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 7 +++++++ drivers/md/bcache/writeback.c | 31 +++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 9ed9c955add7..2acda9cea0f9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -395,6 +395,13 @@ struct cached_dev { atomic_t io_errors; unsigned int error_limit; unsigned int offline_seconds; + + /* + * Retry to update writeback_rate if contention happens for + * down_read(dc->writeback_lock) in update_writeback_rate() + */ +#define BCH_WBRATE_UPDATE_MAX_SKIPS 15 + unsigned int rate_update_retry; }; enum alloc_reserve { diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d138a2d73240..3f0ff3aab6f2 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -235,19 +235,27 @@ static void update_writeback_rate(struct work_struct *work) return; } - if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { - /* - * If the whole cache set is idle, set_at_max_writeback_rate() - * will set writeback rate to a max number. Then it is - * unncessary to update writeback rate for an idle cache set - * in maximum writeback rate number(s). - */ - if (!set_at_max_writeback_rate(c, dc)) { - down_read(&dc->writeback_lock); + /* + * If the whole cache set is idle, set_at_max_writeback_rate() + * will set writeback rate to a max number. Then it is + * unncessary to update writeback rate for an idle cache set + * in maximum writeback rate number(s). + */ + if (atomic_read(&dc->has_dirty) && dc->writeback_percent && + !set_at_max_writeback_rate(c, dc)) { + do { + if (!down_read_trylock((&dc->writeback_lock))) { + dc->rate_update_retry++; + if (dc->rate_update_retry <= + BCH_WBRATE_UPDATE_MAX_SKIPS) + break; + down_read(&dc->writeback_lock); + dc->rate_update_retry = 0; + } __update_writeback_rate(dc); update_gc_after_writeback(c); up_read(&dc->writeback_lock); - } + } while (0); } @@ -1006,6 +1014,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_fp_term_high = 1000; dc->writeback_rate_i_term_inverse = 10000; + /* For dc->writeback_lock contention in update_writeback_rate() */ + dc->rate_update_retry = 0; + WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } From 70ce3455345d056b5fc427c3bb4a3ff4d126b6d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 May 2022 07:32:08 +0200 Subject: [PATCH 20/23] nvme-pci: disable namespace identifiers for the MAXIO MAP1001 The MAXIO MAP1001 controllers reports completely bogus Namespace identifiers that even change after suspend cycles. Disable using the Identifiers entirely. Reported-by: Arman Hajishafieha Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Tested-by: Arman Hajishafieha --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5a98a7de0964..cb4adc0c2284 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3453,6 +3453,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x1e4B, 0x1001), /* MAXIO MAP1001 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e4B, 0x1002), /* MAXIO MAP1002 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e4B, 0x1202), /* MAXIO MAP1202 */ From aa41d2fe60ee2e4452b0f9ca9f0f6d80a4ff9f9d Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 26 May 2022 15:57:21 +0200 Subject: [PATCH 21/23] nvme: set controller enable bit in a separate write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NVM Express Base Specification 2.0 specifies in the description of the CC – Controller Configuration register: "Host software shall set the Arbitration Mechanism Selected (CC.AMS), the Memory Page Size (CC.MPS), and the I/O Command Set Selected (CC.CSS) to valid values prior to enabling the controller by setting CC.EN to ‘1’. While we haven't seen any controller misbehaving while setting all bits in a single write, let's do it in the order that it is written in the spec, as there could potentially be controllers that are implemented to rely on the configuration bits being set before enabling the controller. Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a7660e4be55b..98e343cb4a12 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2227,8 +2227,16 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - ctrl->ctrl_config |= NVME_CC_ENABLE; + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + /* Flush write to device (required if transport is PCI) */ + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config); + if (ret) + return ret; + + ctrl->ctrl_config |= NVME_CC_ENABLE; ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; From cbf84dbf0600b0efe1134cc3f98ae29c523a3a23 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:10:36 +0200 Subject: [PATCH 22/23] nvmet: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 5247c24538eb..2cebb0da38d5 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -97,7 +97,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) id->sgls |= cpu_to_le32(1 << 20); /* - * When passsthru controller is setup using nvme-loop transport it will + * When passthru controller is setup using nvme-loop transport it will * export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in * the nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl() * code path with duplicate ctr subsynqn. In order to prevent that we From aacae8c469f9ce4b303a2eb61593ff522c1420bc Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 2 Jun 2022 21:03:44 +0900 Subject: [PATCH 23/23] block: null_blk: Fix null_zone_write() The bio and rq fields of struct nullb_cmd are now overlapping in a union. So we cannot use a test on ->bio being non-NULL to detect the NULL_Q_BIO queue mode. null_zone_write() use such broken test to set the sector position of a zone append write in the command bio or request. When the null_blk device uses the NULL_Q_MQ queue mode, null_zone_write() wrongly end up setting the bio sector position, resulting in the command request to be broken and random crashes following. Fix this by testing the device queue mode directly. Fixes: 8ba816b23abd ("null-blk: save memory footprint for struct nullb_cmd") Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220602120344.1365329-1-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 6 ------ drivers/block/null_blk/null_blk.h | 7 +++++++ drivers/block/null_blk/zoned.c | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 539cfeac263d..6b67088f4ea7 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -77,12 +77,6 @@ enum { NULL_IRQ_TIMER = 2, }; -enum { - NULL_Q_BIO = 0, - NULL_Q_RQ = 1, - NULL_Q_MQ = 2, -}; - static bool g_virt_boundary = false; module_param_named(virt_boundary, g_virt_boundary, bool, 0444); MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 4525a65e1b23..8359b43842f2 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -60,6 +60,13 @@ struct nullb_zone { unsigned int capacity; }; +/* Queue modes */ +enum { + NULL_Q_BIO = 0, + NULL_Q_RQ = 1, + NULL_Q_MQ = 2, +}; + struct nullb_device { struct nullb *nullb; struct config_item item; diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index ed158ea4fdd1..2fdd7b20c224 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -398,10 +398,10 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, */ if (append) { sector = zone->wp; - if (cmd->bio) - cmd->bio->bi_iter.bi_sector = sector; - else + if (dev->queue_mode == NULL_Q_MQ) cmd->rq->__sector = sector; + else + cmd->bio->bi_iter.bi_sector = sector; } else if (sector != zone->wp) { ret = BLK_STS_IOERR; goto unlock;