From ef100397fac3e2e403d5d510e66f36e242654073 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 16 Aug 2023 09:27:05 +0800 Subject: [PATCH 01/10] blk-throttle: print signed value 'carryover_bytes/ios' for user 'carryover_bytes/ios' can be negative, indicate that some bio is dispatched in advance within slice while configuration is updated. Print a huge value is not user-friendly. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230816012708.1193747-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- block/blk-throttle.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7397ff199d66..5184f17f5129 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -816,7 +816,7 @@ static void tg_update_carryover(struct throtl_grp *tg) __tg_update_carryover(tg, WRITE); /* see comments in struct throtl_grp for meaning of these fields. */ - throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__, + throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], tg->carryover_ios[READ], tg->carryover_ios[WRITE]); } diff --git a/block/blk-throttle.h b/block/blk-throttle.h index d1ccbfe9f797..bffbc9cfc8ab 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -127,8 +127,8 @@ struct throtl_grp { * bytes/ios are waited already in previous configuration, and they will * be used to calculate wait time under new configuration. */ - uint64_t carryover_bytes[2]; - unsigned int carryover_ios[2]; + long long carryover_bytes[2]; + int carryover_ios[2]; unsigned long last_check_time; From bb8d5587bdc3ab211e1eae2eeb966f7a7d1f9c0b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 16 Aug 2023 09:27:06 +0800 Subject: [PATCH 02/10] blk-throttle: fix wrong comparation while 'carryover_ios/bytes' is negative carryover_ios/bytes[] can be negative in the case that ios are dispatched in the slice in advance, and then configuration is updated. For example: 1) set iops limit to 1000, and slice start is 0, slice end is 100ms; 2) current time is 0, and 100 ios are dispatched, those ios will not be throttled, hence io_disp is 100; 3) still at current time 0, update iops limit to 100, then carryover_ios is (0 - 100) = -100; 4) then, dispatch a new io at time 0, the expected result is that this io will wait for 1s. The calculation in tg_within_iops_limit: io_disp = 0; io_allowed = calculate_io_allowed + carryover_ios = 10 + (-100) = -90; io won't be throttled if (io_disp + 1 < io_allowed) passed. Before this patch, in step 4) (io_disp + 1 < io_allowed) is passed, because -90 for unsigned value is very huge, and such io won't be throttled. Fix this problem by checking if 'io/bytes_allowed' is negative first. 
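As a quick illustration of the signed/unsigned pitfall described above, the following minimal userspace sketch reproduces the numbers from the iops example. It is not the kernel code: calculate_io_allowed() here is a hardcoded stand-in, and within_limit_unsigned()/within_limit_signed() only model the old and the fixed comparison in tg_within_iops_limit().

#include <stdio.h>
#include <stdbool.h>

/* step 3): iops limit updated to 100 at time 0, carryover_ios = 0 - 100 */
static const int carryover_ios = -100;

/* stand-in: one rounded-up 100ms slice at 100 iops allows 10 ios */
static int calculate_io_allowed(void)
{
	return 10;
}

/* old check: io_allowed held in an unsigned type */
static bool within_limit_unsigned(unsigned int io_disp)
{
	unsigned int io_allowed = calculate_io_allowed() + carryover_ios;

	/* -90 wraps around to a huge unsigned value, so this passes */
	return io_disp + 1 <= io_allowed;
}

/* fixed check: signed io_allowed, which must be positive to pass */
static bool within_limit_signed(unsigned int io_disp)
{
	int io_allowed = calculate_io_allowed() + carryover_ios;

	return io_allowed > 0 && io_disp + 1 <= io_allowed;
}

int main(void)
{
	/* step 4): io_disp was reset to 0 when the limit was updated */
	printf("old check passes: %d\n", within_limit_unsigned(0)); /* 1: not throttled */
	printf("new check passes: %d\n", within_limit_signed(0));   /* 0: throttled */
	return 0;
}

With the fixed comparison the new io is throttled instead of slipping through, which matches the expected 1s wait in step 4).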
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230816012708.1193747-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5184f17f5129..7c93144d03da 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -825,7 +825,7 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio u32 iops_limit) { bool rw = bio_data_dir(bio); - unsigned int io_allowed; + int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; if (iops_limit == UINT_MAX) { @@ -838,9 +838,8 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + tg->carryover_ios[rw]; - if (tg->io_disp[rw] + 1 <= io_allowed) { + if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; - } /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; @@ -851,7 +850,8 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, u64 bps_limit) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + long long bytes_allowed; + u64 extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); @@ -869,9 +869,8 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + tg->carryover_bytes[rw]; - if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { + if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) return 0; - } /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; From e8368b57c006dc0e02dcd8a9dc9f2060ff5476fe Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 16 Aug 2023 09:27:07 +0800 Subject: [PATCH 03/10] blk-throttle: use calculate_io/bytes_allowed() for throtl_trim_slice() There are no functional changes, just make the code cleaner. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230816012708.1193747-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 116 +++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 60 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7c93144d03da..69a994156772 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -697,66 +697,6 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) return true; } -/* Trim the used slices and adjust slice start accordingly */ -static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) -{ - unsigned long nr_slices, time_elapsed, io_trim; - u64 bytes_trim, tmp; - - BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); - - /* - * If bps are unlimited (-1), then time slice don't get - * renewed. Don't try to trim the slice if slice is used. A new - * slice will start when appropriate. - */ - if (throtl_slice_used(tg, rw)) - return; - - /* - * A bio has been dispatched. Also adjust slice_end. It might happen - * that initially cgroup limit was very low resulting in high - * slice_end, but later limit was bumped up and bio was dispatched - * sooner, then we need to reduce slice_end. 
A high bogus slice_end - * is bad because it does not allow new slice to start. - */ - - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); - - time_elapsed = jiffies - tg->slice_start[rw]; - - nr_slices = time_elapsed / tg->td->throtl_slice; - - if (!nr_slices) - return; - tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices; - do_div(tmp, HZ); - bytes_trim = tmp; - - io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) / - HZ; - - if (!bytes_trim && !io_trim) - return; - - if (tg->bytes_disp[rw] >= bytes_trim) - tg->bytes_disp[rw] -= bytes_trim; - else - tg->bytes_disp[rw] = 0; - - if (tg->io_disp[rw] >= io_trim) - tg->io_disp[rw] -= io_trim; - else - tg->io_disp[rw] = 0; - - tg->slice_start[rw] += nr_slices * tg->td->throtl_slice; - - throtl_log(&tg->service_queue, - "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, - tg->slice_start[rw], tg->slice_end[rw], jiffies); -} - static unsigned int calculate_io_allowed(u32 iops_limit, unsigned long jiffy_elapsed) { @@ -786,6 +726,62 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } +/* Trim the used slices and adjust slice start accordingly */ +static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) +{ + unsigned long time_elapsed, io_trim; + u64 bytes_trim; + + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); + + /* + * If bps are unlimited (-1), then time slice don't get + * renewed. Don't try to trim the slice if slice is used. A new + * slice will start when appropriate. + */ + if (throtl_slice_used(tg, rw)) + return; + + /* + * A bio has been dispatched. Also adjust slice_end. It might happen + * that initially cgroup limit was very low resulting in high + * slice_end, but later limit was bumped up and bio was dispatched + * sooner, then we need to reduce slice_end. A high bogus slice_end + * is bad because it does not allow new slice to start. + */ + + throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); + + time_elapsed = rounddown(jiffies - tg->slice_start[rw], + tg->td->throtl_slice); + if (!time_elapsed) + return; + + bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), + time_elapsed); + io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed); + if (!bytes_trim && !io_trim) + return; + + if (tg->bytes_disp[rw] >= bytes_trim) + tg->bytes_disp[rw] -= bytes_trim; + else + tg->bytes_disp[rw] = 0; + + if (tg->io_disp[rw] >= io_trim) + tg->io_disp[rw] -= io_trim; + else + tg->io_disp[rw] = 0; + + tg->slice_start[rw] += time_elapsed; + + throtl_log(&tg->service_queue, + "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", + rw == READ ? 
'R' : 'W', time_elapsed / tg->td->throtl_slice, + bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], + jiffies); +} + static void __tg_update_carryover(struct throtl_grp *tg, bool rw) { unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; From eead0056648cef49d7b15c07ae612fa217083165 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 16 Aug 2023 09:27:08 +0800 Subject: [PATCH 04/10] blk-throttle: consider 'carryover_ios/bytes' in throtl_trim_slice() Currently, 'carryover_ios/bytes' is not handled in throtl_trim_slice(), for consequence, 'carryover_ios/bytes' will be used to throttle bio multiple times, for example: 1) set iops limit to 100, and slice start is 0, slice end is 100ms; 2) current time is 0, and 10 ios are dispatched, those io won't be throttled and io_disp is 10; 3) still at current time 0, update iops limit to 1000, carryover_ios is updated to (0 - 10) = -10; 4) in this slice(0 - 100ms), io_allowed = 100 + (-10) = 90, which means only 90 ios can be dispatched without waiting; 5) assume that io is throttled in slice(0 - 100ms), and throtl_trim_slice() update silce to (100ms - 200ms). In this case, 'carryover_ios/bytes' is not cleared and still only 90 ios can be dispatched between 100ms - 200ms. Fix this problem by updating 'carryover_ios/bytes' in throtl_trim_slice(). Fixes: a880ae93e5b5 ("blk-throttle: fix io hung due to configuration updates") Reported-by: zhuxiaohui Link: https://lore.kernel.org/all/20230812072116.42321-1-zhuxiaohui.400@bytedance.com/ Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230816012708.1193747-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 69a994156772..38a881cf97d0 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -729,8 +729,9 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { - unsigned long time_elapsed, io_trim; - u64 bytes_trim; + unsigned long time_elapsed; + long long bytes_trim; + int io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -758,17 +759,21 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) return; bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), - time_elapsed); - io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed); - if (!bytes_trim && !io_trim) + time_elapsed) + + tg->carryover_bytes[rw]; + io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + + tg->carryover_ios[rw]; + if (bytes_trim <= 0 && io_trim <= 0) return; - if (tg->bytes_disp[rw] >= bytes_trim) + tg->carryover_bytes[rw] = 0; + if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0; - if (tg->io_disp[rw] >= io_trim) + tg->carryover_ios[rw] = 0; + if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else tg->io_disp[rw] = 0; @@ -776,7 +781,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, - "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", + "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", rw == READ ? 
'R' : 'W', time_elapsed / tg->td->throtl_slice, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); From 0d997f1de89970bff4be1b6f0142c0bd036c3ee8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 31 Aug 2023 14:19:11 +0200 Subject: [PATCH 05/10] block: remove the call to file_remove_privs in blkdev_write_iter file_remove_privs instantly returns 0 when not called for regular files, so don't bother. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230831121911.280155-1-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/fops.c b/block/fops.c index a24a624d3bf7..acff3d5d22d4 100644 --- a/block/fops.c +++ b/block/fops.c @@ -671,10 +671,6 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, size); } - ret = file_remove_privs(file); - if (ret) - return ret; - ret = file_update_time(file); if (ret) return ret; From 1a721de8489fa559ff4471f73c58bb74ac5580d3 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 31 Aug 2023 15:59:00 +0800 Subject: [PATCH 06/10] block: don't add or resize partition on the disk with GENHD_FL_NO_PART Commit a33df75c6328 ("block: use an xarray for disk->part_tbl") remove disk_expand_part_tbl() in add_partition(), which means all kinds of devices will support extended dynamic `dev_t`. However, some devices with GENHD_FL_NO_PART are not expected to add or resize partition. Fix this by adding check of GENHD_FL_NO_PART before add or resize partition. Fixes: a33df75c6328 ("block: use an xarray for disk->part_tbl") Signed-off-by: Li Lingfeng Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230831075900.1725842-1-lilingfeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/ioctl.c b/block/ioctl.c index 648670ddb164..d5f5cd61efd7 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -20,6 +20,8 @@ static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition p; long long start, length; + if (disk->flags & GENHD_FL_NO_PART) + return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) From f7cf22424665043787a96a66a048ff6b2cfd473c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 28 Aug 2023 17:31:42 +0200 Subject: [PATCH 07/10] s390/dasd: fix string length handling Building dasd_eckd.o with latest clang reveals this bug: CC drivers/s390/block/dasd_eckd.o drivers/s390/block/dasd_eckd.c:1082:3: warning: 'snprintf' will always be truncated; specified size is 1, but format string expands to at least 11 [-Wfortify-source] 1082 | snprintf(print_uid, sizeof(*print_uid), | ^ drivers/s390/block/dasd_eckd.c:1087:3: warning: 'snprintf' will always be truncated; specified size is 1, but format string expands to at least 10 [-Wfortify-source] 1087 | snprintf(print_uid, sizeof(*print_uid), | ^ Fix this by moving and using the existing UID_STRLEN for the arrays that are being written to. Also rename UID_STRLEN to DASD_UID_STRLEN to clarify its scope. 
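To make the clang warning concrete, here is a small userspace sketch of the truncation it reports. format_uid_wrong()/format_uid_fixed() and the vendor/serial/SSID values are made up for illustration and only mirror the sizeof(*print_uid) mistake, not the actual driver code; the DASD_UID_STRLEN definition is the one added by the patch.

#include <stdio.h>

#define DASD_UID_STRLEN ( /* vendor */ 3 + 1 + /* serial */ 14 + 1 + \
			  /* SSID */ 4 + 1 + /* unit addr */ 2 + 1 + \
			  /* vduit */ 32 + 1)

static void format_uid_wrong(char *buf)
{
	/* buf is a plain pointer, so sizeof(*buf) == 1: only the '\0' fits */
	snprintf(buf, sizeof(*buf), "%s.%s.%04x.%02x",
		 "IBM", "75000000092461", 0x4711, 0x02);
}

static void format_uid_fixed(char *buf)
{
	/* pass the real buffer size instead */
	snprintf(buf, DASD_UID_STRLEN, "%s.%s.%04x.%02x",
		 "IBM", "75000000092461", 0x4711, 0x02);
}

int main(void)
{
	char uid[DASD_UID_STRLEN];

	format_uid_wrong(uid);
	printf("wrong: \"%s\"\n", uid);	/* prints "" */

	format_uid_fixed(uid);
	printf("fixed: \"%s\"\n", uid);	/* prints "IBM.75000000092461.4711.02" */
	return 0;
}

Recent compilers may flag the first call with the same kind of truncation warning shown above, which is how the driver bug was caught.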
Fixes: 23596961b437 ("s390/dasd: split up dasd_eckd_read_conf") Reviewed-by: Peter Oberparleiter Signed-off-by: Heiko Carstens Tested-by: Nick Desaulniers # build Reported-by: Nathan Chancellor Closes: https://github.com/ClangBuiltLinux/linux/issues/1923 Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20230828153142.2843753-2-hca@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_devmap.c | 6 +----- drivers/s390/block/dasd_eckd.c | 10 +++++----- drivers/s390/block/dasd_int.h | 4 ++++ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 620fab01b710..c4e36650c426 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -1378,16 +1378,12 @@ static ssize_t dasd_vendor_show(struct device *dev, static DEVICE_ATTR(vendor, 0444, dasd_vendor_show, NULL); -#define UID_STRLEN ( /* vendor */ 3 + 1 + /* serial */ 14 + 1 +\ - /* SSID */ 4 + 1 + /* unit addr */ 2 + 1 +\ - /* vduit */ 32 + 1) - static ssize_t dasd_uid_show(struct device *dev, struct device_attribute *attr, char *buf) { + char uid_string[DASD_UID_STRLEN]; struct dasd_device *device; struct dasd_uid uid; - char uid_string[UID_STRLEN]; char ua_string[3]; device = dasd_device_from_cdev(to_ccwdev(dev)); diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 8587e423169e..bd89b032968a 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1079,12 +1079,12 @@ static void dasd_eckd_get_uid_string(struct dasd_conf *conf, create_uid(conf, &uid); if (strlen(uid.vduit) > 0) - snprintf(print_uid, sizeof(*print_uid), + snprintf(print_uid, DASD_UID_STRLEN, "%s.%s.%04x.%02x.%s", uid.vendor, uid.serial, uid.ssid, uid.real_unit_addr, uid.vduit); else - snprintf(print_uid, sizeof(*print_uid), + snprintf(print_uid, DASD_UID_STRLEN, "%s.%s.%04x.%02x", uid.vendor, uid.serial, uid.ssid, uid.real_unit_addr); @@ -1093,8 +1093,8 @@ static void dasd_eckd_get_uid_string(struct dasd_conf *conf, static int dasd_eckd_check_cabling(struct dasd_device *device, void *conf_data, __u8 lpm) { + char print_path_uid[DASD_UID_STRLEN], print_device_uid[DASD_UID_STRLEN]; struct dasd_eckd_private *private = device->private; - char print_path_uid[60], print_device_uid[60]; struct dasd_conf path_conf; path_conf.data = conf_data; @@ -1293,9 +1293,9 @@ static void dasd_eckd_path_available_action(struct dasd_device *device, __u8 path_rcd_buf[DASD_ECKD_RCD_DATA_SIZE]; __u8 lpm, opm, npm, ppm, epm, hpfpm, cablepm; struct dasd_conf_data *conf_data; + char print_uid[DASD_UID_STRLEN]; struct dasd_conf path_conf; unsigned long flags; - char print_uid[60]; int rc, pos; opm = 0; @@ -5855,8 +5855,8 @@ static void dasd_eckd_dump_sense(struct dasd_device *device, static int dasd_eckd_reload_device(struct dasd_device *device) { struct dasd_eckd_private *private = device->private; + char print_uid[DASD_UID_STRLEN]; int rc, old_base; - char print_uid[60]; struct dasd_uid uid; unsigned long flags; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 0aa56351da72..8a4dbe9d7741 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -259,6 +259,10 @@ struct dasd_uid { char vduit[33]; }; +#define DASD_UID_STRLEN ( /* vendor */ 3 + 1 + /* serial */ 14 + 1 + \ + /* SSID */ 4 + 1 + /* unit addr */ 2 + 1 + \ + /* vduit */ 32 + 1) + /* * PPRC Status data */ From 5a26e45edb4690d58406178b5a9ea4c6dcf2c105 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: 
Fri, 1 Sep 2023 20:03:06 +0800 Subject: [PATCH 08/10] null_blk: fix poll request timeout handling When doing io_uring benchmark on /dev/nullb0, it's easy to crash the kernel if poll requests timeout triggered, as reported by David. [1] BUG: kernel NULL pointer dereference, address: 0000000000000008 Workqueue: kblockd blk_mq_timeout_work RIP: 0010:null_timeout_rq+0x4e/0x91 Call Trace: ? null_timeout_rq+0x4e/0x91 blk_mq_handle_expired+0x31/0x4b bt_iter+0x68/0x84 ? bt_tags_iter+0x81/0x81 __sbitmap_for_each_set.constprop.0+0xb0/0xf2 ? __blk_mq_complete_request_remote+0xf/0xf bt_for_each+0x46/0x64 ? __blk_mq_complete_request_remote+0xf/0xf ? percpu_ref_get_many+0xc/0x2a blk_mq_queue_tag_busy_iter+0x14d/0x18e blk_mq_timeout_work+0x95/0x127 process_one_work+0x185/0x263 worker_thread+0x1b5/0x227 This is indeed a race problem between null_timeout_rq() and null_poll(). null_poll() null_timeout_rq() spin_lock(&nq->poll_lock) list_splice_init(&nq->poll_list, &list) spin_unlock(&nq->poll_lock) while (!list_empty(&list)) req = list_first_entry() list_del_init() ... blk_mq_add_to_batch() // req->rq_next = NULL spin_lock(&nq->poll_lock) // rq->queuelist->next == NULL list_del_init(&rq->queuelist) spin_unlock(&nq->poll_lock) Fix these problems by setting requests state to MQ_RQ_COMPLETE under nq->poll_lock protection, in which null_timeout_rq() can safely detect this race and early return. Note this patch just fix the kernel panic when request timeout happen. [1] https://lore.kernel.org/all/3893581.1691785261@warthog.procyon.org.uk/ Fixes: 0a593fbbc245 ("null_blk: poll queue support") Reported-by: David Howells Tested-by: David Howells Reviewed-by: Ming Lei Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/r/20230901120306.170520-2-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 864013019d6b..968090935eb2 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1643,9 +1643,12 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) struct nullb_queue *nq = hctx->driver_data; LIST_HEAD(list); int nr = 0; + struct request *rq; spin_lock(&nq->poll_lock); list_splice_init(&nq->poll_list, &list); + list_for_each_entry(rq, &list, queuelist) + blk_mq_set_request_complete(rq); spin_unlock(&nq->poll_lock); while (!list_empty(&list)) { @@ -1671,16 +1674,21 @@ static enum blk_eh_timer_return null_timeout_rq(struct request *rq) struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); - pr_info("rq %p timed out\n", rq); - if (hctx->type == HCTX_TYPE_POLL) { struct nullb_queue *nq = hctx->driver_data; spin_lock(&nq->poll_lock); + /* The request may have completed meanwhile. */ + if (blk_mq_request_completed(rq)) { + spin_unlock(&nq->poll_lock); + return BLK_EH_DONE; + } list_del_init(&rq->queuelist); spin_unlock(&nq->poll_lock); } + pr_info("rq %p timed out\n", rq); + /* * If the device is marked as blocking (i.e. memory backed or zoned * device), the submission path may be blocked waiting for resources From 5905afc2c7bb713d52c7c7585565feecbb686b44 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Sep 2023 14:47:31 +0200 Subject: [PATCH 09/10] block: fix pin count management when merging same-page segments There is no need to unpin the added page when adding it to the bio fails as that is done by the loop below. 
Instead we want to unpin it when adding a single page to the bio more than once as bio_release_pages will only unpin it once. Fixes: d1916c86ccdc ("block: move same page handling from __bio_add_pc_page to the callers") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230905124731.328255-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-map.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 44d74a30ddac..8584babf3ea0 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -315,12 +315,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, n = bytes; if (!bio_add_hw_page(rq->q, bio, page, n, offs, - max_sectors, &same_page)) { - if (same_page) - bio_release_page(bio, page); + max_sectors, &same_page)) break; - } + if (same_page) + bio_release_page(bio, page); bytes -= n; offs = 0; } From 4b9c2edaf7282d60e069551b4b28abc2932cd3e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Wed, 6 Sep 2023 15:30:34 +0200 Subject: [PATCH 10/10] drbd: swap bvec_set_page len and offset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bvec_set_page has the following signature: static inline void bvec_set_page(struct bio_vec *bv, struct page *page, unsigned int len, unsigned int offset) However, the usage in DRBD swaps the len and offset parameters. This leads to a bvec with length=0 instead of the intended length=4096, which causes sock_sendmsg to return -EIO. This leaves DRBD unable to transmit any pages and thus completely broken. Swapping the parameters fixes the regression. Fixes: eeac7405c735 ("drbd: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()") Reported-by: Serguei Ivantsov Link: https://lore.kernel.org/regressions/CAKH+VT3YLmAn0Y8=q37UTDShqxDLsqPcQ4hBMzY7HPn7zNx+RQ@mail.gmail.com/ Cc: stable@vger.kernel.org Signed-off-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20230906133034.948817-1-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 79ab532aabaf..6bc86106c7b2 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1557,7 +1557,7 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa do { int sent; - bvec_set_page(&bvec, page, offset, len); + bvec_set_page(&bvec, page, len, offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); sent = sock_sendmsg(socket, &msg);
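To round this off, here is a small userspace sketch of the argument mix-up fixed by the last patch. The struct and helper mirror bvec_set_page() as quoted in the commit message; PAGE_SIZE and the zero offset are only illustrative values, not taken from the DRBD path.

#include <stdio.h>

#define PAGE_SIZE 4096

struct page;				/* opaque for this sketch */

struct bio_vec {
	struct page *bv_page;
	unsigned int bv_len;
	unsigned int bv_offset;
};

static inline void bvec_set_page(struct bio_vec *bv, struct page *page,
				 unsigned int len, unsigned int offset)
{
	bv->bv_page = page;
	bv->bv_len = len;
	bv->bv_offset = offset;
}

int main(void)
{
	struct bio_vec bvec;
	unsigned int offset = 0, len = PAGE_SIZE;

	/* buggy call: len and offset swapped, so bv_len ends up 0 */
	bvec_set_page(&bvec, NULL, offset, len);
	printf("swapped: bv_len=%u bv_offset=%u\n", bvec.bv_len, bvec.bv_offset);

	/* fixed call: length first, then offset */
	bvec_set_page(&bvec, NULL, len, offset);
	printf("fixed:   bv_len=%u bv_offset=%u\n", bvec.bv_len, bvec.bv_offset);
	return 0;
}

With the swapped arguments the bvec describes zero bytes, so there is nothing for sock_sendmsg() to transmit, which matches the -EIO and the stalled replication described in the report.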