From 79478bf9ea9fa48d30836afa796ac13d8a0f320b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:13:54 +0100 Subject: [PATCH 001/132] block: move blk_rq_err_bytes to scsi blk_rq_err_bytes is only used by the scsi midlayer, so move it there. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-2-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 41 ---------------------------------------- drivers/scsi/scsi_lib.c | 42 ++++++++++++++++++++++++++++++++++++++++- include/linux/blk-mq.h | 3 --- 3 files changed, 41 insertions(+), 45 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 1378d084c770..682b112f513f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1176,47 +1176,6 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); -/** - * blk_rq_err_bytes - determine number of bytes till the next failure boundary - * @rq: request to examine - * - * Description: - * A request could be merge of IOs which require different failure - * handling. This function determines the number of bytes which - * can be failed from the beginning of the request without - * crossing into area which need to be retried further. - * - * Return: - * The number of bytes to fail. - */ -unsigned int blk_rq_err_bytes(const struct request *rq) -{ - unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; - unsigned int bytes = 0; - struct bio *bio; - - if (!(rq->rq_flags & RQF_MIXED_MERGE)) - return blk_rq_bytes(rq); - - /* - * Currently the only 'mixing' which can happen is between - * different fastfail types. We can safely fail portions - * which have all the failfast bits that the first one has - - * the ones which are at least as eager to fail as the first - * one. - */ - for (bio = rq->bio; bio; bio = bio->bi_next) { - if ((bio->bi_opf & ff) != ff) - break; - bytes += bio->bi_iter.bi_size; - } - - /* this could lead to infinite loop */ - BUG_ON(blk_rq_bytes(rq) && !bytes); - return bytes; -} -EXPORT_SYMBOL_GPL(blk_rq_err_bytes); - static void update_io_ticks(struct block_device *part, unsigned long now, bool end) { diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 621d841d819a..5e8b5ecb3245 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -617,6 +617,46 @@ static blk_status_t scsi_result_to_blk_status(struct scsi_cmnd *cmd, int result) } } +/** + * scsi_rq_err_bytes - determine number of bytes till the next failure boundary + * @rq: request to examine + * + * Description: + * A request could be merge of IOs which require different failure + * handling. This function determines the number of bytes which + * can be failed from the beginning of the request without + * crossing into area which need to be retried further. + * + * Return: + * The number of bytes to fail. + */ +static unsigned int scsi_rq_err_bytes(const struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + unsigned int bytes = 0; + struct bio *bio; + + if (!(rq->rq_flags & RQF_MIXED_MERGE)) + return blk_rq_bytes(rq); + + /* + * Currently the only 'mixing' which can happen is between + * different fastfail types. We can safely fail portions + * which have all the failfast bits that the first one has - + * the ones which are at least as eager to fail as the first + * one. 
+ */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + if ((bio->bi_opf & ff) != ff) + break; + bytes += bio->bi_iter.bi_size; + } + + /* this could lead to infinite loop */ + BUG_ON(blk_rq_bytes(rq) && !bytes); + return bytes; +} + /* Helper for scsi_io_completion() when "reprep" action required. */ static void scsi_io_completion_reprep(struct scsi_cmnd *cmd, struct request_queue *q) @@ -794,7 +834,7 @@ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) scsi_print_command(cmd); } } - if (!scsi_end_request(req, blk_stat, blk_rq_err_bytes(req))) + if (!scsi_end_request(req, blk_stat, scsi_rq_err_bytes(req))) return; fallthrough; case ACTION_REPREP: diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2949d9ac7484..a78d9a0f2a1b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -947,7 +947,6 @@ struct req_iterator { * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request * blk_rq_cur_bytes() : bytes left in the current segment - * blk_rq_err_bytes() : bytes left till the next error boundary * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment * blk_rq_stats_sectors() : sectors of the entire request used for stats @@ -971,8 +970,6 @@ static inline int blk_rq_cur_bytes(const struct request *rq) return bio_iovec(rq->bio).bv_len; } -unsigned int blk_rq_err_bytes(const struct request *rq); - static inline unsigned int blk_rq_sectors(const struct request *rq) { return blk_rq_bytes(rq) >> SECTOR_SHIFT; From 786d4e01c550e8bb7c9f9f23bca0596a2a33483c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:13:55 +0100 Subject: [PATCH 002/132] block: remove rq_flush_dcache_pages This function is trivial, and flush_dcache_page is always defined, so just open code it in the 2.5 callers. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-3-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 19 ------------------- drivers/mtd/mtd_blkdevs.c | 10 ++++++++-- drivers/mtd/ubi/block.c | 6 +++++- include/linux/blk-mq.h | 10 ---------- 4 files changed, 13 insertions(+), 32 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 682b112f513f..039e28509f59 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1300,25 +1300,6 @@ void blk_steal_bios(struct bio_list *list, struct request *rq) } EXPORT_SYMBOL_GPL(blk_steal_bios); -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -/** - * rq_flush_dcache_pages - Helper function to flush all pages in a request - * @rq: the request to be flushed - * - * Description: - * Flush all pages in @rq. 
- */ -void rq_flush_dcache_pages(struct request *rq) -{ - struct req_iterator iter; - struct bio_vec bvec; - - rq_for_each_segment(bvec, rq, iter) - flush_dcache_page(bvec.bv_page); -} -EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); -#endif - /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 4eaba6f4ec68..66f81d42fe77 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -46,6 +46,8 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, struct mtd_blktrans_dev *dev, struct request *req) { + struct req_iterator iter; + struct bio_vec bvec; unsigned long block, nsect; char *buf; @@ -76,13 +78,17 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, } } kunmap(bio_page(req->bio)); - rq_flush_dcache_pages(req); + + rq_for_each_segment(bvec, req, iter) + flush_dcache_page(bvec.bv_page); return BLK_STS_OK; case REQ_OP_WRITE: if (!tr->writesect) return BLK_STS_IOERR; - rq_flush_dcache_pages(req); + rq_for_each_segment(bvec, req, iter) + flush_dcache_page(bvec.bv_page); + buf = kmap(bio_page(req->bio)) + bio_offset(req->bio); for (; nsect > 0; nsect--, block++, buf += tr->blksize) { if (tr->writesect(dev, block, buf)) { diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 062e6c2c45f5..302426ab30f8 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -294,6 +294,8 @@ static void ubiblock_do_work(struct work_struct *work) int ret; struct ubiblock_pdu *pdu = container_of(work, struct ubiblock_pdu, work); struct request *req = blk_mq_rq_from_pdu(pdu); + struct req_iterator iter; + struct bio_vec bvec; blk_mq_start_request(req); @@ -305,7 +307,9 @@ static void ubiblock_do_work(struct work_struct *work) blk_rq_map_sg(req->q, req, pdu->usgl.sg); ret = ubiblock_read(pdu); - rq_flush_dcache_pages(req); + + rq_for_each_segment(bvec, req, iter) + flush_dcache_page(bvec.bv_page); blk_mq_end_request(req, errno_to_blk_status(ret)); } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a78d9a0f2a1b..308edc2a4925 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1132,14 +1132,4 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq) } #endif /* CONFIG_BLK_DEV_ZONED */ -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" -#endif -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -void rq_flush_dcache_pages(struct request *rq); -#else -static inline void rq_flush_dcache_pages(struct request *rq) -{ -} -#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ #endif /* BLK_MQ_H */ From 4054cff92c357813b6861b622122b344990f7e31 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:13:56 +0100 Subject: [PATCH 003/132] block: remove blk-exec.c All this code is tightly coupled to the blk-mq core, so move it there. 
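For reference, a minimal sketch of the calling pattern the moved helpers
serve; the request allocation, the REQ_OP_DRV_IN opcode and the error
handling below are illustrative assumptions, not part of this change:

	static int example_sync_passthrough(struct gendisk *disk)
	{
		struct request *rq;
		blk_status_t status;

		/* assumed caller context: allocate a passthrough request */
		rq = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		/*
		 * blk_execute_rq() inserts the request and waits on an
		 * on-stack completion; polled queues are handled via
		 * blk_rq_poll_completion() as the moved code shows.
		 */
		status = blk_execute_rq(disk, rq, false);

		blk_mq_free_request(rq);
		return blk_status_to_errno(status);
	}
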
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-4-hch@lst.de [axboe: remove doc generation for blk-exec.c] Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Documentation/core-api/kernel-api.rst | 3 - .../zh_CN/core-api/kernel-api.rst | 2 - block/Makefile | 2 +- block/blk-exec.c | 116 ------------------ block/blk-mq.c | 107 ++++++++++++++++ 5 files changed, 108 insertions(+), 122 deletions(-) delete mode 100644 block/blk-exec.c diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 2e7186805148..19f501d58f5d 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -294,9 +294,6 @@ Block Devices .. kernel-doc:: block/blk-settings.c :export: -.. kernel-doc:: block/blk-exec.c - :export: - .. kernel-doc:: block/blk-flush.c :export: diff --git a/Documentation/translations/zh_CN/core-api/kernel-api.rst b/Documentation/translations/zh_CN/core-api/kernel-api.rst index ab7d81889340..e45fe80d1cd8 100644 --- a/Documentation/translations/zh_CN/core-api/kernel-api.rst +++ b/Documentation/translations/zh_CN/core-api/kernel-api.rst @@ -292,8 +292,6 @@ block/blk-sysfs.c block/blk-settings.c -block/blk-exec.c - block/blk-flush.c block/blk-lib.c diff --git a/block/Makefile b/block/Makefile index 44df57e562bf..f38eaa612929 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o blk-timeout.o \ + blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ diff --git a/block/blk-exec.c b/block/blk-exec.c deleted file mode 100644 index 1b8b47f6e79b..000000000000 --- a/block/blk-exec.c +++ /dev/null @@ -1,116 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions related to setting various queue properties from drivers - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/blk-mq.h> -#include <linux/sched/sysctl.h> - -#include "blk.h" -#include "blk-mq-sched.h" - -/** - * blk_end_sync_rq - executes a completion event on a request - * @rq: request to complete - * @error: end I/O status of the request - */ -static void blk_end_sync_rq(struct request *rq, blk_status_t error) -{ - struct completion *waiting = rq->end_io_data; - - rq->end_io_data = (void *)(uintptr_t)error; - - /* - * complete last, if this is a stack request the process (and thus - * the rq pointer) could be invalid right after this complete() - */ - complete(waiting); -} - -/** - * blk_execute_rq_nowait - insert a request to I/O scheduler for execution - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * @done: I/O completion handler - * - * Description: - * Insert a fully prepared request at the back of the I/O scheduler queue - * for execution. Don't wait for completion. - * - * Note: - * This function will invoke @done directly if the queue is dead. 
- */ -void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, - int at_head, rq_end_io_fn *done) -{ - WARN_ON(irqs_disabled()); - WARN_ON(!blk_rq_is_passthrough(rq)); - - rq->rq_disk = bd_disk; - rq->end_io = done; - - blk_account_io_start(rq); - - /* - * don't check dying flag for MQ because the request won't - * be reused after dying flag is set - */ - blk_mq_sched_insert_request(rq, at_head, true, false); -} -EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); - -static bool blk_rq_is_poll(struct request *rq) -{ - if (!rq->mq_hctx) - return false; - if (rq->mq_hctx->type != HCTX_TYPE_POLL) - return false; - if (WARN_ON_ONCE(!rq->bio)) - return false; - return true; -} - -static void blk_rq_poll_completion(struct request *rq, struct completion *wait) -{ - do { - bio_poll(rq->bio, NULL, 0); - cond_resched(); - } while (!completion_done(wait)); -} - -/** - * blk_execute_rq - insert a request into queue for execution - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * - * Description: - * Insert a fully prepared request at the back of the I/O scheduler queue - * for execution and wait for completion. - * Return: The blk_status_t result provided to blk_mq_end_request(). - */ -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head) -{ - DECLARE_COMPLETION_ONSTACK(wait); - unsigned long hang_check; - - rq->end_io_data = &wait; - blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - - if (blk_rq_is_poll(rq)) - blk_rq_poll_completion(rq, &wait); - else if (hang_check) - while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); - else - wait_for_completion_io(&wait); - - return (blk_status_t)(uintptr_t)rq->end_io_data; -} -EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-mq.c b/block/blk-mq.c index 8874a63ae952..3e5dc87e0cfc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,6 +28,7 @@ #include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> +#include <linux/sched/sysctl.h> #include <trace/events/block.h> @@ -1058,6 +1059,112 @@ void blk_mq_start_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_start_request); +/** + * blk_end_sync_rq - executes a completion event on a request + * @rq: request to complete + * @error: end I/O status of the request + */ +static void blk_end_sync_rq(struct request *rq, blk_status_t error) +{ + struct completion *waiting = rq->end_io_data; + + rq->end_io_data = (void *)(uintptr_t)error; + + /* + * complete last, if this is a stack request the process (and thus + * the rq pointer) could be invalid right after this complete() + */ + complete(waiting); +} + +/** + * blk_execute_rq_nowait - insert a request to I/O scheduler for execution + * @bd_disk: matching gendisk + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * @done: I/O completion handler + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution. Don't wait for completion. + * + * Note: + * This function will invoke @done directly if the queue is dead. 
+ */ +void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, + int at_head, rq_end_io_fn *done) +{ + WARN_ON(irqs_disabled()); + WARN_ON(!blk_rq_is_passthrough(rq)); + + rq->rq_disk = bd_disk; + rq->end_io = done; + + blk_account_io_start(rq); + + /* + * don't check dying flag for MQ because the request won't + * be reused after dying flag is set + */ + blk_mq_sched_insert_request(rq, at_head, true, false); +} +EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); + +static bool blk_rq_is_poll(struct request *rq) +{ + if (!rq->mq_hctx) + return false; + if (rq->mq_hctx->type != HCTX_TYPE_POLL) + return false; + if (WARN_ON_ONCE(!rq->bio)) + return false; + return true; +} + +static void blk_rq_poll_completion(struct request *rq, struct completion *wait) +{ + do { + bio_poll(rq->bio, NULL, 0); + cond_resched(); + } while (!completion_done(wait)); +} + +/** + * blk_execute_rq - insert a request into queue for execution + * @bd_disk: matching gendisk + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution and wait for completion. + * Return: The blk_status_t result provided to blk_mq_end_request(). + */ +blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, + int at_head) +{ + DECLARE_COMPLETION_ONSTACK(wait); + unsigned long hang_check; + + rq->end_io_data = &wait; + blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + + if (blk_rq_is_poll(rq)) + blk_rq_poll_completion(rq, &wait); + else if (hang_check) + while (!wait_for_completion_io_timeout(&wait, + hang_check * (HZ/2))) + ; + else + wait_for_completion_io(&wait); + + return (blk_status_t)(uintptr_t)rq->end_io_data; +} +EXPORT_SYMBOL(blk_execute_rq); + static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; From b84c5b50d329bf7cfdba6bd5c8a99f1b8604e301 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:13:57 +0100 Subject: [PATCH 004/132] blk-mq: move blk_mq_flush_plug_list Move blk_mq_flush_plug_list and blk_mq_plug_issue_direct down in blk-mq.c to prepare for marking blk_mq_request_issue_directly static without the need of a forward declaration. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-5-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 184 ++++++++++++++++++++++++------------------------- 1 file changed, 92 insertions(+), 92 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3e5dc87e0cfc..df28e5ef8c2d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2309,98 +2309,6 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued, *queued = 0; } -static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) -{ - struct blk_mq_hw_ctx *hctx = NULL; - struct request *rq; - int queued = 0; - int errors = 0; - - while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(plug->mq_list); - blk_status_t ret; - - if (hctx != rq->mq_hctx) { - if (hctx) - blk_mq_commit_rqs(hctx, &queued, from_schedule); - hctx = rq->mq_hctx; - } - - ret = blk_mq_request_issue_directly(rq, last); - switch (ret) { - case BLK_STS_OK: - queued++; - break; - case BLK_STS_RESOURCE: - case BLK_STS_DEV_RESOURCE: - blk_mq_request_bypass_insert(rq, false, last); - blk_mq_commit_rqs(hctx, &queued, from_schedule); - return; - default: - blk_mq_end_request(rq, ret); - errors++; - break; - } - } - - /* - * If we didn't flush the entire list, we could have told the driver - * there was more coming, but that turned out to be a lie. - */ - if (errors) - blk_mq_commit_rqs(hctx, &queued, from_schedule); -} - -void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) -{ - struct blk_mq_hw_ctx *this_hctx; - struct blk_mq_ctx *this_ctx; - unsigned int depth; - LIST_HEAD(list); - - if (rq_list_empty(plug->mq_list)) - return; - plug->rq_count = 0; - - if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - blk_mq_plug_issue_direct(plug, false); - if (rq_list_empty(plug->mq_list)) - return; - } - - this_hctx = NULL; - this_ctx = NULL; - depth = 0; - do { - struct request *rq; - - rq = rq_list_pop(&plug->mq_list); - - if (!this_hctx) { - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { - trace_block_unplug(this_hctx->queue, depth, - !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, - &list, from_schedule); - depth = 0; - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - - } - - list_add(&rq->queuelist, &list); - depth++; - } while (!rq_list_empty(plug->mq_list)); - - if (!list_empty(&list)) { - trace_block_unplug(this_hctx->queue, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, - from_schedule); - } -} - static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { @@ -2540,6 +2448,98 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) return ret; } +static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_hw_ctx *hctx = NULL; + struct request *rq; + int queued = 0; + int errors = 0; + + while ((rq = rq_list_pop(&plug->mq_list))) { + bool last = rq_list_empty(plug->mq_list); + blk_status_t ret; + + if (hctx != rq->mq_hctx) { + if (hctx) + blk_mq_commit_rqs(hctx, &queued, from_schedule); + hctx = rq->mq_hctx; + } + + ret = blk_mq_request_issue_directly(rq, last); + switch (ret) { + case BLK_STS_OK: + queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, false, last); + 
blk_mq_commit_rqs(hctx, &queued, from_schedule); + return; + default: + blk_mq_end_request(rq, ret); + errors++; + break; + } + } + + /* + * If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. + */ + if (errors) + blk_mq_commit_rqs(hctx, &queued, from_schedule); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_hw_ctx *this_hctx; + struct blk_mq_ctx *this_ctx; + unsigned int depth; + LIST_HEAD(list); + + if (rq_list_empty(plug->mq_list)) + return; + plug->rq_count = 0; + + if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { + blk_mq_plug_issue_direct(plug, false); + if (rq_list_empty(plug->mq_list)) + return; + } + + this_hctx = NULL; + this_ctx = NULL; + depth = 0; + do { + struct request *rq; + + rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + trace_block_unplug(this_hctx->queue, depth, + !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &list, from_schedule); + depth = 0; + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + + } + + list_add(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + if (!list_empty(&list)) { + trace_block_unplug(this_hctx->queue, depth, !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, + from_schedule); + } +} + void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { From 06c8c691e2820077936e59ad334eb806e90b69eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:13:58 +0100 Subject: [PATCH 005/132] block: move request based cloning helpers to blk-mq.c Keep all the request based code together. 
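As context for the move, a hedged sketch of the request-stacking pattern
(request-based device mapper is the in-tree user) that these helpers
implement; the surrounding driver state and the error policy below are
assumptions for illustration only:

	static blk_status_t example_stack_and_issue(struct request *clone,
						    struct request *rq_src,
						    struct request_queue *lower_q)
	{
		blk_status_t ret;

		/* clone the bios of the original request into @clone */
		if (blk_rq_prep_clone(clone, rq_src, NULL, GFP_ATOMIC,
				      NULL, NULL))
			return BLK_STS_RESOURCE;

		/*
		 * check the clone against the lower queue's limits and issue
		 * it directly, bypassing any scheduler on the lower device
		 */
		ret = blk_insert_cloned_request(lower_q, clone);
		if (ret != BLK_STS_OK)
			blk_rq_unprep_clone(clone);
		return ret;
	}
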
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-6-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 184 +---------------------------------------------- block/blk-mq.c | 175 +++++++++++++++++++++++++++++++++++++++++++- block/blk-mq.h | 3 - block/blk.h | 10 +++ 4 files changed, 185 insertions(+), 187 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 039e28509f59..5d6017d7f84e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -594,7 +594,7 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct block_device *part, unsigned int bytes) +bool should_fail_request(struct block_device *part, unsigned int bytes) { return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } @@ -608,15 +608,6 @@ static int __init fail_make_request_debugfs(void) } late_initcall(fail_make_request_debugfs); - -#else /* CONFIG_FAIL_MAKE_REQUEST */ - -static inline bool should_fail_request(struct block_device *part, - unsigned int bytes) -{ - return false; -} - #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool bio_check_ro(struct bio *bio) @@ -1090,92 +1081,6 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); -/** - * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for the new queue limits - * @q: the queue - * @rq: the request being checked - * - * Description: - * @rq may have been made based on weaker limitations of upper-level queues - * in request stacking drivers, and it may violate the limitation of @q. - * Since the block layer and the underlying device driver trust @rq - * after it is inserted to @q, it should be checked against @q before - * the insertion using this generic function. - * - * Request stacking drivers like request-based dm may change the queue - * limits when retrying requests on other queues. Those requests need - * to be checked against the new queue limits again during dispatch. - */ -static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, - struct request *rq) -{ - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); - - if (blk_rq_sectors(rq) > max_sectors) { - /* - * SCSI device does not have a good way to return if - * Write Same/Zero is actually supported. If a device rejects - * a non-read/write command (discard, write same,etc.) the - * low-level device driver will set the relevant queue limit to - * 0 to prevent blk-lib from issuing more of the offending - * operations. Commands queued prior to the queue limit being - * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O - * errors being propagated to upper layers. - */ - if (max_sectors == 0) - return BLK_STS_NOTSUPP; - - printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", - __func__, blk_rq_sectors(rq), max_sectors); - return BLK_STS_IOERR; - } - - /* - * The queue settings related to segment counting may differ from the - * original queue. - */ - rq->nr_phys_segments = blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_segments(q)) { - printk(KERN_ERR "%s: over max segments limit. 
(%hu > %hu)\n", - __func__, rq->nr_phys_segments, queue_max_segments(q)); - return BLK_STS_IOERR; - } - - return BLK_STS_OK; -} - -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued - */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) -{ - blk_status_t ret; - - ret = blk_cloned_rq_check_limits(q, rq); - if (ret != BLK_STS_OK) - return ret; - - if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) - return BLK_STS_IOERR; - - if (blk_crypto_insert_cloned_request(rq)) - return BLK_STS_IOERR; - - blk_account_io_start(rq); - - /* - * Since we have a scheduler attached on the top device, - * bypass a potential scheduler on the bottom device for - * insert. - */ - return blk_mq_request_issue_directly(rq, true); -} -EXPORT_SYMBOL_GPL(blk_insert_cloned_request); - static void update_io_ticks(struct block_device *part, unsigned long now, bool end) { @@ -1328,93 +1233,6 @@ int blk_lld_busy(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_lld_busy); -/** - * blk_rq_unprep_clone - Helper function to free all bios in a cloned request - * @rq: the clone request to be cleaned up - * - * Description: - * Free all bios in @rq for a cloned request. - */ -void blk_rq_unprep_clone(struct request *rq) -{ - struct bio *bio; - - while ((bio = rq->bio) != NULL) { - rq->bio = bio->bi_next; - - bio_put(bio); - } -} -EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); - -/** - * blk_rq_prep_clone - Helper function to setup clone request - * @rq: the request to be setup - * @rq_src: original request to be cloned - * @bs: bio_set that bios for clone are allocated from - * @gfp_mask: memory allocation mask for bio - * @bio_ctr: setup function to be called for each clone bio. - * Returns %0 for success, non %0 for failure. - * @data: private data to be passed to @bio_ctr - * - * Description: - * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * Also, pages which the original bios are pointing to are not copied - * and the cloned bios just point same pages. - * So cloned bios must be completed before original bios, which means - * the caller must complete @rq before @rq_src. - */ -int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data) -{ - struct bio *bio, *bio_src; - - if (!bs) - bs = &fs_bio_set; - - __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); - if (!bio) - goto free_and_out; - - if (bio_ctr && bio_ctr(bio, bio_src, data)) - goto free_and_out; - - if (rq->bio) { - rq->biotail->bi_next = bio; - rq->biotail = bio; - } else { - rq->bio = rq->biotail = bio; - } - bio = NULL; - } - - /* Copy attributes of the original request to the clone request. 
*/ - rq->__sector = blk_rq_pos(rq_src); - rq->__data_len = blk_rq_bytes(rq_src); - if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; - rq->special_vec = rq_src->special_vec; - } - rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; - - if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) - goto free_and_out; - - return 0; - -free_and_out: - if (bio) - bio_put(bio); - blk_rq_unprep_clone(rq); - - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(blk_rq_prep_clone); - int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); diff --git a/block/blk-mq.c b/block/blk-mq.c index df28e5ef8c2d..812dac9ecb29 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2435,7 +2435,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, hctx_unlock(hctx, srcu_idx); } -blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) +static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { blk_status_t ret; int srcu_idx; @@ -2824,6 +2824,179 @@ void blk_mq_submit_bio(struct bio *bio) } } +/** + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for the new queue limits + * @q: the queue + * @rq: the request being checked + * + * Description: + * @rq may have been made based on weaker limitations of upper-level queues + * in request stacking drivers, and it may violate the limitation of @q. + * Since the block layer and the underlying device driver trust @rq + * after it is inserted to @q, it should be checked against @q before + * the insertion using this generic function. + * + * Request stacking drivers like request-based dm may change the queue + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. + */ +static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) +{ + unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + + if (blk_rq_sectors(rq) > max_sectors) { + /* + * SCSI device does not have a good way to return if + * Write Same/Zero is actually supported. If a device rejects + * a non-read/write command (discard, write same,etc.) the + * low-level device driver will set the relevant queue limit to + * 0 to prevent blk-lib from issuing more of the offending + * operations. Commands queued prior to the queue limit being + * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O + * errors being propagated to upper layers. + */ + if (max_sectors == 0) + return BLK_STS_NOTSUPP; + + printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", + __func__, blk_rq_sectors(rq), max_sectors); + return BLK_STS_IOERR; + } + + /* + * The queue settings related to segment counting may differ from the + * original queue. + */ + rq->nr_phys_segments = blk_recalc_rq_segments(rq); + if (rq->nr_phys_segments > queue_max_segments(q)) { + printk(KERN_ERR "%s: over max segments limit. 
(%hu > %hu)\n", + __func__, rq->nr_phys_segments, queue_max_segments(q)); + return BLK_STS_IOERR; + } + + return BLK_STS_OK; +} + +/** + * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @q: the queue to submit the request + * @rq: the request being queued + */ +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +{ + blk_status_t ret; + + ret = blk_cloned_rq_check_limits(q, rq); + if (ret != BLK_STS_OK) + return ret; + + if (rq->rq_disk && + should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) + return BLK_STS_IOERR; + + if (blk_crypto_insert_cloned_request(rq)) + return BLK_STS_IOERR; + + blk_account_io_start(rq); + + /* + * Since we have a scheduler attached on the top device, + * bypass a potential scheduler on the bottom device for + * insert. + */ + return blk_mq_request_issue_directly(rq, true); +} +EXPORT_SYMBOL_GPL(blk_insert_cloned_request); + +/** + * blk_rq_unprep_clone - Helper function to free all bios in a cloned request + * @rq: the clone request to be cleaned up + * + * Description: + * Free all bios in @rq for a cloned request. + */ +void blk_rq_unprep_clone(struct request *rq) +{ + struct bio *bio; + + while ((bio = rq->bio) != NULL) { + rq->bio = bio->bi_next; + + bio_put(bio); + } +} +EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); + +/** + * blk_rq_prep_clone - Helper function to setup clone request + * @rq: the request to be setup + * @rq_src: original request to be cloned + * @bs: bio_set that bios for clone are allocated from + * @gfp_mask: memory allocation mask for bio + * @bio_ctr: setup function to be called for each clone bio. + * Returns %0 for success, non %0 for failure. + * @data: private data to be passed to @bio_ctr + * + * Description: + * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. + * Also, pages which the original bios are pointing to are not copied + * and the cloned bios just point same pages. + * So cloned bios must be completed before original bios, which means + * the caller must complete @rq before @rq_src. + */ +int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data) +{ + struct bio *bio, *bio_src; + + if (!bs) + bs = &fs_bio_set; + + __rq_for_each_bio(bio_src, rq_src) { + bio = bio_clone_fast(bio_src, gfp_mask, bs); + if (!bio) + goto free_and_out; + + if (bio_ctr && bio_ctr(bio, bio_src, data)) + goto free_and_out; + + if (rq->bio) { + rq->biotail->bi_next = bio; + rq->biotail = bio; + } else { + rq->bio = rq->biotail = bio; + } + bio = NULL; + } + + /* Copy attributes of the original request to the clone request. 
*/ + rq->__sector = blk_rq_pos(rq_src); + rq->__data_len = blk_rq_bytes(rq_src); + if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->special_vec = rq_src->special_vec; + } + rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->ioprio = rq_src->ioprio; + + if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) + goto free_and_out; + + return 0; + +free_and_out: + if (bio) + bio_put(bio); + blk_rq_unprep_clone(rq); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_rq_prep_clone); + static size_t order_to_size(unsigned int order) { return (size_t)PAGE_SIZE << order; diff --git a/block/blk-mq.h b/block/blk-mq.h index afcf9931a489..d516c7a46f57 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -65,9 +65,6 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); - -/* Used by blk_insert_cloned_request() to issue request directly */ -blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last); void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); diff --git a/block/blk.h b/block/blk.h index ccde6e6f1736..8a3761b6dc33 100644 --- a/block/blk.h +++ b/block/blk.h @@ -493,4 +493,14 @@ int disk_register_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *new_iars); void disk_unregister_independent_access_ranges(struct gendisk *disk); +#ifdef CONFIG_FAIL_MAKE_REQUEST +bool should_fail_request(struct block_device *part, unsigned int bytes); +#else /* CONFIG_FAIL_MAKE_REQUEST */ +static inline bool should_fail_request(struct block_device *part, + unsigned int bytes) +{ + return false; +} +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + #endif /* BLK_INTERNAL_H */ From 52fdbbcc83f35da90f857668acc387470ed84606 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:13:59 +0100 Subject: [PATCH 006/132] block: move blk_rq_init to blk-mq.c blk_rq_init deals with a request structure, so move it to blk-mq.c Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-7-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 17 ----------------- block/blk-mq.c | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5d6017d7f84e..6c0bc2d895a1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -109,23 +109,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); -void blk_rq_init(struct request_queue *q, struct request *rq) -{ - memset(rq, 0, sizeof(*rq)); - - INIT_LIST_HEAD(&rq->queuelist); - rq->q = q; - rq->__sector = (sector_t) -1; - INIT_HLIST_NODE(&rq->hash); - RB_CLEAR_NODE(&rq->rb_node); - rq->tag = BLK_MQ_NO_TAG; - rq->internal_tag = BLK_MQ_NO_TAG; - rq->start_time_ns = ktime_get_ns(); - rq->part = NULL; - blk_crypto_rq_set_defaults(rq); -} -EXPORT_SYMBOL(blk_rq_init); - #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), diff --git a/block/blk-mq.c b/block/blk-mq.c index 812dac9ecb29..3217139d2e0b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -328,6 +328,23 @@ void blk_mq_wake_waiters(struct request_queue *q) blk_mq_tag_wakeup_all(hctx->tags, true); } +void blk_rq_init(struct 
request_queue *q, struct request *rq) +{ + memset(rq, 0, sizeof(*rq)); + + INIT_LIST_HEAD(&rq->queuelist); + rq->q = q; + rq->__sector = (sector_t) -1; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = BLK_MQ_NO_TAG; + rq->start_time_ns = ktime_get_ns(); + rq->part = NULL; + blk_crypto_rq_set_defaults(rq); +} +EXPORT_SYMBOL(blk_rq_init); + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns) { From f2b8f3ce989d8066f2dedaad223b3e73c4485223 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:14:00 +0100 Subject: [PATCH 007/132] block: move blk_steal_bios to blk-mq.c Keep all the request based code together. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-8-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 21 --------------------- block/blk-mq.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 6c0bc2d895a1..65891f058f3d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1167,27 +1167,6 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op, } EXPORT_SYMBOL(disk_end_io_acct); -/* - * Steal bios from a request and add them to a bio list. - * The request must not have been partially completed before. - */ -void blk_steal_bios(struct bio_list *list, struct request *rq) -{ - if (rq->bio) { - if (list->tail) - list->tail->bi_next = rq->bio; - else - list->head = rq->bio; - list->tail = rq->biotail; - - rq->bio = NULL; - rq->biotail = NULL; - } - - rq->__data_len = 0; -} -EXPORT_SYMBOL_GPL(blk_steal_bios); - /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked diff --git a/block/blk-mq.c b/block/blk-mq.c index 3217139d2e0b..cd5f31c4d2fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3014,6 +3014,27 @@ free_and_out: } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. + */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + static size_t order_to_size(unsigned int order) { return (size_t)PAGE_SIZE << order; From 450b7879e34517c3ebc3a35a53806fe40e60fac2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:14:01 +0100 Subject: [PATCH 008/132] block: move blk_account_io_{start,done} to blk-mq.c These are only used for request based I/O, so move them where they are used. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-9-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 27 +-------------------------- block/blk-mq.c | 42 ++++++++++++++++++++++++++++++++++++++++++ block/blk.h | 21 +-------------------- 3 files changed, 44 insertions(+), 46 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 65891f058f3d..29c4db03742b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1064,8 +1064,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); -static void update_io_ticks(struct block_device *part, unsigned long now, - bool end) +void update_io_ticks(struct block_device *part, unsigned long now, bool end) { unsigned long stamp; again: @@ -1080,30 +1079,6 @@ again: } } -void __blk_account_io_done(struct request *req, u64 now) -{ - const int sgrp = op_stat_group(req_op(req)); - - part_stat_lock(); - update_io_ticks(req->part, jiffies, true); - part_stat_inc(req->part, ios[sgrp]); - part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); - part_stat_unlock(); -} - -void __blk_account_io_start(struct request *rq) -{ - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) - rq->part = rq->bio->bi_bdev; - else - rq->part = rq->rq_disk->part0; - - part_stat_lock(); - update_io_ticks(rq->part, jiffies, false); - part_stat_unlock(); -} - static unsigned long __part_start_io_acct(struct block_device *part, unsigned int sectors, unsigned int op) { diff --git a/block/blk-mq.c b/block/blk-mq.c index cd5f31c4d2fd..3921c7cfd64c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -809,6 +809,48 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); +static void __blk_account_io_done(struct request *req, u64 now) +{ + const int sgrp = op_stat_group(req_op(req)); + + part_stat_lock(); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); +} + +static inline void blk_account_io_done(struct request *req, u64 now) +{ + /* + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. 
+ */ + if (blk_do_io_stat(req) && req->part && + !(req->rq_flags & RQF_FLUSH_SEQ)) + __blk_account_io_done(req, now); +} + +static void __blk_account_io_start(struct request *rq) +{ + /* passthrough requests can hold bios that do not have ->bi_bdev set */ + if (rq->bio && rq->bio->bi_bdev) + rq->part = rq->bio->bi_bdev; + else + rq->part = rq->rq_disk->part0; + + part_stat_lock(); + update_io_ticks(rq->part, jiffies, false); + part_stat_unlock(); +} + +static inline void blk_account_io_start(struct request *req) +{ + if (blk_do_io_stat(req)) + __blk_account_io_start(req); +} + static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { if (rq->rq_flags & RQF_STATS) { diff --git a/block/blk.h b/block/blk.h index 8a3761b6dc33..50aae8c0e03c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -257,9 +257,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); -void __blk_account_io_start(struct request *req); -void __blk_account_io_done(struct request *req, u64 now); - /* * Plug flush limits */ @@ -350,23 +347,7 @@ static inline bool blk_do_io_stat(struct request *rq) return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk; } -static inline void blk_account_io_done(struct request *req, u64 now) -{ - /* - * Account IO completion. flush_rq isn't accounted as a - * normal IO on queueing nor completion. Accounting the - * containing request is enough. - */ - if (blk_do_io_stat(req) && req->part && - !(req->rq_flags & RQF_FLUSH_SEQ)) - __blk_account_io_done(req, now); -} - -static inline void blk_account_io_start(struct request *req) -{ - if (blk_do_io_stat(req)) - __blk_account_io_start(req); -} +void update_io_ticks(struct block_device *part, unsigned long now, bool end); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { From 22350ad7f15931b65d288e3e1462a51bfbfa5c4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:14:02 +0100 Subject: [PATCH 009/132] block: move blk_dump_rq_flags to blk-mq.c blk_dump_rq_flags deals with a request, so move it to blk-mq.c. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-10-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 14 -------------- block/blk-mq.c | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 29c4db03742b..ae9bea11695e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -217,20 +217,6 @@ void blk_print_req_error(struct request *req, blk_status_t status) IOPRIO_PRIO_CLASS(req->ioprio)); } -void blk_dump_rq_flags(struct request *rq, char *msg) -{ - printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, - rq->rq_disk ? 
rq->rq_disk->disk_name : "?", - (unsigned long long) rq->cmd_flags); - - printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", - (unsigned long long)blk_rq_pos(rq), - blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); - printk(KERN_INFO " bio %p, biotail %p, len %u\n", - rq->bio, rq->biotail, blk_rq_bytes(rq)); -} -EXPORT_SYMBOL(blk_dump_rq_flags); - /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue diff --git a/block/blk-mq.c b/block/blk-mq.c index 3921c7cfd64c..2a4eff639036 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -667,6 +667,20 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) blk_mq_free_request(rq); } +void blk_dump_rq_flags(struct request *rq, char *msg) +{ + printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, + rq->rq_disk ? rq->rq_disk->disk_name : "?", + (unsigned long long) rq->cmd_flags); + + printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", + (unsigned long long)blk_rq_pos(rq), + blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); +} +EXPORT_SYMBOL(blk_dump_rq_flags); + static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, blk_status_t error) { From 0d7a29a2b5eae30eff1e216badca76978a3de39f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:14:03 +0100 Subject: [PATCH 010/132] block: move blk_print_req_error to blk-mq.c This function is only used by the request completion path. Factor out a blk_status_to_str to keep blk_errors private in blk-core.c. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-11-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 15 +++------------ block/blk-mq.c | 13 +++++++++++++ block/blk.h | 2 +- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index ae9bea11695e..8e14438ef822 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -199,22 +199,13 @@ int blk_status_to_errno(blk_status_t status) } EXPORT_SYMBOL_GPL(blk_status_to_errno); -void blk_print_req_error(struct request *req, blk_status_t status) +const char *blk_status_to_str(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) - return; - - printk_ratelimited(KERN_ERR - "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " - "phys_seg %u prio class %u\n", - blk_errors[idx].name, - req->rq_disk ? req->rq_disk->disk_name : "?", - blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), - req->cmd_flags & ~REQ_OP_MASK, - req->nr_phys_segments, - IOPRIO_PRIO_CLASS(req->ioprio)); + return "<null>"; + return blk_errors[idx].name; } /** diff --git a/block/blk-mq.c b/block/blk-mq.c index 2a4eff639036..00df1eb031c0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -717,6 +717,19 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) } } +static void blk_print_req_error(struct request *req, blk_status_t status) +{ + printk_ratelimited(KERN_ERR + "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", + blk_status_to_str(status), + req->rq_disk ? 
req->rq_disk->disk_name : "?", + blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), + req->cmd_flags & ~REQ_OP_MASK, + req->nr_phys_segments, + IOPRIO_PRIO_CLASS(req->ioprio)); +} + /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed diff --git a/block/blk.h b/block/blk.h index 50aae8c0e03c..31ac75413287 100644 --- a/block/blk.h +++ b/block/blk.h @@ -250,7 +250,7 @@ static inline void blk_integrity_del(struct gendisk *disk) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -void blk_print_req_error(struct request *req, blk_status_t status); +const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, bool *same_queue_rq); From d9337a420aed38cb4ffa465e5a546360410bc0cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 17 Nov 2021 07:14:04 +0100 Subject: [PATCH 011/132] block: don't include blk-mq headers in blk-core.c All request based code is in the blk-mq files now. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Link: https://lore.kernel.org/r/20211117061404.331732-12-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 8e14438ef822..35a04d8c180a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,7 +16,6 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> #include <linux/blk-pm.h> #include <linux/blk-integrity.h> #include <linux/highmem.h> @@ -47,8 +46,6 @@ #include <trace/events/block.h> #include "blk.h" -#include "blk-mq.h" -#include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-throttle.h" From 86416916466514e4ae0b7296d20133b6427c4c1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:12 +0100 Subject: [PATCH 012/132] block: move GENHD_FL_NATIVE_CAPACITY to disk->state The flag to indicate an unlocked native capacity is dynamic state, not a driver capability flag, so move it to disk->state. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-2-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/partitions/core.c | 15 ++++++--------- include/linux/genhd.h | 8 +------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 334b72ef1d73..520292fee933 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -527,18 +527,15 @@ out_unlock: static bool disk_unlock_native_capacity(struct gendisk *disk) { - const struct block_device_operations *bdops = disk->fops; - - if (bdops->unlock_native_capacity && - !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { - printk(KERN_CONT "enabling native capacity\n"); - bdops->unlock_native_capacity(disk); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - return true; - } else { + if (!disk->fops->unlock_native_capacity || + test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) { printk(KERN_CONT "truncated\n"); return false; } + + printk(KERN_CONT "enabling native capacity\n"); + disk->fops->unlock_native_capacity(disk); + return true; } void blk_drop_partitions(struct gendisk *disk) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 74c410263113..e490a71e5e9d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -60,12 +60,6 @@ struct partition_meta_info { * (``BLOCK_EXT_MAJOR``). * This affects the maximum number of partitions. * - * ``GENHD_FL_NATIVE_CAPACITY`` (0x0080): based on information in the - * partition table, the device's capacity has been extended to its - * native capacity; i.e. the device has hidden capacity used by one - * of the partitions (this is a flag used so that native capacity is - * only ever unlocked once). - * * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is * blocked whenever a writer holds an exclusive lock. * @@ -86,7 +80,6 @@ struct partition_meta_info { #define GENHD_FL_CD 0x0008 #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 -#define GENHD_FL_NATIVE_CAPACITY 0x0080 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 0x0100 #define GENHD_FL_NO_PART_SCAN 0x0200 #define GENHD_FL_HIDDEN 0x0400 @@ -140,6 +133,7 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 #define GD_DEAD 2 +#define GD_NATIVE_CAPACITY 3 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ From 1545e0b419ba1d9b9bee4061d4826340afe6b0aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:13 +0100 Subject: [PATCH 013/132] block: move GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE to disk->event_flags GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE is all about the event reporting mechanism, so move it to the event_flags field. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-3-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bdev.c | 2 +- drivers/block/paride/pcd.c | 2 +- drivers/scsi/sr.c | 5 +++-- include/linux/genhd.h | 6 ++---- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index b1d087e5e205..dd84961bed7e 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -837,7 +837,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) * used in blkdev_get/put(). 
*/ if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && - (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { + (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; unblock_events = false; } diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index f6b1d63e96e1..430ee8004a51 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -928,8 +928,8 @@ static int pcd_init_unit(struct pcd_unit *cd, bool autoprobe, int port, disk->minors = 1; strcpy(disk->disk_name, cd->name); /* umm... */ disk->fops = &pcd_bdops; - disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->event_flags = DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE; if (!pi_init(cd->pi, autoprobe, port, mode, unit, protocol, delay, pcd_buffer, PI_PCD, verbose, cd->name)) { diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 8e4af111c078..be445dc35f2c 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -684,9 +684,10 @@ static int sr_probe(struct device *dev) disk->minors = 1; sprintf(disk->disk_name, "sr%d", minor); disk->fops = &sr_bdops; - disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; + disk->flags = GENHD_FL_CD; disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; - disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT; + disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT | + DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE; blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e490a71e5e9d..c1136ff3c91f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -60,9 +60,6 @@ struct partition_meta_info { * (``BLOCK_EXT_MAJOR``). * This affects the maximum number of partitions. * - * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is - * blocked whenever a writer holds an exclusive lock. - * * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled. * Used for loop devices in their default settings and some MMC * devices. @@ -80,7 +77,6 @@ struct partition_meta_info { #define GENHD_FL_CD 0x0008 #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 -#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 0x0100 #define GENHD_FL_NO_PART_SCAN 0x0200 #define GENHD_FL_HIDDEN 0x0400 @@ -94,6 +90,8 @@ enum { DISK_EVENT_FLAG_POLL = 1 << 0, /* Forward events to udev */ DISK_EVENT_FLAG_UEVENT = 1 << 1, + /* Block event polling when open for exclusive write */ + DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, }; struct disk_events; From 1a827ce1b9f2c740d2c6a228afd972970c18bc21 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:14 +0100 Subject: [PATCH 014/132] block: remove GENHD_FL_CD GENHD_FL_CD marks a gendisk as a vaguely CD-ROM like device. Besides being used internally inside of sunvdc.c an xen-blkfront it is used by xen-blkback as a hint to claim a device exported to a guest is a CD-ROM like device. Just check for disk->cdi instead which is the right indicator for "real" CD-ROM or DVD drivers. This will miss the paravirtualized guest drivers, but those make little sense to report anyway. 
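As a small illustration of the check the paragraph above refers to (the
helper name is hypothetical; disk_to_cdi() is the accessor the xen-blkback
hunk below switches to):

	/* true only for drivers that register a cdrom_device_info */
	static bool example_bdev_is_cdrom(struct block_device *bdev)
	{
		return disk_to_cdi(bdev->bd_disk) != NULL;
	}
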
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-4-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/block/sunvdc.c | 17 +++++++++-------- drivers/block/xen-blkback/xenbus.c | 2 +- drivers/block/xen-blkfront.c | 26 +++++++++++--------------- drivers/scsi/sr.c | 1 - include/linux/genhd.h | 5 ----- 5 files changed, 21 insertions(+), 30 deletions(-) diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 6f45a53f7cbf..2157936de623 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -143,8 +143,8 @@ static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int vdc_ioctl(struct block_device *bdev, fmode_t mode, unsigned command, unsigned long argument) { + struct vdc_port *port = bdev->bd_disk->private_data; int i; - struct gendisk *disk; switch (command) { case CDROMMULTISESSION: @@ -155,12 +155,15 @@ static int vdc_ioctl(struct block_device *bdev, fmode_t mode, return 0; case CDROM_GET_CAPABILITY: - disk = bdev->bd_disk; - - if (bdev->bd_disk && (disk->flags & GENHD_FL_CD)) + if (!vdc_version_supported(port, 1, 1)) + return -EINVAL; + switch (port->vdisk_mtype) { + case VD_MEDIA_TYPE_CD: + case VD_MEDIA_TYPE_DVD: return 0; - return -EINVAL; - + default: + return -EINVAL; + } default: pr_debug(PFX "ioctl %08x not supported\n", command); return -EINVAL; @@ -854,14 +857,12 @@ static int probe_disk(struct vdc_port *port) switch (port->vdisk_mtype) { case VD_MEDIA_TYPE_CD: pr_info(PFX "Virtual CDROM %s\n", port->disk_name); - g->flags |= GENHD_FL_CD; g->flags |= GENHD_FL_REMOVABLE; set_disk_ro(g, 1); break; case VD_MEDIA_TYPE_DVD: pr_info(PFX "Virtual DVD %s\n", port->disk_name); - g->flags |= GENHD_FL_CD; g->flags |= GENHD_FL_REMOVABLE; set_disk_ro(g, 1); break; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 914587aabca0..62125fd4af4a 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -510,7 +510,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, } vbd->size = vbd_sz(vbd); - if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) + if (cdrom || disk_to_cdi(vbd->bdev->bd_disk)) vbd->type |= VDISK_CDROM; if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) vbd->type |= VDISK_REMOVABLE; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8e3983e456f3..700c765a759a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -198,6 +198,7 @@ struct blkfront_info struct gendisk *gd; u16 sector_size; unsigned int physical_sector_size; + unsigned long vdisk_info; int vdevice; blkif_vdev_t handle; enum blkif_state connected; @@ -505,6 +506,7 @@ static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) static int blkif_ioctl(struct block_device *bdev, fmode_t mode, unsigned command, unsigned long argument) { + struct blkfront_info *info = bdev->bd_disk->private_data; int i; switch (command) { @@ -514,9 +516,9 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return -EFAULT; return 0; case CDROM_GET_CAPABILITY: - if (bdev->bd_disk->flags & GENHD_FL_CD) - return 0; - return -EINVAL; + if (!(info->vdisk_info & VDISK_CDROM)) + return -EINVAL; + return 0; default: return -EINVAL; } @@ -1057,9 +1059,8 @@ static char *encode_disk_name(char *ptr, unsigned int n) } static int xlvbd_alloc_gendisk(blkif_sector_t capacity, - struct blkfront_info *info, - u16 vdisk_info, u16 sector_size, - unsigned int 
physical_sector_size) + struct blkfront_info *info, u16 sector_size, + unsigned int physical_sector_size) { struct gendisk *gd; int nr_minors = 1; @@ -1157,15 +1158,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, xlvbd_flush(info); - if (vdisk_info & VDISK_READONLY) + if (info->vdisk_info & VDISK_READONLY) set_disk_ro(gd, 1); - - if (vdisk_info & VDISK_REMOVABLE) + if (info->vdisk_info & VDISK_REMOVABLE) gd->flags |= GENHD_FL_REMOVABLE; - if (vdisk_info & VDISK_CDROM) - gd->flags |= GENHD_FL_CD; - return 0; out_free_tag_set: @@ -2304,7 +2301,6 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long long sectors; unsigned long sector_size; unsigned int physical_sector_size; - unsigned int binfo; int err, i; struct blkfront_ring_info *rinfo; @@ -2342,7 +2338,7 @@ static void blkfront_connect(struct blkfront_info *info) err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "sectors", "%llu", §ors, - "info", "%u", &binfo, + "info", "%u", &info->vdisk_info, "sector-size", "%lu", §or_size, NULL); if (err) { @@ -2371,7 +2367,7 @@ static void blkfront_connect(struct blkfront_info *info) } } - err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, + err = xlvbd_alloc_gendisk(sectors, info, sector_size, physical_sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index be445dc35f2c..6646797a7756 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -684,7 +684,6 @@ static int sr_probe(struct device *dev) disk->minors = 1; sprintf(disk->disk_name, "sr%d", minor); disk->fops = &sr_bdops; - disk->flags = GENHD_FL_CD; disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT | DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c1136ff3c91f..74518c576fbb 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -46,10 +46,6 @@ struct partition_meta_info { * Must not be set for devices which are removed entirely when the * media is removed. * - * ``GENHD_FL_CD`` (0x0008): the block device is a CD-ROM-style - * device. - * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl. - * * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include * partition information in ``/proc/partitions`` or in the output of * printk_all_partitions(). @@ -74,7 +70,6 @@ struct partition_meta_info { #define GENHD_FL_REMOVABLE 0x0001 /* 2 is unused (used to be GENHD_FL_DRIVERFS) */ /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ -#define GENHD_FL_CD 0x0008 #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 #define GENHD_FL_NO_PART_SCAN 0x0200 From e3b3bad3f29878d13fdbc96f9e59674bd9b06bae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:15 +0100 Subject: [PATCH 015/132] block: remove a dead check in show_partition disk_max_parts never returns 0 given that ->minors for devices not using the extended dev_t must be non-zero, and disk_max_parts always returns DISK_MAX_PARTS for the latter. 
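For reference, the helper being reasoned about reads as follows at this point in the series; both branches return a non-zero value, so checking the capacity alone is enough:

	static inline int disk_max_parts(struct gendisk *disk)
	{
		if (disk->flags & GENHD_FL_EXT_DEVT)
			return DISK_MAX_PARTS;
		return disk->minors;
	}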
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-5-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/genhd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 30362aeacac4..1c326d3b54b4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -814,9 +814,7 @@ static int show_partition(struct seq_file *seqf, void *v) struct block_device *part; unsigned long idx; - /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && - (sgp->flags & GENHD_FL_REMOVABLE))) + if (!get_capacity(sgp)) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) return 0; From e16e506ccd673a3a888a34f8f694698305840044 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:16 +0100 Subject: [PATCH 016/132] block: merge disk_scan_partitions and blkdev_reread_part Unify the functionality that implements a partition rescan for a gendisk. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-6-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk.h | 1 + block/genhd.c | 19 ++++++++++++------- block/ioctl.c | 31 +++++-------------------------- 3 files changed, 18 insertions(+), 33 deletions(-) diff --git a/block/blk.h b/block/blk.h index 31ac75413287..423cba8ea0a6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -449,6 +449,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, unsigned int max_sectors, bool *same_page); struct request_queue *blk_alloc_queue(int node_id); +int disk_scan_partitions(struct gendisk *disk, fmode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); diff --git a/block/genhd.c b/block/genhd.c index 1c326d3b54b4..94f39c4333b8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -372,17 +372,21 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -static void disk_scan_partitions(struct gendisk *disk) +int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; + if (!disk_part_scan_enabled(disk)) + return -EINVAL; + if (disk->open_partitions) + return -EBUSY; set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); + bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + blkdev_put(bdev, mode); + return 0; } /** @@ -509,7 +513,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, goto out_unregister_bdi; bdev_add(disk->part0, ddev->devt); - disk_scan_partitions(disk); + if (get_capacity(disk)) + disk_scan_partitions(disk, FMODE_READ); /* * Announce the disk and partitions after all partitions are diff --git a/block/ioctl.c b/block/ioctl.c index 0a1d10ac2e1a..4a86340133e4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -82,31 +82,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif -static int blkdev_reread_part(struct block_device *bdev, fmode_t mode) -{ - struct block_device *tmp; - - if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (bdev->bd_disk->open_partitions) - return -EBUSY; - - /* - * Reopen the device to revalidate the 
driver state and force a - * partition rescan. - */ - mode &= ~FMODE_EXCL; - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - - tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - blkdev_put(tmp, mode); - return 0; -} - static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, unsigned long arg, unsigned long flags) { @@ -522,7 +497,11 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: - return blkdev_reread_part(bdev, mode); + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (bdev_is_partition(bdev)) + return -EINVAL; + return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: From 46e7eac647b34ed4106a8262f8bedbb90801fadd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:17 +0100 Subject: [PATCH 017/132] block: rename GENHD_FL_NO_PART_SCAN to GENHD_FL_NO_PART The GENHD_FL_NO_PART_SCAN flag controls more than just partition scanning, so rename it to GENHD_FL_NO_PART. Signed-off-by: Christoph Hellwig <hch@lst.de> Acked-by: Ulf Hansson <ulf.hansson@linaro.org> Link: https://lore.kernel.org/r/20211122130625.1136848-7-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/genhd.c | 2 +- drivers/block/loop.c | 8 ++++---- drivers/block/n64cart.c | 2 +- drivers/mmc/core/block.c | 4 ++-- include/linux/genhd.h | 13 ++++++------- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 94f39c4333b8..685794a2ebb0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -500,7 +500,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, * and don't bother scanning for partitions either. */ disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->flags |= GENHD_FL_NO_PART_SCAN; + disk->flags |= GENHD_FL_NO_PART; } else { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a154cab6cd98..7219d98c6fb8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1061,7 +1061,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; if (partscan) - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART; loop_global_unlock(lo, is_loop); if (partscan) @@ -1191,7 +1191,7 @@ out_unlock: mutex_lock(&lo->lo_mutex); lo->lo_flags = 0; if (!part_shift) - lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; + lo->lo_disk->flags |= GENHD_FL_NO_PART; lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); @@ -1301,7 +1301,7 @@ out_unfreeze: if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART; partscan = true; } out_unlock: @@ -2032,7 +2032,7 @@ static int loop_add(int i) * userspace tools. Parameters like this in general should be avoided.
*/ if (!part_shift) - disk->flags |= GENHD_FL_NO_PART_SCAN; + disk->flags |= GENHD_FL_NO_PART; disk->flags |= GENHD_FL_EXT_DEVT; atomic_set(&lo->lo_refcnt, 0); mutex_init(&lo->lo_mutex); diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 78282f01f581..4db9a8c244af 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -136,7 +136,7 @@ static int __init n64cart_probe(struct platform_device *pdev) goto out; disk->first_minor = 0; - disk->flags = GENHD_FL_NO_PART_SCAN; + disk->flags = GENHD_FL_NO_PART; disk->fops = &n64cart_fops; disk->private_data = &pdev->dev; strcpy(disk->disk_name, "n64cart"); diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 90e1bcd03b46..a71b3512c877 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2397,8 +2397,8 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, set_disk_ro(md->disk, md->read_only || default_ro); md->disk->flags = GENHD_FL_EXT_DEVT; if (area_type & (MMC_BLK_DATA_AREA_RPMB | MMC_BLK_DATA_AREA_BOOT)) - md->disk->flags |= GENHD_FL_NO_PART_SCAN - | GENHD_FL_SUPPRESS_PARTITION_INFO; + md->disk->flags |= GENHD_FL_NO_PART | + GENHD_FL_SUPPRESS_PARTITION_INFO; /* * As discussed on lkml, GENHD_FL_REMOVABLE should: diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 74518c576fbb..0b9be3df9489 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -56,15 +56,15 @@ struct partition_meta_info { * (``BLOCK_EXT_MAJOR``). * This affects the maximum number of partitions. * - * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled. - * Used for loop devices in their default settings and some MMC - * devices. + * ``GENHD_FL_NO_PART`` (0x0200): partition support is disabled. + * The kernel will not scan for partitions from add_disk, and users + * can't add partitions manually. * * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it * doesn't produce events, doesn't appear in sysfs, and doesn't have * an associated ``bdev``. * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and - * ``GENHD_FL_NO_PART_SCAN``. + * ``GENHD_FL_NO_PART``. * Used for multipath devices. */ #define GENHD_FL_REMOVABLE 0x0001 @@ -72,7 +72,7 @@ struct partition_meta_info { /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 -#define GENHD_FL_NO_PART_SCAN 0x0200 +#define GENHD_FL_NO_PART 0x0200 #define GENHD_FL_HIDDEN 0x0400 enum { @@ -180,8 +180,7 @@ static inline int disk_max_parts(struct gendisk *disk) static inline bool disk_part_scan_enabled(struct gendisk *disk) { - return disk_max_parts(disk) > 1 && - !(disk->flags & GENHD_FL_NO_PART_SCAN); + return disk_max_parts(disk) > 1 && !(disk->flags & GENHD_FL_NO_PART); } static inline dev_t disk_devt(struct gendisk *disk) From 140862805affca32b3c92a9a7643a7ee6d6ab278 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 29 Nov 2021 06:38:04 -0700 Subject: [PATCH 018/132] block: remove the GENHD_FL_HIDDEN check in blkdev_get_no_open Hidden gendisks never hash the block device inode, so this can't happen. 
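The block device inode is only hashed from bdev_add(), which device_add_disk() calls on the non-hidden path only, so the inode lookup in blkdev_get_no_open() can never return a hidden disk in the first place. Roughly, as an elided sketch of block/genhd.c rather than a verbatim quote:

	if (disk->flags & GENHD_FL_HIDDEN) {
		/* no bdev_add(), so the block device inode is never hashed */
	} else {
		ret = bdi_register(disk->bdi, "%u:%u",
				   disk->major, disk->first_minor);
		/* ... */
		bdev_add(disk->part0, ddev->devt);	/* this hashes the bdev inode */
		/* ... */
	}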
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-8-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bdev.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index dd84961bed7e..e9ada04e71be 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -750,14 +750,6 @@ struct block_device *blkdev_get_no_open(dev_t dev) if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) bdev = NULL; iput(inode); - - if (!bdev) - return NULL; - if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN)) { - put_device(&bdev->bd_device); - return NULL; - } - return bdev; } From 94b49c3ddb2127f84a133d70cef49054e1ebaaa4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:19 +0100 Subject: [PATCH 019/132] null_blk: don't suppress partitioning information This manually reverts commit 27290b469051 ("null_blk: suppress invalid partition info"). The message in that commit log doesn't apply, as the flag is never checked during probing, and there is no good reason to treat null_blk specially in /proc/partitions. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-9-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/block/null_blk/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 323af5c9c802..eb17def1f265 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1850,7 +1850,7 @@ static int null_gendisk_register(struct nullb *nullb) set_capacity(disk, size); - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_EXT_DEVT; disk->major = null_major; disk->first_minor = nullb->index; disk->minors = 1; From 79b0f79a835c6f1103c06e449cd88fb13e67f405 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:20 +0100 Subject: [PATCH 020/132] mmc: don't set GENHD_FL_SUPPRESS_PARTITION_INFO This manually reverts commit 07b652cdbec3 ("mmc: card: Don't show eMMC RPMB and BOOT areas in /proc/partitions"). Based on the commit description, that change was purely cosmetic. mmc is the last driver that sets this flag and thus prevents it from being removed. Signed-off-by: Christoph Hellwig <hch@lst.de> Acked-by: Ulf Hansson <ulf.hansson@linaro.org> Link: https://lore.kernel.org/r/20211122130625.1136848-10-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/mmc/core/block.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index a71b3512c877..2dd93d49d822 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2397,8 +2397,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, set_disk_ro(md->disk, md->read_only || default_ro); md->disk->flags = GENHD_FL_EXT_DEVT; if (area_type & (MMC_BLK_DATA_AREA_RPMB | MMC_BLK_DATA_AREA_BOOT)) - md->disk->flags |= GENHD_FL_NO_PART | - GENHD_FL_SUPPRESS_PARTITION_INFO; + md->disk->flags |= GENHD_FL_NO_PART; /* * As discussed on lkml, GENHD_FL_REMOVABLE should: From 3b5149ac50970669ee0ddb9629ec77ffd5c0622d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:21 +0100 Subject: [PATCH 021/132] block: remove GENHD_FL_SUPPRESS_PARTITION_INFO This flag is not set directly anywhere and only inherited from GENHD_FL_HIDDEN. Just check for GENHD_FL_HIDDEN instead.
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-11-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/genhd.c | 11 +++-------- include/linux/genhd.h | 9 +-------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 685794a2ebb0..50a843ee18f6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -496,10 +496,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, if (disk->flags & GENHD_FL_HIDDEN) { /* - * Don't let hidden disks show up in /proc/partitions, - * and don't bother scanning for partitions either. + * Don't bother scanning for partitions. */ - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART; } else { ret = bdi_register(disk->bdi, "%u:%u", @@ -725,8 +723,7 @@ void __init printk_all_partitions(void) * Don't show empty devices or things that have been * suppressed */ - if (get_capacity(disk) == 0 || - (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) + if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN)) continue; /* @@ -819,9 +816,7 @@ static int show_partition(struct seq_file *seqf, void *v) struct block_device *part; unsigned long idx; - if (!get_capacity(sgp)) - return 0; - if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) + if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; rcu_read_lock(); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0b9be3df9489..64a2f33ae9ea 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -46,11 +46,6 @@ struct partition_meta_info { * Must not be set for devices which are removed entirely when the * media is removed. * - * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include - * partition information in ``/proc/partitions`` or in the output of - * printk_all_partitions(). - * Used for the null block device and some MMC devices. - * * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended * dynamic ``dev_t``, i.e. it wants extended device numbers * (``BLOCK_EXT_MAJOR``). @@ -63,14 +58,12 @@ struct partition_meta_info { * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it * doesn't produce events, doesn't appear in sysfs, and doesn't have * an associated ``bdev``. - * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and - * ``GENHD_FL_NO_PART``. + * Implies ``GENHD_FL_NO_PART``. * Used for multipath devices. */ #define GENHD_FL_REMOVABLE 0x0001 /* 2 is unused (used to be GENHD_FL_DRIVERFS) */ /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ -#define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 #define GENHD_FL_NO_PART 0x0200 #define GENHD_FL_HIDDEN 0x0400 From 1ebe2e5f9d68e94c524aba876f27b945669a7879 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:22 +0100 Subject: [PATCH 022/132] block: remove GENHD_FL_EXT_DEVT All modern drivers can support extra partitions using the extended dev_t. In fact, except for the ioctl method, drivers never even see partitions in normal operation. So remove the GENHD_FL_EXT_DEVT and allow extra partitions for all block devices that do support partitions, and require those that do not support partitions to explicitly disallow them using GENHD_FL_NO_PART.
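From a driver's point of view the convention changes as follows; this is a hypothetical fragment for illustration only, not code taken from any single driver touched below:

	/* before: opt in to the extended dev_t to allow more than ->minors partitions */
	disk->flags |= GENHD_FL_EXT_DEVT;

	/* after: partitions are available by default, only opting out remains */
	disk->flags |= GENHD_FL_NO_PART;	/* only for devices that cannot be partitioned */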
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-12-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/genhd.c | 6 +++--- block/partitions/core.c | 9 ++++----- drivers/block/amiflop.c | 1 + drivers/block/ataflop.c | 1 + drivers/block/brd.c | 1 - drivers/block/drbd/drbd_main.c | 1 + drivers/block/floppy.c | 1 + drivers/block/loop.c | 1 - drivers/block/null_blk/main.c | 1 - drivers/block/paride/pcd.c | 1 + drivers/block/paride/pf.c | 1 + drivers/block/pktcdvd.c | 2 +- drivers/block/ps3vram.c | 1 + drivers/block/rbd.c | 6 ++---- drivers/block/swim.c | 1 + drivers/block/swim3.c | 2 +- drivers/block/virtio_blk.c | 1 - drivers/block/z2ram.c | 1 + drivers/block/zram/zram_drv.c | 1 + drivers/cdrom/gdrom.c | 1 + drivers/md/dm.c | 1 + drivers/md/md.c | 5 ----- drivers/mmc/core/block.c | 1 - drivers/mtd/ubi/block.c | 1 + drivers/scsi/sd.c | 1 - drivers/scsi/sr.c | 1 + include/linux/genhd.h | 28 +++++----------------------- 27 files changed, 30 insertions(+), 48 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 50a843ee18f6..628632537129 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -376,7 +376,7 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; - if (!disk_part_scan_enabled(disk)) + if (disk->flags & GENHD_FL_NO_PART) return -EINVAL; if (disk->open_partitions) return -EBUSY; @@ -438,7 +438,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, return ret; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; - disk->flags |= GENHD_FL_EXT_DEVT; } ret = disk_alloc_events(disk); @@ -872,7 +871,8 @@ static ssize_t disk_ext_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk_max_parts(disk)); + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, diff --git a/block/partitions/core.c b/block/partitions/core.c index 520292fee933..c2a1635922b1 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -98,13 +98,12 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; - int nr; + int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; - nr = disk_max_parts(hd); state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); @@ -326,7 +325,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, lockdep_assert_held(&disk->open_mutex); - if (partno >= disk_max_parts(disk)) + if (partno >= DISK_MAX_PARTS) return ERR_PTR(-EINVAL); /* @@ -604,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk) struct parsed_partitions *state; int ret = -EAGAIN, p; - if (!disk_part_scan_enabled(disk)) + if (disk->flags & GENHD_FL_NO_PART) return 0; state = check_partition(disk); @@ -687,7 +686,7 @@ rescan: * userspace for this particular setup. 
*/ if (invalidate) { - if (disk_part_scan_enabled(disk) || + if (!(disk->flags & GENHD_FL_NO_PART) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index bf5c124c5452..1eec5113d0b5 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1790,6 +1790,7 @@ static int fd_alloc_disk(int drive, int system) disk->first_minor = drive + system; disk->minors = 1; disk->fops = &floppy_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; if (system) sprintf(disk->disk_name, "fd%d_msdos", drive); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index bf769e6e32fe..f3ff9babdb5c 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -2000,6 +2000,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type) disk->minors = 1; sprintf(disk->disk_name, "fd%d", drive); disk->fops = &floppy_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = &unit[drive]; set_capacity(disk, MAX_DISK_SIZE * 2); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index a896ee175d86..8fe2e4289dae 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -405,7 +405,6 @@ static int brd_alloc(int i) disk->minors = max_part; disk->fops = &brd_fops; disk->private_data = brd; - disk->flags = GENHD_FL_EXT_DEVT; strlcpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 53ba2dddba6e..07b3c6093e7d 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2734,6 +2734,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig disk->first_minor = minor; disk->minors = 1; disk->fops = &drbd_ops; + disk->flags |= GENHD_FL_NO_PART; sprintf(disk->disk_name, "drbd%d", minor); disk->private_data = device; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index c4267da716fe..7f0a60c4079f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4503,6 +4503,7 @@ static int floppy_alloc_disk(unsigned int drive, unsigned int type) disk->first_minor = TOMINOR(drive) | (type << 2); disk->minors = 1; disk->fops = &floppy_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; if (type) sprintf(disk->disk_name, "fd%d_type%d", drive, type); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 7219d98c6fb8..0954ea8cf9e3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2033,7 +2033,6 @@ static int loop_add(int i) */ if (!part_shift) disk->flags |= GENHD_FL_NO_PART; - disk->flags |= GENHD_FL_EXT_DEVT; atomic_set(&lo->lo_refcnt, 0); mutex_init(&lo->lo_mutex); lo->lo_number = i; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index eb17def1f265..54f7d490f8eb 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1850,7 +1850,6 @@ static int null_gendisk_register(struct nullb *nullb) set_capacity(disk, size); - disk->flags |= GENHD_FL_EXT_DEVT; disk->major = null_major; disk->first_minor = nullb->index; disk->minors = 1; diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 430ee8004a51..255fd3d4b8a8 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -928,6 +928,7 @@ static int pcd_init_unit(struct pcd_unit *cd, bool autoprobe, int port, disk->minors = 1; strcpy(disk->disk_name, cd->name); /* umm... 
*/ disk->fops = &pcd_bdops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->event_flags = DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE; diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index bf8d0ef41a0a..b84a6448a4f7 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -942,6 +942,7 @@ static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port, disk->minors = 1; strcpy(disk->disk_name, pf->name); disk->fops = &pf_fops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = pf; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index b53f648302c1..3af0499857ec 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2719,7 +2719,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->first_minor = idx; disk->minors = 1; disk->fops = &pktcdvd_ops; - disk->flags = GENHD_FL_REMOVABLE; + disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; strcpy(disk->disk_name, pd->name); disk->private_data = pd; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index c1876646a4cb..4f90819e245e 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -742,6 +742,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) priv->gendisk = gendisk; gendisk->major = ps3vram_major; gendisk->minors = 1; + gendisk->flags |= GENHD_FL_NO_PART; gendisk->fops = &ps3vram_fops; gendisk->private_data = dev; strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 953fa134cd3d..8f140da1efe3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4924,12 +4924,10 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->dev_id); disk->major = rbd_dev->major; disk->first_minor = rbd_dev->minor; - if (single_major) { + if (single_major) disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT); - disk->flags |= GENHD_FL_EXT_DEVT; - } else { + else disk->minors = RBD_MINORS_PER_MAJOR; - } disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 821594cd1315..fef65a18d56f 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -840,6 +840,7 @@ static int swim_floppy_init(struct swim_priv *swd) swd->unit[drive].disk->minors = 1; sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive); swd->unit[drive].disk->fops = &floppy_fops; + swd->unit[drive].disk->flags |= GENHD_FL_NO_PART; swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE; swd->unit[drive].disk->private_data = &swd->unit[drive]; set_capacity(swd->unit[drive].disk, 2880); diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 4b91c9aa5892..6c39f2c9f806 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1227,7 +1227,7 @@ static int swim3_attach(struct macio_dev *mdev, disk->fops = &floppy_fops; disk->private_data = fs; disk->events = DISK_EVENT_MEDIA_CHANGE; - disk->flags |= GENHD_FL_REMOVABLE; + disk->flags |= GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; sprintf(disk->disk_name, "fd%d", floppy_count); set_capacity(disk, 2880); rc = add_disk(disk); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6ae38776e30e..cfa303fa7318 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -843,7 +843,6 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->disk->minors = 1 << PART_BITS; vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; - vblk->disk->flags |= 
GENHD_FL_EXT_DEVT; vblk->index = index; /* configure queue flush support */ diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index ccc52c935faf..7a6ed83481b8 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -327,6 +327,7 @@ static int z2ram_register_disk(int minor) disk->major = Z2RAM_MAJOR; disk->first_minor = minor; disk->minors = 1; + disk->flags |= GENHD_FL_NO_PART; disk->fops = &z2_fops; if (minor) sprintf(disk->disk_name, "z2ram%d", minor); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 25071126995b..f6da5293b913 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1947,6 +1947,7 @@ static int zram_add(void) zram->disk->major = zram_major; zram->disk->first_minor = device_id; zram->disk->minors = 1; + zram->disk->flags |= GENHD_FL_NO_PART; zram->disk->fops = &zram_devops; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index d50cc1fd34d5..faead41709bc 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -719,6 +719,7 @@ static void probe_gdrom_setupdisk(void) gd.disk->major = gdrom_major; gd.disk->first_minor = 1; gd.disk->minors = 1; + gd.disk->flags |= GENHD_FL_NO_PART; strcpy(gd.disk->disk_name, GDROM_DEV_NAME); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 662742a310cb..280918cdcabd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1778,6 +1778,7 @@ static struct mapped_device *alloc_dev(int minor) md->disk->major = _major; md->disk->first_minor = minor; md->disk->minors = 1; + md->disk->flags |= GENHD_FL_NO_PART; md->disk->fops = &dm_blk_dops; md->disk->queue = md->queue; md->disk->private_data = md; diff --git a/drivers/md/md.c b/drivers/md/md.c index 5111ed966947..7fbf6f0ac01b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5707,11 +5707,6 @@ static int md_alloc(dev_t dev, char *name) mddev->queue = disk->queue; blk_set_stacking_limits(&mddev->queue->limits); blk_queue_write_cache(mddev->queue, true, true); - /* Allow extended partitions. This makes the - * 'mdp' device redundant, but we can't really - * remove it now. 
- */ - disk->flags |= GENHD_FL_EXT_DEVT; disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 2dd93d49d822..5e0960560eab 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2395,7 +2395,6 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, md->disk->private_data = md; md->parent = parent; set_disk_ro(md->disk, md->read_only || default_ro); - md->disk->flags = GENHD_FL_EXT_DEVT; if (area_type & (MMC_BLK_DATA_AREA_RPMB | MMC_BLK_DATA_AREA_BOOT)) md->disk->flags |= GENHD_FL_NO_PART; diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 302426ab30f8..a78fdf3b30f7 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -430,6 +430,7 @@ int ubiblock_create(struct ubi_volume_info *vi) ret = -ENODEV; goto out_cleanup_disk; } + gd->flags |= GENHD_FL_NO_PART; gd->private_data = dev; sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id); set_capacity(gd, disk_capacity); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 65875a598d62..bba1f5dafd38 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3566,7 +3566,6 @@ static int sd_probe(struct device *dev) sd_revalidate_disk(gd); - gd->flags = GENHD_FL_EXT_DEVT; if (sdp->removable) { gd->flags |= GENHD_FL_REMOVABLE; gd->events |= DISK_EVENT_MEDIA_CHANGE; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 6646797a7756..cf093387e42a 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -684,6 +684,7 @@ static int sr_probe(struct device *dev) disk->minors = 1; sprintf(disk->disk_name, "sr%d", minor); disk->fops = &sr_bdops; + disk->flags |= GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT | DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 64a2f33ae9ea..b8ced80178d6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -46,11 +46,6 @@ struct partition_meta_info { * Must not be set for devices which are removed entirely when the * media is removed. * - * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended - * dynamic ``dev_t``, i.e. it wants extended device numbers - * (``BLOCK_EXT_MAJOR``). - * This affects the maximum number of partitions. - * * ``GENHD_FL_NO_PART`` (0x0200): partition support is disabled. * The kernel will not scan for partitions from add_disk, and users * can't add partitions manually. @@ -64,7 +59,6 @@ struct partition_meta_info { #define GENHD_FL_REMOVABLE 0x0001 /* 2 is unused (used to be GENHD_FL_DRIVERFS) */ /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ -#define GENHD_FL_EXT_DEVT 0x0040 #define GENHD_FL_NO_PART 0x0200 #define GENHD_FL_HIDDEN 0x0400 @@ -94,13 +88,13 @@ struct blk_integrity { }; struct gendisk { - /* major, first_minor and minors are input parameters only, - * don't use directly. Use disk_devt() and disk_max_parts(). + /* + * major/first_minor/minors should not be set by any new driver, the + * block core will take care of allocating them automatically. */ - int major; /* major number of driver */ + int major; int first_minor; - int minors; /* maximum number of minors, =1 for - * disks that can't be partitioned. 
*/ + int minors; char disk_name[DISK_NAME_LEN]; /* name of major driver */ @@ -164,18 +158,6 @@ static inline bool disk_live(struct gendisk *disk) #define disk_to_cdi(disk) NULL #endif -static inline int disk_max_parts(struct gendisk *disk) -{ - if (disk->flags & GENHD_FL_EXT_DEVT) - return DISK_MAX_PARTS; - return disk->minors; -} - -static inline bool disk_part_scan_enabled(struct gendisk *disk) -{ - return disk_max_parts(disk) > 1 && !(disk->flags & GENHD_FL_NO_PART); -} - static inline dev_t disk_devt(struct gendisk *disk) { return MKDEV(disk->major, disk->first_minor); From 9f18db572c97bc327b63528d195fdb252f47e9de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:23 +0100 Subject: [PATCH 023/132] block: don't set GENHD_FL_NO_PART for hidden gendisks Hidden gendisks can't be opened using blkdev_get_*, so we can't really reach any of the partition scanning paths or partitioning ioctls except for the initial partition scan from add_disk. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-13-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/genhd.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 628632537129..8e9cbf23c510 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -376,7 +376,7 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; - if (disk->flags & GENHD_FL_NO_PART) + if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) return -EINVAL; if (disk->open_partitions) return -EBUSY; @@ -493,12 +493,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, if (ret) goto out_put_slave_dir; - if (disk->flags & GENHD_FL_HIDDEN) { - /* - * Don't bother scanning for partitions. - */ - disk->flags |= GENHD_FL_NO_PART; - } else { + if (!(disk->flags & GENHD_FL_HIDDEN)) { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); if (ret) From 430cc5d3ab4d0ba0bd011cfbb0035e46ba92920c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:24 +0100 Subject: [PATCH 024/132] block: cleanup the GENHD_FL_* definitions Switch to an enum and tidy up the documentation. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-14-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/genhd.h | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b8ced80178d6..6906a45bc761 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -39,28 +39,24 @@ struct partition_meta_info { /** * DOC: genhd capability flags * - * ``GENHD_FL_REMOVABLE`` (0x0001): indicates that the block device - * gives access to removable media. - * When set, the device remains present even when media is not - * inserted. - * Must not be set for devices which are removed entirely when the + * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to + * removable media. When set, the device remains present even when media is not + * inserted. Shall not be set for devices which are removed entirely when the * media is removed. * - * ``GENHD_FL_NO_PART`` (0x0200): partition support is disabled. - * The kernel will not scan for partitions from add_disk, and users - * can't add partitions manually. 
+ * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, + * doesn't appear in sysfs, and can't be opened from userspace or using + * blkdev_get*. Used for the underlying components of multipath devices. + * + * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not + * scan for partitions from add_disk, and users can't add partitions manually. * - * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it - * doesn't produce events, doesn't appear in sysfs, and doesn't have - * an associated ``bdev``. - * Implies ``GENHD_FL_NO_PART``. - * Used for multipath devices. */ -#define GENHD_FL_REMOVABLE 0x0001 -/* 2 is unused (used to be GENHD_FL_DRIVERFS) */ -/* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ -#define GENHD_FL_NO_PART 0x0200 -#define GENHD_FL_HIDDEN 0x0400 +enum { + GENHD_FL_REMOVABLE = 1 << 0, + GENHD_FL_HIDDEN = 1 << 1, + GENHD_FL_NO_PART = 1 << 2, +}; enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ From a4561f9fccc57a5fd56c53e21514f63825d8ace7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 22 Nov 2021 14:06:25 +0100 Subject: [PATCH 025/132] sr: set GENHD_FL_REMOVABLE earlier Set up GENHD_FL_REMOVABLE together with the rest of the gendisk fields. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211122130625.1136848-15-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/scsi/sr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index cf093387e42a..411e2b01966e 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -684,7 +684,7 @@ static int sr_probe(struct device *dev) disk->minors = 1; sprintf(disk->disk_name, "sr%d", minor); disk->fops = &sr_bdops; - disk->flags |= GENHD_FL_NO_PART; + disk->flags |= GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT | DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE; @@ -726,7 +726,6 @@ static int sr_probe(struct device *dev) blk_pm_runtime_init(sdev->request_queue, dev); dev_set_drvdata(dev, cd); - disk->flags |= GENHD_FL_REMOVABLE; sr_revalidate_disk(cd); error = device_add_disk(&sdev->sdev_gendev, disk, NULL); From 0c5bcc92d94a8f578ab2b06c1274e076cc8aecd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 17:04:41 +0100 Subject: [PATCH 026/132] blk-mq: simplify the plug handling in blk_mq_submit_bio blk_mq_submit_bio has two different plug cases, one that uses full plugging and a limited plugging one. The limited plugging case is only used for a corner case that does not matter in real life: - no ->commit_rqs (so not NVMe) - no shared tags (so not SCSI) - not rotational (so no old disk or floppy driver) - must have multiple queues (so no eMMC) Remove the limited merging case and all the related junk to simplify blk_mq_submit_bio and the functions called from it. 
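For reference, the full-plugging branch that takes over was guarded by the check below, quoted from the hunk removed later in this patch with its body elided; the limited-plugging path could only run when a plug existed and none of these conditions held, which is exactly the corner case listed above:

	if (plug && (q->nr_hw_queues == 1 ||
	    blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
	    q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
		/* ... full plugging ... */
	}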
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123160443.1315598-2-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-merge.c | 9 +------ block/blk-mq.c | 68 +++++++++-------------------------------------- block/blk.h | 2 +- 3 files changed, 15 insertions(+), 64 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 893c1a60b701..ba761c3f482b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -1067,7 +1067,6 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @nr_segs: number of segments in @bio - * @same_queue_rq: output value, will be true if there's an existing request * from the passed in @q already in the plug list * * Determine whether @bio being queued on @q can be merged with the previous @@ -1084,7 +1083,7 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, bool *same_queue_rq) + unsigned int nr_segs) { struct blk_plug *plug; struct request *rq; @@ -1096,12 +1095,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, /* check the previously added entry for a quick merge attempt */ rq = rq_list_peek(&plug->mq_list); if (rq->q == q) { - /* - * Only blk-mq multiple hardware queues case checks the rq in - * the same queue, there should be only one such rq in a queue - */ - *same_queue_rq = true; - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK) return true; diff --git a/block/blk-mq.c b/block/blk-mq.c index 00df1eb031c0..4a13900b640e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2690,11 +2690,10 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) } static bool blk_mq_attempt_bio_merge(struct request_queue *q, - struct bio *bio, unsigned int nr_segs, - bool *same_queue_rq) + struct bio *bio, unsigned int nr_segs) { if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { - if (blk_attempt_plug_merge(q, bio, nr_segs, same_queue_rq)) + if (blk_attempt_plug_merge(q, bio, nr_segs)) return true; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return true; @@ -2705,8 +2704,7 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, struct bio *bio, - unsigned int nsegs, - bool *same_queue_rq) + unsigned int nsegs) { struct blk_mq_alloc_data data = { .q = q, @@ -2715,7 +2713,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (blk_mq_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) return NULL; rq_qos_throttle(q, bio); @@ -2751,8 +2749,7 @@ static inline bool blk_mq_can_use_cached_rq(struct request *rq, struct bio *bio) static inline struct request *blk_mq_get_request(struct request_queue *q, struct blk_plug *plug, struct bio *bio, - unsigned int nsegs, - bool *same_queue_rq) + unsigned int nsegs) { struct request *rq; bool checked = false; @@ -2762,8 +2759,7 @@ static inline struct request *blk_mq_get_request(struct request_queue *q, if (rq && rq->q == q) { if (unlikely(!submit_bio_checks(bio))) return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs, - same_queue_rq)) + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) return NULL; checked = true; if 
(!blk_mq_can_use_cached_rq(rq, bio)) @@ -2781,7 +2777,7 @@ fallback: return NULL; if (unlikely(!checked && !submit_bio_checks(bio))) goto out_put; - rq = blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq); + rq = blk_mq_get_new_requests(q, plug, bio, nsegs); if (rq) return rq; out_put: @@ -2808,7 +2804,6 @@ void blk_mq_submit_bio(struct bio *bio) const int is_sync = op_is_sync(bio->bi_opf); struct request *rq; struct blk_plug *plug; - bool same_queue_rq = false; unsigned int nr_segs = 1; blk_status_t ret; @@ -2823,7 +2818,7 @@ void blk_mq_submit_bio(struct bio *bio) return; plug = blk_mq_plug(q, bio); - rq = blk_mq_get_request(q, plug, bio, nr_segs, &same_queue_rq); + rq = blk_mq_get_request(q, plug, bio, nr_segs); if (unlikely(!rq)) return; @@ -2846,16 +2841,7 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (plug && (q->nr_hw_queues == 1 || - blk_mq_is_shared_tags(rq->mq_hctx->flags) || - q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { - /* - * Use plugging if we have a ->commit_rqs() hook as well, as - * we know the driver uses bd->last in a smart fashion. - * - * Use normal plugging if this disk is slow HDD, as sequential - * IO may benefit a lot from plug merging. - */ + if (plug) { unsigned int request_count = plug->rq_count; struct request *last = NULL; @@ -2873,40 +2859,12 @@ void blk_mq_submit_bio(struct bio *bio) } blk_add_rq_to_plug(plug, rq); - } else if (rq->rq_flags & RQF_ELV) { - /* Insert the request at the IO scheduler queue */ + } else if ((rq->rq_flags & RQF_ELV) || + (rq->mq_hctx->dispatch_busy && + (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_sched_insert_request(rq, false, true, true); - } else if (plug && !blk_queue_nomerges(q)) { - struct request *next_rq = NULL; - - /* - * We do limited plugging. If the bio can be merged, do that. - * Otherwise the existing request in the plug list will be - * issued. So the plug list will have one request at most - * The plug list might get flushed before this. If that happens, - * the plug list is empty, and same_queue_rq is invalid. - */ - if (same_queue_rq) { - next_rq = rq_list_pop(&plug->mq_list); - plug->rq_count--; - } - blk_add_rq_to_plug(plug, rq); - trace_block_plug(q); - - if (next_rq) { - trace_block_unplug(q, 1, true); - blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq); - } - } else if ((q->nr_hw_queues > 1 && is_sync) || - !rq->mq_hctx->dispatch_busy) { - /* - * There is no scheduler and we can try to send directly - * to the hardware. - */ - blk_mq_try_issue_directly(rq->mq_hctx, rq); } else { - /* Default case. */ - blk_mq_sched_insert_request(rq, false, true, true); + blk_mq_try_issue_directly(rq->mq_hctx, rq); } } diff --git a/block/blk.h b/block/blk.h index 423cba8ea0a6..5d4d08df772b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -253,7 +253,7 @@ void blk_add_timer(struct request *req); const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, bool *same_queue_rq); + unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); From 1e9c23034d7b216b02f1491505779dfe9a1a6e23 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 17:04:42 +0100 Subject: [PATCH 027/132] blk-mq: move more plug handling from blk_mq_submit_bio into blk_add_rq_to_plug Keep all the functionality for adding a request to a plug in a single place. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123160443.1315598-3-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 64 +++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4a13900b640e..3af88ffc9e2c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2662,21 +2662,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, hctx->queue->mq_ops->commit_rqs(hctx); } -static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) -{ - if (!plug->multiple_queues) { - struct request *nxt = rq_list_peek(&plug->mq_list); - - if (nxt && nxt->q != rq->q) - plug->multiple_queues = true; - } - if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) - plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); - plug->rq_count++; -} - /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. This is important for md arrays to benefit from merging @@ -2689,6 +2674,28 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) return BLK_MAX_REQUEST_COUNT; } +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + struct request *last = rq_list_peek(&plug->mq_list); + + if (!plug->rq_count) { + trace_block_plug(rq->q); + } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || + (!blk_queue_nomerges(rq->q) && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { + blk_mq_flush_plug_list(plug, false); + trace_block_plug(rq->q); + } + + if (!plug->multiple_queues && last && last->q != rq->q) + plug->multiple_queues = true; + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; +} + static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { @@ -2841,31 +2848,14 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (plug) { - unsigned int request_count = plug->rq_count; - struct request *last = NULL; - - if (!request_count) { - trace_block_plug(q); - } else if (!blk_queue_nomerges(q)) { - last = rq_list_peek(&plug->mq_list); - if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE) - last = NULL; - } - - if (request_count >= blk_plug_max_rq_count(plug) || last) { - blk_mq_flush_plug_list(plug, false); - trace_block_plug(q); - } - + if (plug) blk_add_rq_to_plug(plug, rq); - } else if ((rq->rq_flags & RQF_ELV) || - (rq->mq_hctx->dispatch_busy && - (q->nr_hw_queues == 1 || !is_sync))) { + else if ((rq->rq_flags & RQF_ELV) || + (rq->mq_hctx->dispatch_busy && + (q->nr_hw_queues == 1 || !is_sync))) blk_mq_sched_insert_request(rq, false, true, true); - } else { + else blk_mq_try_issue_directly(rq->mq_hctx, rq); - } } /** From 25c4b5e058578066db56d757ad3a7adeaff35856 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Sat, 13 Nov 2021 13:37:38 -0700 Subject: [PATCH 028/132] blk-ioprio: don't set bio priority if not needed We don't need to write to the bio if: 1) No ioprio value has ever been assigned to the blkcg 2) We wouldn't anyway, depending on bio and blkcg IO priority Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioprio.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 332a07761bf8..2e7f10e1c03f 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -62,6 
+62,7 @@ struct ioprio_blkg { struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; + bool prio_set; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) @@ -112,7 +113,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, if (ret < 0) return ret; blkcg->prio_policy = ret; - + blkcg->prio_set = true; return nbytes; } @@ -190,6 +191,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); + u16 prio; + + if (!blkcg->prio_set) + return; /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers @@ -199,8 +204,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, * bio I/O priority is not modified. If the bio I/O priority equals * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. */ - bio->bi_ioprio = max_t(u16, bio->bi_ioprio, - IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); + prio = max_t(u16, bio->bi_ioprio, + IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); + if (prio > bio->bi_ioprio) + bio->bi_ioprio = prio; } static void blkcg_ioprio_exit(struct rq_qos *rqos) From 48b5c1fbcd8c5bc6b91a56399a5257b801391dd8 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Sat, 13 Nov 2021 14:03:26 -0700 Subject: [PATCH 029/132] block: only allocate poll_stats if there's a user of them This is essentially never used, yet it's about 1/3rd of the total queue size. Allocate it when needed, and don't embed it in the queue. Kill the queue flag for this while at it, since we can just check the assigned pointer now. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 10 ++++------ block/blk-stat.c | 18 ++++++++++++++++++ block/blk-stat.h | 1 + block/blk-sysfs.c | 3 ++- include/linux/blkdev.h | 3 +-- 6 files changed, 26 insertions(+), 10 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4f2cf8399f3d..f4022b198580 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -122,7 +122,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), - QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(PCI_P2PDMA), diff --git a/block/blk-mq.c b/block/blk-mq.c index 3af88ffc9e2c..7cd408408a37 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4581,11 +4581,10 @@ EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); /* Enable polling stats and return whether they were already enabled. */ static bool blk_poll_stats_enable(struct request_queue *q) { - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) + if (q->poll_stat) return true; - blk_stat_add_callback(q, q->poll_cb); - return false; + + return blk_stats_alloc_enable(q); } static void blk_mq_poll_stats_start(struct request_queue *q) @@ -4594,8 +4593,7 @@ static void blk_mq_poll_stats_start(struct request_queue *q) * We don't arm the callback if polling stats are not enabled or the * callback is already active. 
*/ - if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_stat_is_active(q->poll_cb)) + if (!q->poll_stat || blk_stat_is_active(q->poll_cb)) return; blk_stat_activate_msecs(q->poll_cb, 100); diff --git a/block/blk-stat.c b/block/blk-stat.c index ae3dd1fb8e61..efb2a80db906 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -219,3 +219,21 @@ void blk_free_queue_stats(struct blk_queue_stats *stats) kfree(stats); } + +bool blk_stats_alloc_enable(struct request_queue *q) +{ + struct blk_rq_stat *poll_stat; + + poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat), + GFP_ATOMIC); + if (!poll_stat) + return false; + + if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) { + kfree(poll_stat); + return true; + } + + blk_stat_add_callback(q, q->poll_cb); + return false; +} diff --git a/block/blk-stat.h b/block/blk-stat.h index 17b47a86eefb..58f029af49e5 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -64,6 +64,7 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); +bool blk_stats_alloc_enable(struct request_queue *q); void blk_stat_add(struct request *rq, u64 now); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cd75b0f73dc6..c079be1c58a3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -785,11 +785,12 @@ static void blk_release_queue(struct kobject *kobj) might_sleep(); - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + if (q->poll_stat) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); blk_free_queue_stats(q->stats); + kfree(q->poll_stat); blk_exit_queue(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bd4370baccca..74118e67f649 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -267,7 +267,7 @@ struct request_queue { int poll_nsec; struct blk_stat_callback *poll_cb; - struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS]; + struct blk_rq_stat *poll_stat; struct timer_list timeout; struct work_struct timeout_work; @@ -397,7 +397,6 @@ struct request_queue { #define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ #define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ -#define QUEUE_FLAG_POLL_STATS 21 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ From 5a9d041ba2f6da468c891ca0fe263758e2c12091 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Sat, 13 Nov 2021 11:18:32 -0700 Subject: [PATCH 030/132] block: move io_context creation into where it's needed The only user of the io_context for IO is BFQ, yet we put the checking and logic of it into the normal IO path. Put the creation into blk_mq_sched_assign_ioc(), and have BFQ use that helper. 
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 2 ++ block/blk-core.c | 9 --------- block/blk-mq-sched.c | 5 +++++ block/blk-mq.c | 3 --- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fec18118dc30..1ce1a99a7160 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6573,6 +6573,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, */ static void bfq_prepare_request(struct request *rq) { + blk_mq_sched_assign_ioc(rq); + /* * Regardless of whether we have an icq attached, we have to * clear the scheduler pointers, as they might point to diff --git a/block/blk-core.c b/block/blk-core.c index 35a04d8c180a..2053d1b0e90e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -750,15 +750,6 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio) break; } - /* - * Various block parts want %current->io_context, so allocate it up - * front rather than dealing with lots of pain to allocate it only - * where needed. This may fail and the block layer knows how to live - * with it. - */ - if (unlikely(!current->io_context)) - create_task_io_context(current, GFP_ATOMIC, q->node); - if (blk_throtl_bio(bio)) return false; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index ba21449439cc..b942b38000e5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -24,6 +24,10 @@ void blk_mq_sched_assign_ioc(struct request *rq) struct io_context *ioc; struct io_cq *icq; + /* create task io_context, if we don't have one already */ + if (unlikely(!current->io_context)) + create_task_io_context(current, GFP_ATOMIC, q->node); + /* * May not have an IO context if it's a passthrough request */ @@ -43,6 +47,7 @@ void blk_mq_sched_assign_ioc(struct request *rq) get_io_context(icq->ioc); rq->elv.icq = icq; } +EXPORT_SYMBOL_GPL(blk_mq_sched_assign_ioc); /* * Mark a hardware queue as needing a restart. For shared queues, maintain diff --git a/block/blk-mq.c b/block/blk-mq.c index 7cd408408a37..d6e7634e5e1f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -406,9 +406,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, if (!op_is_flush(data->cmd_flags) && e->type->ops.prepare_request) { - if (e->type->icq_cache) - blk_mq_sched_assign_ioc(rq); - e->type->ops.prepare_request(rq); rq->rq_flags |= RQF_ELVPRIV; } From 35c90e6ec9608d8225b82ce609489b531cfd0a40 Mon Sep 17 00:00:00 2001 From: Guo Zhengkui <guozhengkui@vivo.com> Date: Tue, 23 Nov 2021 14:33:40 +0800 Subject: [PATCH 031/132] blk_mq: remove repeated includes Remove a repeated "#include<linux/sched/sysctl.h>". 
Signed-off-by: Guo Zhengkui <guozhengkui@vivo.com> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Link: https://lore.kernel.org/r/20211123063340.25882-1-guozhengkui@vivo.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d6e7634e5e1f..4dff401bc642 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,7 +28,6 @@ #include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> -#include <linux/sched/sysctl.h> #include <trace/events/block.h> From 0281ed3cf44d2a7061ec3c1680e1f86e55ad57b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 19:53:05 +0100 Subject: [PATCH 032/132] block: move blk_get_flush_queue to blk-flush.c blk_get_flush_queue is only used in blk-flush.c, so move it there. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123185312.1432157-2-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-flush.c | 6 ++++++ block/blk.h | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 1fce6d16e6d3..86ee50455e41 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,6 +95,12 @@ enum { static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, unsigned int flags); +static inline struct blk_flush_queue * +blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +{ + return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; +} + static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) { unsigned int policy = 0; diff --git a/block/blk.h b/block/blk.h index 5d4d08df772b..1346085d89ce 100644 --- a/block/blk.h +++ b/block/blk.h @@ -35,12 +35,6 @@ extern struct kmem_cache *blk_requestq_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; -static inline struct blk_flush_queue * -blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) -{ - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; -} - static inline void __blk_get_queue(struct request_queue *q) { kobject_get(&q->kobj); From f46b81c54b241bf1ec01c8936a37201c99f94fc7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 19:53:06 +0100 Subject: [PATCH 033/132] block: remove elevator_exit Open code elevator_exit in it's only caller, and rename __elevator_exit to elevator_exit. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123185312.1432157-3-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-sysfs.c | 2 +- block/blk.h | 11 +---------- block/elevator.c | 4 +++- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c079be1c58a3..1677eb4a680c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -747,7 +747,7 @@ static void blk_exit_queue(struct request_queue *q) */ if (q->elevator) { ioc_clear_queue(q); - __elevator_exit(q, q->elevator); + elevator_exit(q, q->elevator); } /* diff --git a/block/blk.h b/block/blk.h index 1346085d89ce..2266cb1f7df5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -266,19 +266,10 @@ void blk_insert_flush(struct request *rq); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); -void __elevator_exit(struct request_queue *, struct elevator_queue *); +void elevator_exit(struct request_queue *, struct elevator_queue *); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); -static inline void elevator_exit(struct request_queue *q, - struct elevator_queue *e) -{ - lockdep_assert_held(&q->sysfs_lock); - - blk_mq_sched_free_rqs(q); - __elevator_exit(q, e); -} - ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, diff --git a/block/elevator.c b/block/elevator.c index 19a78d5516ba..3536cdd5fa12 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -188,7 +188,7 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void __elevator_exit(struct request_queue *q, struct elevator_queue *e) +void elevator_exit(struct request_queue *q, struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); @@ -595,6 +595,7 @@ int elevator_switch_mq(struct request_queue *q, elv_unregister_queue(q); ioc_clear_queue(q); + blk_mq_sched_free_rqs(q); elevator_exit(q, q->elevator); } @@ -605,6 +606,7 @@ int elevator_switch_mq(struct request_queue *q, if (new_e) { ret = elv_register_queue(q, true); if (ret) { + blk_mq_sched_free_rqs(q); elevator_exit(q, q->elevator); goto out; } From 0c6cb3a293fa9adc7be24d67ff7d201aefa50362 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 19:53:07 +0100 Subject: [PATCH 034/132] block: remove the e argument to elevator_exit All callers pass q->elevator. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123185312.1432157-4-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-sysfs.c | 2 +- block/blk.h | 2 +- block/elevator.c | 8 +++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1677eb4a680c..772347adc56b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -747,7 +747,7 @@ static void blk_exit_queue(struct request_queue *q) */ if (q->elevator) { ioc_clear_queue(q); - elevator_exit(q, q->elevator); + elevator_exit(q); } /* diff --git a/block/blk.h b/block/blk.h index 2266cb1f7df5..4df2ce8d4999 100644 --- a/block/blk.h +++ b/block/blk.h @@ -266,7 +266,7 @@ void blk_insert_flush(struct request *rq); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); -void elevator_exit(struct request_queue *, struct elevator_queue *); +void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index 3536cdd5fa12..ec98aed39c4f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -188,8 +188,10 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void elevator_exit(struct request_queue *q, struct elevator_queue *e) +void elevator_exit(struct request_queue *q) { + struct elevator_queue *e = q->elevator; + mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); @@ -596,7 +598,7 @@ int elevator_switch_mq(struct request_queue *q, ioc_clear_queue(q); blk_mq_sched_free_rqs(q); - elevator_exit(q, q->elevator); + elevator_exit(q); } ret = blk_mq_init_sched(q, new_e); @@ -607,7 +609,7 @@ int elevator_switch_mq(struct request_queue *q, ret = elv_register_queue(q, true); if (ret) { blk_mq_sched_free_rqs(q); - elevator_exit(q, q->elevator); + elevator_exit(q); goto out; } } From 2aa7745bf6db55b7800c4433e102b07c649fd001 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 19:53:08 +0100 Subject: [PATCH 035/132] block: don't include blk-mq-sched.h in blk.h No needed, shift it into the source files that need it instead. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123185312.1432157-5-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 1 + block/blk-ioc.c | 1 + block/blk-merge.c | 1 + block/blk-mq-debugfs.c | 1 + block/blk-sysfs.c | 1 + block/blk.h | 1 - block/genhd.c | 1 + 7 files changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2053d1b0e90e..1acd94ba10c7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -46,6 +46,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-throttle.h" diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 57299f860d41..70c99e85aee5 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -11,6 +11,7 @@ #include <linux/sched/task.h> #include "blk.h" +#include "blk-mq-sched.h" /* * For io context allocations diff --git a/block/blk-merge.c b/block/blk-merge.c index ba761c3f482b..456fb88c49b1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,6 +12,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-throttle.h" diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f4022b198580..7f27dca3a45e 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -11,6 +11,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-rq-qos.h" diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 772347adc56b..4622da4bb992 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -16,6 +16,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" #include "blk-wbt.h" #include "blk-throttle.h" diff --git a/block/blk.h b/block/blk.h index 4df2ce8d4999..db6efa351d3e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -10,7 +10,6 @@ #include <xen/xen.h> #include "blk-crypto-internal.h" #include "blk-mq.h" -#include "blk-mq-sched.h" struct elevator_type; diff --git a/block/genhd.c b/block/genhd.c index 8e9cbf23c510..01606db8c625 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,6 +27,7 @@ #include <linux/badblocks.h> #include "blk.h" +#include "blk-mq-sched.h" #include "blk-rq-qos.h" static struct kobject *block_depr; From e4a19f7289f3fa9b2a4e65d0f4b3a7816e8e445b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 19:53:09 +0100 Subject: [PATCH 036/132] block: don't include blk-mq.h in blk.h No needed, shift a blk-stat.h include into the source file that needs it instead. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123185312.1432157-6-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-throttle.c | 1 + block/blk.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 39bb6e68a9a2..7c462c006b26 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -13,6 +13,7 @@ #include <linux/blk-cgroup.h> #include "blk.h" #include "blk-cgroup-rwstat.h" +#include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 round */ diff --git a/block/blk.h b/block/blk.h index db6efa351d3e..0f9472bea616 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,7 +9,6 @@ #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <xen/xen.h> #include "blk-crypto-internal.h" -#include "blk-mq.h" struct elevator_type; From a2ff7781cfe6a2104c64deb8311532ccd4d4c57d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 19:53:10 +0100 Subject: [PATCH 037/132] block: don't include <linux/blk-mq.h> in blk.h Not needed. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123185312.1432157-7-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk.h | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk.h b/block/blk.h index 0f9472bea616..a6e9ce376780 100644 --- a/block/blk.h +++ b/block/blk.h @@ -3,7 +3,6 @@ #define BLK_INTERNAL_H #include <linux/idr.h> -#include <linux/blk-mq.h> #include <linux/part_stat.h> #include <linux/blk-crypto.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ From ca5b304cabef55c670a22bbe53a883fe5c3b2620 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 19:53:11 +0100 Subject: [PATCH 038/132] block: don't include <linux/idr.h> in blk.h Not needed. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123185312.1432157-8-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk.h | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk.h b/block/blk.h index a6e9ce376780..4089aeffca4b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,7 +2,6 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H -#include <linux/idr.h> #include <linux/part_stat.h> #include <linux/blk-crypto.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ From 82d981d4230bc0a19540fc540d4bdf49a3769f05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 23 Nov 2021 19:53:12 +0100 Subject: [PATCH 039/132] block: don't include <linux/part_stat.h> in blk.h Not needed, shift it into the source files that need it instead. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211123185312.1432157-9-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-cgroup.c | 1 + block/blk-core.c | 1 + block/blk-flush.c | 1 + block/blk-merge.c | 1 + block/blk-mq.c | 1 + block/blk.h | 1 - block/genhd.c | 1 + 7 files changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 663aabfeba18..650f7e27989f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -30,6 +30,7 @@ #include <linux/blk-cgroup.h> #include <linux/tracehook.h> #include <linux/psi.h> +#include <linux/part_stat.h> #include "blk.h" #include "blk-ioprio.h" #include "blk-throttle.h" diff --git a/block/blk-core.c b/block/blk-core.c index 1acd94ba10c7..b0660c9df852 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include <linux/debugfs.h> #include <linux/bpf.h> #include <linux/psi.h> +#include <linux/part_stat.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> diff --git a/block/blk-flush.c b/block/blk-flush.c index 86ee50455e41..902e80e48e4a 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,6 +69,7 @@ #include <linux/blkdev.h> #include <linux/gfp.h> #include <linux/blk-mq.h> +#include <linux/part_stat.h> #include "blk.h" #include "blk-mq.h" diff --git a/block/blk-merge.c b/block/blk-merge.c index 456fb88c49b1..e07f5a1ae86e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -8,6 +8,7 @@ #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/scatterlist.h> +#include <linux/part_stat.h> #include <trace/events/block.h> diff --git a/block/blk-mq.c b/block/blk-mq.c index 4dff401bc642..4bdc3bc54ea3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,6 +28,7 @@ #include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> +#include <linux/part_stat.h> #include <trace/events/block.h> diff --git a/block/blk.h b/block/blk.h index 4089aeffca4b..a57c84654d0a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,7 +2,6 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H -#include <linux/part_stat.h> #include <linux/blk-crypto.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <xen/xen.h> diff --git a/block/genhd.c b/block/genhd.c index 01606db8c625..5179a4f00fba 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,6 +25,7 @@ #include <linux/log2.h> #include <linux/pm_runtime.h> #include <linux/badblocks.h> +#include <linux/part_stat.h> #include "blk.h" #include "blk-mq-sched.h" From 5b13bc8a3fd519d86e5b1a0b1d1b996cace62f3f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Wed, 24 Nov 2021 07:28:56 +0100 Subject: [PATCH 040/132] blk-mq: cleanup request allocation Refactor the request alloction so that blk_mq_get_cached_request tries to find a cached request first, and the entirely separate and now self contained blk_mq_get_new_requests allocates one or more requests if that is not possible. There is a small change in behavior as submit_bio_checks is called twice now if a cached request is present but can't be used, but that is a small price to pay for unwinding this code. 
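For readers skimming the diff below, the reworked submission path reduces to roughly this shape (a simplified sketch of the flow described above, not the verbatim kernel code):

        rq = blk_mq_get_cached_request(q, plug, bio, nr_segs);
        if (!rq) {
                rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
                if (unlikely(!rq))
                        return; /* checks failed, bio was merged, or allocation failed */
        }
        /* ... issue rq as before ... */

The double submit_bio_checks() call mentioned above happens only on the miss path: blk_mq_get_cached_request() may run the checks and still return NULL (for example when the cached request's hctx type does not match the bio), after which blk_mq_get_new_requests() runs them again under bio_queue_enter().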
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211124062856.1444266-1-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 102 +++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 58 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4bdc3bc54ea3..a89a624dd1df 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2717,8 +2717,12 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + if (unlikely(bio_queue_enter(bio))) return NULL; + if (unlikely(!submit_bio_checks(bio))) + goto queue_exit; + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + goto queue_exit; rq_qos_throttle(q, bio); @@ -2729,66 +2733,46 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, } rq = __blk_mq_alloc_requests(&data); - if (rq) - return rq; + if (!rq) + goto fail; + return rq; +fail: rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - - return NULL; -} - -static inline bool blk_mq_can_use_cached_rq(struct request *rq, struct bio *bio) -{ - if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) - return false; - - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) - return false; - - return true; -} - -static inline struct request *blk_mq_get_request(struct request_queue *q, - struct blk_plug *plug, - struct bio *bio, - unsigned int nsegs) -{ - struct request *rq; - bool checked = false; - - if (plug) { - rq = rq_list_peek(&plug->cached_rq); - if (rq && rq->q == q) { - if (unlikely(!submit_bio_checks(bio))) - return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - return NULL; - checked = true; - if (!blk_mq_can_use_cached_rq(rq, bio)) - goto fallback; - rq->cmd_flags = bio->bi_opf; - plug->cached_rq = rq_list_next(rq); - INIT_LIST_HEAD(&rq->queuelist); - rq_qos_throttle(q, bio); - return rq; - } - } - -fallback: - if (unlikely(bio_queue_enter(bio))) - return NULL; - if (unlikely(!checked && !submit_bio_checks(bio))) - goto out_put; - rq = blk_mq_get_new_requests(q, plug, bio, nsegs); - if (rq) - return rq; -out_put: +queue_exit: blk_queue_exit(q); return NULL; } +static inline struct request *blk_mq_get_cached_request(struct request_queue *q, + struct blk_plug *plug, struct bio *bio, unsigned int nsegs) +{ + struct request *rq; + + if (!plug) + return NULL; + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; + + if (unlikely(!submit_bio_checks(bio))) + return NULL; + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + return NULL; + if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + return NULL; + + rq->cmd_flags = bio->bi_opf; + plug->cached_rq = rq_list_next(rq); + INIT_LIST_HEAD(&rq->queuelist); + rq_qos_throttle(q, bio); + return rq; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. 
@@ -2805,9 +2789,9 @@ out_put: void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct blk_plug *plug = blk_mq_plug(q, bio); const int is_sync = op_is_sync(bio->bi_opf); struct request *rq; - struct blk_plug *plug; unsigned int nr_segs = 1; blk_status_t ret; @@ -2821,10 +2805,12 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - plug = blk_mq_plug(q, bio); - rq = blk_mq_get_request(q, plug, bio, nr_segs); - if (unlikely(!rq)) - return; + rq = blk_mq_get_cached_request(q, plug, bio, nr_segs); + if (!rq) { + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) + return; + } trace_block_getrq(bio); From 72cd9df2ef788d88c138d51223a01ca6281f232d Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Tue, 23 Nov 2021 17:37:33 -0800 Subject: [PATCH 041/132] blk-crypto: remove blk_crypto_unregister() This function is trivial and is only used in one place. Having this function is misleading because it implies that blk_crypto_register() needs to be paired with blk_crypto_unregister(), which is not the case. Just set disk->queue->crypto_profile to NULL directly. Signed-off-by: Eric Biggers <ebiggers@google.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211124013733.347612-1-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-crypto-profile.c | 5 ----- block/blk-integrity.c | 2 +- include/linux/blkdev.h | 4 ---- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 605ba0626a5c..96c511967386 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -463,11 +463,6 @@ bool blk_crypto_register(struct blk_crypto_profile *profile, } EXPORT_SYMBOL_GPL(blk_crypto_register); -void blk_crypto_unregister(struct request_queue *q) -{ - q->crypto_profile = NULL; -} - /** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d670d54e5f7a..69eed260a823 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -411,7 +411,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (disk->queue->crypto_profile) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. 
Disabling hardware inline encryption.\n"); - blk_crypto_unregister(disk->queue); + disk->queue->crypto_profile = NULL; } #endif } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 74118e67f649..0a4416ef4fbf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1170,8 +1170,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo bool blk_crypto_register(struct blk_crypto_profile *profile, struct request_queue *q); -void blk_crypto_unregister(struct request_queue *q); - #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool blk_crypto_register(struct blk_crypto_profile *profile, @@ -1180,8 +1178,6 @@ static inline bool blk_crypto_register(struct blk_crypto_profile *profile, return true; } -static inline void blk_crypto_unregister(struct request_queue *q) { } - #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ enum blk_unique_id { From e8dc17e2893b4107366004810ca2a4acf1fc8563 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Date: Mon, 25 Oct 2021 09:06:57 +0200 Subject: [PATCH 042/132] blk-mq: Add blk_mq_complete_request_direct() Add blk_mq_complete_request_direct() which completes the block request directly instead deferring it to softirq for single queue devices. This is useful for devices which complete the requests in preemptible context and raising softirq from means scheduling ksoftirqd. Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211025070658.1565848-2-bigeasy@linutronix.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/blk-mq.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 308edc2a4925..d952c3442261 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -752,6 +752,17 @@ static inline void blk_mq_set_request_complete(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } +/* + * Complete the request directly instead of deferring it to softirq or + * completing it another CPU. Useful in preemptible instead of an interrupt. + */ +static inline void blk_mq_complete_request_direct(struct request *rq, + void (*complete)(struct request *rq)) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + complete(rq); +} + void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); From 639d353143fa3bfa81bbe7af263260d93d23d822 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Date: Mon, 25 Oct 2021 09:06:58 +0200 Subject: [PATCH 043/132] mmc: core: Use blk_mq_complete_request_direct(). The completion callback for the sdhci-pci device is invoked from a kworker. I couldn't identify in which context is mmc_blk_mq_req_done() invoke but the remaining caller are from invoked from preemptible context. Here it would make sense to complete the request directly instead scheduling ksoftirqd for its completion. 
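As a usage illustration of the helper introduced in the previous patch: a driver whose completion handler already runs in process context (a threaded IRQ handler or a kworker) can complete the request inline instead of bouncing through the softirq path. The mydrv_* names below are hypothetical and not part of this series:

        static void mydrv_complete_rq(struct request *rq)
        {
                blk_mq_end_request(rq, BLK_STS_OK);
        }

        static void mydrv_finish_io(struct request *rq, bool in_task_context)
        {
                if (in_task_context)
                        /* complete inline, no softirq/ksoftirqd round trip */
                        blk_mq_complete_request_direct(rq, mydrv_complete_rq);
                else
                        blk_mq_complete_request(rq);
        }

The mmc change below follows the same pattern via the new can_sleep argument to mmc_blk_mq_post_req().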
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org> Acked-by: Adrian Hunter <adrian.hunter@intel.com> Link: https://lore.kernel.org/r/20211025070658.1565848-3-bigeasy@linutronix.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/mmc/core/block.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 5e0960560eab..635b79899b9f 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2051,7 +2051,8 @@ static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req) mmc_put_card(mq->card, &mq->ctx); } -static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) +static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req, + bool can_sleep) { struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req); struct mmc_request *mrq = &mqrq->brq.mrq; @@ -2063,10 +2064,14 @@ static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) * Block layer timeouts race with completions which means the normal * completion path cannot be used during recovery. */ - if (mq->in_recovery) + if (mq->in_recovery) { mmc_blk_mq_complete_rq(mq, req); - else if (likely(!blk_should_fake_timeout(req->q))) - blk_mq_complete_request(req); + } else if (likely(!blk_should_fake_timeout(req->q))) { + if (can_sleep) + blk_mq_complete_request_direct(req, mmc_blk_mq_complete); + else + blk_mq_complete_request(req); + } mmc_blk_mq_dec_in_flight(mq, req); } @@ -2087,7 +2092,7 @@ void mmc_blk_mq_recovery(struct mmc_queue *mq) mmc_blk_urgent_bkops(mq, mqrq); - mmc_blk_mq_post_req(mq, req); + mmc_blk_mq_post_req(mq, req, true); } static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, @@ -2106,7 +2111,7 @@ static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, if (prev_req) *prev_req = mq->complete_req; else - mmc_blk_mq_post_req(mq, mq->complete_req); + mmc_blk_mq_post_req(mq, mq->complete_req, true); mq->complete_req = NULL; @@ -2178,7 +2183,8 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) mq->rw_wait = false; wake_up(&mq->wait); - mmc_blk_mq_post_req(mq, req); + /* context unknown */ + mmc_blk_mq_post_req(mq, req, false); } static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) @@ -2238,7 +2244,7 @@ static int mmc_blk_mq_issue_rw_rq(struct mmc_queue *mq, err = mmc_start_request(host, &mqrq->brq.mrq); if (prev_req) - mmc_blk_mq_post_req(mq, prev_req); + mmc_blk_mq_post_req(mq, prev_req, true); if (err) mq->rw_wait = false; From 790cf9c84837b232eb413b8b6b5d57817176cb23 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 25 Nov 2021 14:36:34 +0100 Subject: [PATCH 044/132] block: Provide blk_mq_sched_get_icq() Currently we lookup ICQ only after the request is allocated. However BFQ will want to decide how many scheduler tags it allows a given bfq queue (effectively a process) to consume based on cgroup weight. So provide a function blk_mq_sched_get_icq() so that BFQ can lookup ICQ earlier. 
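To illustrate the intended use: an I/O scheduler can now consult the submitting process's icq from its ->limit_depth() hook, i.e. before any request or tag exists for the I/O. A minimal sketch, assuming a hypothetical scheduler (the my_sched_* names are made up) that keeps per-process state off the icq:

        static void my_sched_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
        {
                struct io_cq *icq = blk_mq_sched_get_icq(data->q);

                if (!icq)
                        return; /* no io_context could be created, leave the depth alone */
                /* ... derive data->shallow_depth from the state hanging off icq ... */
        }

This is the pattern BFQ adopts a few patches later in bfq_limit_depth().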
Acked-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211125133645.27483-1-jack@suse.cz Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq-sched.c | 26 +++++++++++++++----------- block/blk-mq-sched.h | 1 + 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index b942b38000e5..98c6a97729f2 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -18,9 +18,8 @@ #include "blk-mq-tag.h" #include "blk-wbt.h" -void blk_mq_sched_assign_ioc(struct request *rq) +struct io_cq *blk_mq_sched_get_icq(struct request_queue *q) { - struct request_queue *q = rq->q; struct io_context *ioc; struct io_cq *icq; @@ -28,22 +27,27 @@ void blk_mq_sched_assign_ioc(struct request *rq) if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); - /* - * May not have an IO context if it's a passthrough request - */ + /* May not have an IO context if context creation failed */ ioc = current->io_context; if (!ioc) - return; + return NULL; spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(ioc, q); spin_unlock_irq(&q->queue_lock); + if (icq) + return icq; + return ioc_create_icq(ioc, q, GFP_ATOMIC); +} +EXPORT_SYMBOL(blk_mq_sched_get_icq); - if (!icq) { - icq = ioc_create_icq(ioc, q, GFP_ATOMIC); - if (!icq) - return; - } +void blk_mq_sched_assign_ioc(struct request *rq) +{ + struct io_cq *icq; + + icq = blk_mq_sched_get_icq(rq->q); + if (!icq) + return; get_io_context(icq->ioc); rq->elv.icq = icq; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 25d1034952b6..add651ec06da 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -8,6 +8,7 @@ #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) +struct io_cq *blk_mq_sched_get_icq(struct request_queue *q); void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, From 98f044999ba141a6b6b79cb3a996a73f05a46820 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 25 Nov 2021 14:36:35 +0100 Subject: [PATCH 045/132] bfq: Track number of allocated requests in bfq_entity When we want to limit number of requests used by each bfqq and also cgroup, we need to track also number of requests used by each cgroup. So track number of allocated requests for each bfq_entity. 
Acked-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211125133645.27483-2-jack@suse.cz Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 28 ++++++++++++++++++++++------ block/bfq-iosched.h | 5 +++-- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1ce1a99a7160..1d564499614e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1113,7 +1113,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { - return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - + return bfqq->ref - bfqq->entity.allocated - + bfqq->entity.on_st_or_in_serv - (bfqq->weight_counter != NULL) - bfqq->stable_ref; } @@ -5878,6 +5879,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, } } +static void bfqq_request_allocated(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated++; +} + +static void bfqq_request_freed(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated--; +} + /* returns true if it causes the idle timer to be disabled */ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { @@ -5891,8 +5908,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. */ - new_bfqq->allocated++; - bfqq->allocated--; + bfqq_request_allocated(new_bfqq); + bfqq_request_freed(bfqq); new_bfqq->ref++; /* * If the bic associated with the process @@ -6251,8 +6268,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) { - bfqq->allocated--; - + bfqq_request_freed(bfqq); bfq_put_queue(bfqq); } @@ -6674,7 +6690,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) } } - bfqq->allocated++; + bfqq_request_allocated(bfqq); bfqq->ref++; bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index a73488eec8a4..3787cfb0febb 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -170,6 +170,9 @@ struct bfq_entity { /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ int budget; + /* Number of requests allocated in the subtree of this entity */ + int allocated; + /* device weight, if non-zero, it overrides the default weight of * bfq_group_data */ int dev_weight; @@ -266,8 +269,6 @@ struct bfq_queue { struct request *next_rq; /* number of sync and async requests queued */ int queued[2]; - /* number of requests currently allocated */ - int allocated; /* number of pending metadata requests */ int meta_pending; /* fifo list of requests in sort_list */ From 44dfa279f117646163db0c8760addb45dd6a0e8c Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 25 Nov 2021 14:36:36 +0100 Subject: [PATCH 046/132] bfq: Store full bitmap depth in bfq_data Store bitmap depth shift inside bfq_data so that we can use it in bfq_limit_depth() for proportioning when limiting number of available request tags for a cgroup. 
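For orientation, plugging an illustrative tag map of shift 6 (64 tags) into the expressions in the hunk below gives the following in-word depths: with no weight-raised queues, 32 for async I/O and 48 for sync writes; with weight-raised queues busy, 12 for async I/O and 24 for sync writes. The stored full_depth_shift (6 here) is what the next patch uses to scale these depths against nr_requests.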
Acked-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211125133645.27483-3-jack@suse.cz Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 10 ++++++---- block/bfq-iosched.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1d564499614e..cf9247301e3c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6857,7 +6857,9 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { unsigned int i, j, min_shallow = UINT_MAX; + unsigned int depth = 1U << bt->sb.shift; + bfqd->full_depth_shift = bt->sb.shift; /* * In-word depths if no bfq_queue is being weight-raised: * leaving 25% of tags only for sync reads. @@ -6869,13 +6871,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * limit 'something'. */ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); + bfqd->word_depths[0][0] = max(depth >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); + bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -6885,9 +6887,9 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * shortage. */ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); + bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); + bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); for (i = 0; i < 2; i++) for (j = 0; j < 2; j++) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 3787cfb0febb..820cb8c2d1fe 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -769,6 +769,7 @@ struct bfq_data { * function) */ unsigned int word_depths[2][2]; + unsigned int full_depth_shift; }; enum bfqq_state_flags { From 76f1df88bbc2f984eb0418cc90de0a8384e63604 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 25 Nov 2021 14:36:37 +0100 Subject: [PATCH 047/132] bfq: Limit number of requests consumed by each cgroup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cgroup IO scheduling is used with BFQ it does not really provide service differentiation if the cgroup drives a big IO depth. That for example happens with writeback which asynchronously submits lots of IO but it can happen with AIO as well. The problem is that if we have two cgroups that submit IO with different weights, the cgroup with higher weight properly gets more IO time and is able to dispatch more IO. However this causes lower weight cgroup to accumulate more requests inside BFQ and eventually lower weight cgroup consumes most of IO scheduler tags. At that point higher weight cgroup stops getting better service as it is mostly blocked waiting for a scheduler tag while its queues inside BFQ are empty and thus lower weight cgroup gets served. Check how many requests submitting cgroup has allocated in bfq_limit_depth() and if it consumes more requests than what would correspond to its weight limit available depth to 1 so that the cgroup cannot consume many more requests. 
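A worked example may help (all numbers are illustrative, not taken from the patch): assume a scheduler tag map with sb.shift = 6, i.e. 64 tags and full_depth_shift = 6, nr_requests = 64, and no weight-raising. A sync write then gets word_depths[0][1] = 48, so the base limit becomes 64 * 48 >> 6 = 48 requests. If two sibling cgroups with weights 100 and 300 are both active, wsum = 400 at that level, so the lighter cgroup's limit is DIV_ROUND_CLOSEST(48 * 100, 400) = 12; once that cgroup has 12 or more requests allocated, bfqq_request_over_limit() returns true and the allocation depth for its I/O is clamped to 1 until some of its requests complete.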
With this limitation the higher weight cgroup gets proper service even with writeback. Reviewed-by: Michal Koutný <mkoutny@suse.com> Acked-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211125133645.27483-4-jack@suse.cz Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 135 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 117 insertions(+), 18 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cf9247301e3c..95a19d1fbedf 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -565,26 +565,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +#define BFQ_LIMIT_INLINE_DEPTH 16 + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + struct bfq_data *bfqd = bfqq->bfqd; + struct bfq_entity *entity = &bfqq->entity; + struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; + struct bfq_entity **entities = inline_entities; + int depth, level; + int class_idx = bfqq->ioprio_class - 1; + struct bfq_sched_data *sched_data; + unsigned long wsum; + bool ret = false; + + if (!entity->on_st_or_in_serv) + return false; + + /* +1 for bfqq entity, root cgroup not included */ + depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; + if (depth > BFQ_LIMIT_INLINE_DEPTH) { + entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO); + if (!entities) + return false; + } + + spin_lock_irq(&bfqd->lock); + sched_data = entity->sched_data; + /* Gather our ancestors as we need to traverse them in reverse order */ + level = 0; + for_each_entity(entity) { + /* + * If at some level entity is not even active, allow request + * queueing so that BFQ knows there's work to do and activate + * entities. + */ + if (!entity->on_st_or_in_serv) + goto out; + /* Uh, more parents than cgroup subsystem thinks? */ + if (WARN_ON_ONCE(level >= depth)) + break; + entities[level++] = entity; + } + WARN_ON_ONCE(level != depth); + for (level--; level >= 0; level--) { + entity = entities[level]; + if (level > 0) { + wsum = bfq_entity_service_tree(entity)->wsum; + } else { + int i; + /* + * For bfqq itself we take into account service trees + * of all higher priority classes and multiply their + * weights so that low prio queue from higher class + * gets more requests than high prio queue from lower + * class. + */ + wsum = 0; + for (i = 0; i <= class_idx; i++) { + wsum = wsum * IOPRIO_BE_NR + + sched_data->service_tree[i].wsum; + } + } + limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum); + if (entity->allocated >= limit) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "too many requests: allocated %d limit %d level %d", + entity->allocated, limit, level); + ret = true; + break; + } + } +out: + spin_unlock_irq(&bfqd->lock); + if (entities != inline_entities) + kfree(entities); + return ret; +} +#else +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + return false; +} +#endif + /* * Async I/O can easily starve sync I/O (both sync reads and sync * writes), by consuming all tags. Similarly, storms of sync writes, * such as those that sync(2) may trigger, can starve sync reads. * Limit depths of async I/O and sync writes so as to counter both * problems. + * + * Also if a bfq queue or its parent cgroup consume more tags than would be + * appropriate for their weight, we trim the available tag depth to 1. 
This + * avoids a situation where one cgroup can starve another cgroup from tags and + * thus block service differentiation among cgroups. Note that because the + * queue / cgroup already has many requests allocated and queued, this does not + * significantly affect service guarantees coming from the BFQ scheduling + * algorithm. */ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(blk_mq_sched_get_icq(data->q)); + struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL; + int depth; + unsigned limit = data->q->nr_requests; - if (op_is_sync(op) && !op_is_write(op)) - return; + /* Sync reads have full depth available */ + if (op_is_sync(op) && !op_is_write(op)) { + depth = 0; + } else { + depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + limit = (limit * depth) >> bfqd->full_depth_shift; + } - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + /* + * Does queue (or any parent entity) exceed number of requests that + * should be available to it? Heavily limit depth so that it cannot + * consume more available requests and thus starve other entities. + */ + if (bfqq && bfqq_request_over_limit(bfqq, limit)) + depth = 1; bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(op), - data->shallow_depth); + __func__, bfqd->wr_busy_queues, op_is_sync(op), depth); + if (depth) + data->shallow_depth = depth; } static struct bfq_queue * @@ -6853,10 +6961,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) * See the comments on bfq_limit_depth for the purpose of * the depths set in the function. Return minimum shallow depth we'll use. */ -static unsigned int bfq_update_depths(struct bfq_data *bfqd, - struct sbitmap_queue *bt) +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { - unsigned int i, j, min_shallow = UINT_MAX; unsigned int depth = 1U << bt->sb.shift; bfqd->full_depth_shift = bt->sb.shift; @@ -6890,22 +6996,15 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - min_shallow = min(min_shallow, bfqd->word_depths[i][j]); - - return min_shallow; } static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int min_shallow; - min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); + bfq_update_depths(bfqd, &tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) From 1f18b7005b49b96782cd984babd59c286973b526 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 25 Nov 2021 14:36:38 +0100 Subject: [PATCH 048/132] bfq: Limit waker detection in time Currently, when process A starts issuing requests shortly after process B has completed some IO three times in a row, we decide that B is a "waker" of A meaning that completing IO of B is needed for A to make progress and generally stop separating A's and B's IO much. 
This logic is useful to avoid unnecessary idling and thus throughput loss for cases where workload needs to switch e.g. between the process and the journaling thread doing IO. However the detection heuristic tends to frequently give false positives when A and B are fighting IO bandwidth and other processes aren't doing much IO as we are basically deemed to eventually accumulate three occurences of a situation where one process starts issuing requests after the other has completed some IO. To reduce these false positives, cancel the waker detection also if we didn't accumulate three detected wakeups within given timeout. The rationale is that if wakeups are really rare, the pointless idling doesn't hurt throughput that much anyway. This significantly reduces false waker detection for workload like: [global] directory=/mnt/repro/ rw=write size=8g time_based runtime=30 ramp_time=10 blocksize=1m direct=0 ioengine=sync [slowwriter] numjobs=1 fsync=200 [fastwriter] numjobs=1 fsync=200 Acked-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211125133645.27483-5-jack@suse.cz Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 38 +++++++++++++++++++++++--------------- block/bfq-iosched.h | 2 ++ 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 95a19d1fbedf..83a2225e407b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2091,20 +2091,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) * aspect, see the comments on the choice of the queue for injection * in bfq_select_queue(). * - * Turning back to the detection of a waker queue, a queue Q is deemed - * as a waker queue for bfqq if, for three consecutive times, bfqq - * happens to become non empty right after a request of Q has been - * completed. In this respect, even if bfqq is empty, we do not check - * for a waker if it still has some in-flight I/O. In fact, in this - * case bfqq is actually still being served by the drive, and may - * receive new I/O on the completion of some of the in-flight - * requests. In particular, on the first time, Q is tentatively set as - * a candidate waker queue, while on the third consecutive time that Q - * is detected, the field waker_bfqq is set to Q, to confirm that Q is - * a waker queue for bfqq. These detection steps are performed only if - * bfqq has a long think time, so as to make it more likely that - * bfqq's I/O is actually being blocked by a synchronization. This - * last filter, plus the above three-times requirement, make false + * Turning back to the detection of a waker queue, a queue Q is deemed as a + * waker queue for bfqq if, for three consecutive times, bfqq happens to become + * non empty right after a request of Q has been completed within given + * timeout. In this respect, even if bfqq is empty, we do not check for a waker + * if it still has some in-flight I/O. In fact, in this case bfqq is actually + * still being served by the drive, and may receive new I/O on the completion + * of some of the in-flight requests. In particular, on the first time, Q is + * tentatively set as a candidate waker queue, while on the third consecutive + * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q + * is a waker queue for bfqq. 
These detection steps are performed only if bfqq + * has a long think time, so as to make it more likely that bfqq's I/O is + * actually being blocked by a synchronization. This last filter, plus the + * above three-times requirement and time limit for detection, make false * positives less likely. * * NOTE @@ -2136,8 +2135,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) return; + /* + * We reset waker detection logic also if too much time has passed + * since the first detection. If wakeups are rare, pointless idling + * doesn't hurt throughput that much. The condition below makes sure + * we do not uselessly idle blocking waker in more than 1/64 cases. + */ if (bfqd->last_completed_rq_bfqq != - bfqq->tentative_waker_bfqq) { + bfqq->tentative_waker_bfqq || + now_ns > bfqq->waker_detection_started + + 128 * (u64)bfqd->bfq_slice_idle) { /* * First synchronization detected with a * candidate waker queue, or with a different @@ -2146,6 +2153,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->tentative_waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->num_waker_detections = 1; + bfqq->waker_detection_started = now_ns; } else /* Same tentative waker queue detected again */ bfqq->num_waker_detections++; diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 820cb8c2d1fe..bb8180c52a31 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -388,6 +388,8 @@ struct bfq_queue { struct bfq_queue *tentative_waker_bfqq; /* number of times the same tentative waker has been detected */ unsigned int num_waker_detections; + /* time when we started considering this waker */ + u64 waker_detection_started; /* node for woken_list, see below */ struct hlist_node woken_list_node; From 582f04e19ad7b41df993c669805e48a01bcd9c5b Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 25 Nov 2021 14:36:39 +0100 Subject: [PATCH 049/132] bfq: Provide helper to generate bfqq name Instead of having helper formating bfqq pid, provide a helper to generate full bfqq name as used in the traces. It saves some code duplication and will save more in the coming tracepoints. Acked-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211125133645.27483-6-jack@suse.cz Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index bb8180c52a31..07288b9da389 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -25,7 +25,7 @@ #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -#define MAX_PID_STR_LENGTH 12 +#define MAX_BFQQ_NAME_LENGTH 16 /* * Soft real-time applications are extremely more latency sensitive @@ -1083,26 +1083,27 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ /* Logging facilities. */ -static inline void bfq_pid_to_str(int pid, char *str, int len) +static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len) { - if (pid != -1) - snprintf(str, len, "%d", pid); + char type = bfq_bfqq_sync(bfqq) ? 
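Based on the format strings added here and the bfq_bfqq_name() output from the previous patch, the message part of the resulting blktrace annotations looks roughly like this (pids invented for illustration; "tenative" follows the spelling in the format string):

        bfq1234S set tenative waker bfq5678S
        bfq1234S set waker bfq5678S

i.e. the sync queue of pid 1234 first tentatively and then definitively detects the sync queue of pid 5678 as its waker.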
'S' : 'A'; + + if (bfqq->pid != -1) + snprintf(str, len, "bfq%d%c", bfqq->pid, type); else - snprintf(str, len, "SHARED-"); + snprintf(str, len, "bfqSHARED-%c", type); } #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ - "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ + "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ @@ -1113,13 +1114,11 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #else /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ - blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - ##args); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) From 1eb17f5e15b73669df635fb07df2853cb1244a69 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 25 Nov 2021 14:36:40 +0100 Subject: [PATCH 050/132] bfq: Log waker detections Waker - wakee relationships are important in deciding whether one queue can preempt the other one. Print information about detected waker-wakee relationships so that scheduling decisions can be better understood from block traces. 
Acked-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211125133645.27483-7-jack@suse.cz Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 83a2225e407b..69144003a694 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2127,6 +2127,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, u64 now_ns) { + char waker_name[MAX_BFQQ_NAME_LENGTH]; + if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || @@ -2154,12 +2156,18 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->last_completed_rq_bfqq; bfqq->num_waker_detections = 1; bfqq->waker_detection_started = now_ns; + bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name); } else /* Same tentative waker queue detected again */ bfqq->num_waker_detections++; if (bfqq->num_waker_detections == 3) { bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->tentative_waker_bfqq = NULL; + bfq_bfqq_name(bfqq->waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name); /* * If the waker queue disappears, then From c65e6fd460b4df796ecd6ea22e132076ed1f2820 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 25 Nov 2021 14:36:41 +0100 Subject: [PATCH 051/132] bfq: Do not let waker requests skip proper accounting Commit 7cc4ffc55564 ("block, bfq: put reqs of waker and woken in dispatch list") added a condition to bfq_insert_request() which added waker's requests directly to dispatch list. The rationale was that completing waker's IO is needed to get more IO for the current queue. Although this rationale is valid, there is a hole in it. The waker does not necessarily serve the IO only for the current queue and maybe it's current IO is not needed for current queue to make progress. Furthermore injecting IO like this completely bypasses any service accounting within bfq and thus we do not properly track how much service is waker's queue getting or that the waker is actually doing any IO. Depending on the conditions this can result in the waker getting too much or too few service. Consider for example the following job file: [global] directory=/mnt/repro/ rw=write size=8g time_based runtime=30 ramp_time=10 blocksize=1m direct=0 ioengine=sync [slowwriter] numjobs=1 prioclass=2 prio=7 fsync=200 [fastwriter] numjobs=1 prioclass=2 prio=0 fsync=200 Despite processes have very different IO priorities, they get the same about of service. The reason is that bfq identifies these processes as having waker-wakee relationship and once that happens, IO from fastwriter gets injected during slowwriter's time slice. As a result bfq is not aware that fastwriter has any IO to do and constantly schedules only slowwriter's queue. Thus fastwriter is forced to compete with slowwriter's IO all the time instead of getting its share of time based on IO priority. Drop the special injection condition from bfq_insert_request(). As a result, requests will be tracked and queued in a normal way and on next dispatch bfq_select_queue() can decide whether the waker's inserted requests should be injected during the current queue's timeslice or not. 
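As an aside, the "prioclass" and "prio" values in the job file correspond to the ioprio_set(2) interface. The snippet below is an illustrative userspace sketch, not part of this patch; glibc has no wrapper for the syscall, so the encoding macros are restated here and are meant to mirror include/uapi/linux/ioprio.h.

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_CLASS_BE		2	/* best-effort, i.e. prioclass=2 above */
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	/* like "fastwriter": best-effort level 0, the highest BE priority */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0)) < 0)
		perror("ioprio_set");

	/* "slowwriter" would use level 7 instead, the lowest BE priority */
	printf("ioprio now: %ld\n",
	       syscall(SYS_ioprio_get, IOPRIO_WHO_PROCESS, 0));
	return 0;
}

With the injection special case gone, the per-queue weights that bfq derives from these priority levels can again be honoured by the normal queue selection path.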
Fixes: 7cc4ffc55564 ("block, bfq: put reqs of waker and woken in dispatch list") Acked-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211125133645.27483-8-jack@suse.cz Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 44 +------------------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 69144003a694..85554b800970 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6132,48 +6132,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); bfqq = bfq_init_rq(rq); - - /* - * Reqs with at_head or passthrough flags set are to be put - * directly into dispatch list. Additional case for putting rq - * directly into the dispatch queue: the only active - * bfq_queues are bfqq and either its waker bfq_queue or one - * of its woken bfq_queues. The rationale behind this - * additional condition is as follows: - * - consider a bfq_queue, say Q1, detected as a waker of - * another bfq_queue, say Q2 - * - by definition of a waker, Q1 blocks the I/O of Q2, i.e., - * some I/O of Q1 needs to be completed for new I/O of Q2 - * to arrive. A notable example of waker is journald - * - so, Q1 and Q2 are in any respect the queues of two - * cooperating processes (or of two cooperating sets of - * processes): the goal of Q1's I/O is doing what needs to - * be done so that new Q2's I/O can finally be - * issued. Therefore, if the service of Q1's I/O is delayed, - * then Q2's I/O is delayed too. Conversely, if Q2's I/O is - * delayed, the goal of Q1's I/O is hindered. - * - as a consequence, if some I/O of Q1/Q2 arrives while - * Q2/Q1 is the only queue in service, there is absolutely - * no point in delaying the service of such an I/O. The - * only possible result is a throughput loss - * - so, when the above condition holds, the best option is to - * have the new I/O dispatched as soon as possible - * - the most effective and efficient way to attain the above - * goal is to put the new I/O directly in the dispatch - * list - * - as an additional restriction, Q1 and Q2 must be the only - * busy queues for this commit to put the I/O of Q2/Q1 in - * the dispatch list. This is necessary, because, if also - * other queues are waiting for service, then putting new - * I/O directly in the dispatch list may evidently cause a - * violation of service guarantees for the other queues - */ - if (!bfqq || - (bfqq != bfqd->in_service_queue && - bfqd->in_service_queue != NULL && - bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) && - (bfqq->waker_bfqq == bfqd->in_service_queue || - bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) { + if (!bfqq || at_head) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else @@ -6200,7 +6159,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * merge). */ cmd_flags = rq->cmd_flags; - spin_unlock_irq(&bfqd->lock); bfq_update_insert_stats(q, bfqq, idle_timer_disabled, From 5f480b1a6325748f26999e2151c9912e00cc4087 Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Sat, 27 Nov 2021 00:19:43 +0800 Subject: [PATCH 052/132] blk-mq: use bio->bi_opf after bio is checked bio->bi_opf isn't finalized before checking the bio, so use it after submit_bio_checks() returns. 
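The pattern is easier to see in isolation. Below is a minimal sketch of it with hypothetical demo_* types, not the blk-mq code itself: a snapshot taken before the checking function can go stale if that function still rewrites the field, so the copy has to be taken afterwards.

struct demo_bio { unsigned int bi_opf; };

/* Stand-in for submit_bio_checks(): it may still add or clear flags. */
static void demo_checks(struct demo_bio *bio)
{
	bio->bi_opf |= 1u << 11;	/* hypothetical flag, for illustration */
}

unsigned int demo_broken(struct demo_bio *bio)
{
	unsigned int cmd_flags = bio->bi_opf;	/* snapshot taken too early */

	demo_checks(bio);
	return cmd_flags;			/* stale value */
}

unsigned int demo_fixed(struct demo_bio *bio)
{
	demo_checks(bio);
	return bio->bi_opf;			/* value is final here */
}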
Fixes: 5b13bc8a3fd5 ("blk-mq: cleanup request allocation") Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a89a624dd1df..143a8edf6300 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2713,7 +2713,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .nr_tags = 1, - .cmd_flags = bio->bi_opf, }; struct request *rq; @@ -2726,6 +2725,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_throttle(q, bio); + /* ->bi_opf is finalized after submit_bio_checks() returns */ + data.cmd_flags = bio->bi_opf; if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; From e92a559e6c9db93662a6071f1ecbfa2c1c3be5d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:04 +0100 Subject: [PATCH 053/132] RDMA/qib: rename copy_io to qib_copy_io Add the proper module prefix to avoid conflicts with a function in the scheduler. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-2-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/infiniband/hw/qib/qib_verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ef91bff5c23c..0080f0be72fe 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -425,7 +425,7 @@ static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) } #endif -static void copy_io(u32 __iomem *piobuf, struct rvt_sge_state *ss, +static void qib_copy_io(u32 __iomem *piobuf, struct rvt_sge_state *ss, u32 length, unsigned flush_wc) { u32 extra = 0; @@ -975,7 +975,7 @@ static int qib_verbs_send_pio(struct rvt_qp *qp, struct ib_header *ibhdr, qib_pio_copy(piobuf, addr, dwords); goto done; } - copy_io(piobuf, ss, len, flush_wc); + qib_copy_io(piobuf, ss, len, flush_wc); done: if (dd->flags & QIB_USE_SPCL_TRIG) { u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; From 88c9a2ce520ba381bb70658c80ec704f4d60f728 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:05 +0100 Subject: [PATCH 054/132] fork: move copy_io to block/blk-ioc.c Move the copying of the I/O context to the block layer as that is where we can use the proper low-level interfaces. 
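To make the two cases handled by copy_io() concrete, here is a hedged userspace sketch of the behaviour it implements; it is not part of the patch, and the 64 KiB stack size is an arbitrary choice. A child created with CLONE_IO shares the parent's io_context (and with it the parent's identity as far as the I/O scheduler is concerned), while an ordinary fork gets its own context that merely inherits the ioprio value.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (64 * 1024)

static int child_fn(void *arg)
{
	/* I/O submitted here is charged to the shared io_context */
	return 0;
}

int main(void)
{
	char *stack = malloc(STACK_SIZE);
	pid_t pid;

	if (!stack)
		return 1;
	/* CLONE_IO: child and parent keep using the same io_context */
	pid = clone(child_fn, stack + STACK_SIZE, CLONE_IO | SIGCHLD, NULL);
	if (pid < 0)
		perror("clone");
	else
		waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}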
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-3-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 27 +++++++++++++++++++++++++++ include/linux/iocontext.h | 23 +++++++++++++---------- kernel/fork.c | 26 -------------------------- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 70c99e85aee5..3b31cfad4b75 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -322,6 +322,33 @@ struct io_context *get_task_io_context(struct task_struct *task, return NULL; } +int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + struct io_context *ioc = current->io_context; + struct io_context *new_ioc; + + /* + * Share io context with parent, if CLONE_IO is set + */ + if (clone_flags & CLONE_IO) { + get_io_context_active(ioc); + + WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); + atomic_inc(&ioc->nr_tasks); + + tsk->io_context = ioc; + } else if (ioprio_valid(ioc->ioprio)) { + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!new_ioc)) + return -ENOMEM; + + new_ioc->ioprio = ioc->ioprio; + put_io_context(new_ioc); + } + + return 0; +} + /** * ioc_lookup_icq - lookup io_cq from ioc * @ioc: the associated io_context diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 0a9dc40b7be8..bcd47d104d8e 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -129,14 +129,6 @@ static inline void get_io_context_active(struct io_context *ioc) atomic_inc(&ioc->active_ref); } -static inline void ioc_task_link(struct io_context *ioc) -{ - get_io_context_active(ioc); - - WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); - atomic_inc(&ioc->nr_tasks); -} - struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); @@ -144,10 +136,21 @@ void put_io_context_active(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); +int __copy_io(unsigned long clone_flags, struct task_struct *tsk); +static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + if (!current->io_context) + return 0; + return __copy_io(clone_flags, tsk); +} #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -#endif +static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + return 0; +} +#endif /* CONFIG_BLOCK */ -#endif +#endif /* IOCONTEXT_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..3161d7980155 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1556,32 +1556,6 @@ out: return error; } -static int copy_io(unsigned long clone_flags, struct task_struct *tsk) -{ -#ifdef CONFIG_BLOCK - struct io_context *ioc = current->io_context; - struct io_context *new_ioc; - - if (!ioc) - return 0; - /* - * Share io context with parent, if CLONE_IO is set - */ - if (clone_flags & CLONE_IO) { - ioc_task_link(ioc); - tsk->io_context = ioc; - } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) - return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); - } -#endif - return 0; -} - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; From 836b394b633e3d618fa44292290bf3d9a1761e0c Mon Sep 17 00:00:00 2001 From: Christoph 
Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:06 +0100 Subject: [PATCH 055/132] bfq: simplify bfq_bic_lookup Remove the unused bfqd argument, and hardcode ioc to current->io_context. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-4-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 85554b800970..c990c6409c11 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) /** * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. - * @bfqd: the lookup key. - * @ioc: the io_context of the process doing I/O. * @q: the request queue. */ -static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc, - struct request_queue *q) +static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) { - if (ioc) { - unsigned long flags; - struct bfq_io_cq *icq; + struct bfq_io_cq *icq; + unsigned long flags; - spin_lock_irqsave(&q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(ioc, q)); - spin_unlock_irqrestore(&q->queue_lock, flags); + if (!current->io_context) + return NULL; - return icq; - } + spin_lock_irqsave(&q->queue_lock, flags); + icq = icq_to_bic(ioc_lookup_icq(current->io_context, q)); + spin_unlock_irqrestore(&q->queue_lock, flags); - return NULL; + return icq; } /* @@ -2457,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, * returned by bfq_bic_lookup does not go away before * bfqd->lock is taken. */ - struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); + struct bfq_io_cq *bic = bfq_bic_lookup(q); bool ret; spin_lock_irq(&bfqd->lock); From a0725c22cd8487f107a80ef87abf03c6379ec927 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:07 +0100 Subject: [PATCH 056/132] bfq: use bfq_bic_lookup in bfq_limit_depth No need to create a new I/O context if there is none present yet in ->limit_depth. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-5-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c990c6409c11..ecc2e57e6863 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -663,7 +663,7 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(blk_mq_sched_get_icq(data->q)); + struct bfq_io_cq *bic = bfq_bic_lookup(data->q); struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL; int depth; unsigned limit = data->q->nr_requests; From c2a32464f449370bff27a21b64b1b7d2e1d037f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:08 +0100 Subject: [PATCH 057/132] Revert "block: Provide blk_mq_sched_get_icq()" This reverts commit 4896c4e64ba5d5d5acdbcf68c5910dd4f6d8fa62. The helper is not needed any more. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-6-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq-sched.c | 26 +++++++++++--------------- block/blk-mq-sched.h | 1 - 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 98c6a97729f2..b942b38000e5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -18,8 +18,9 @@ #include "blk-mq-tag.h" #include "blk-wbt.h" -struct io_cq *blk_mq_sched_get_icq(struct request_queue *q) +void blk_mq_sched_assign_ioc(struct request *rq) { + struct request_queue *q = rq->q; struct io_context *ioc; struct io_cq *icq; @@ -27,27 +28,22 @@ struct io_cq *blk_mq_sched_get_icq(struct request_queue *q) if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); - /* May not have an IO context if context creation failed */ + /* + * May not have an IO context if it's a passthrough request + */ ioc = current->io_context; if (!ioc) - return NULL; + return; spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(ioc, q); spin_unlock_irq(&q->queue_lock); - if (icq) - return icq; - return ioc_create_icq(ioc, q, GFP_ATOMIC); -} -EXPORT_SYMBOL(blk_mq_sched_get_icq); -void blk_mq_sched_assign_ioc(struct request *rq) -{ - struct io_cq *icq; - - icq = blk_mq_sched_get_icq(rq->q); - if (!icq) - return; + if (!icq) { + icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + if (!icq) + return; + } get_io_context(icq->ioc); rq->elv.icq = icq; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index add651ec06da..25d1034952b6 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -8,7 +8,6 @@ #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) -struct io_cq *blk_mq_sched_get_icq(struct request_queue *q); void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, From 3304742562d27fb87a6d8291cc48824dd20f6964 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:09 +0100 Subject: [PATCH 058/132] block: mark put_io_context_active static Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-7-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 2 +- include/linux/iocontext.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 3b31cfad4b75..f3ff495756cb 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -175,7 +175,7 @@ void put_io_context(struct io_context *ioc) * Undo get_io_context_active(). If active reference reaches zero after * put, @ioc can never issue further IOs and ioscheds are notified. 
*/ -void put_io_context_active(struct io_context *ioc) +static void put_io_context_active(struct io_context *ioc) { struct io_cq *icq; diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index bcd47d104d8e..3ba45953d522 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -132,7 +132,6 @@ static inline void get_io_context_active(struct io_context *ioc) struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); -void put_io_context_active(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); From 87dd1d63dcbd0f508a8b23785752e78d082fd176 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:10 +0100 Subject: [PATCH 059/132] block: move blk_mq_sched_assign_ioc to blk-ioc.c Move blk_mq_sched_assign_ioc so that many interfaces from the file can be marked static. Rename the function to ioc_find_get_icq as well and return the icq to simplify the interface. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-8-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 2 +- block/blk-ioc.c | 39 +++++++++++++++++++++++++++++++++++---- block/blk-mq-sched.c | 31 ------------------------------- block/blk-mq-sched.h | 2 -- block/blk.h | 6 +----- 5 files changed, 37 insertions(+), 43 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ecc2e57e6863..2d484d3f7f22 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6666,7 +6666,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, */ static void bfq_prepare_request(struct request *rq) { - blk_mq_sched_assign_ioc(rq); + rq->elv.icq = ioc_find_get_icq(rq->q); /* * Regardless of whether we have an icq attached, we have to diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f3ff495756cb..f4f84a2072be 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -24,7 +24,7 @@ static struct kmem_cache *iocontext_cachep; * * Increment reference count to @ioc. */ -void get_io_context(struct io_context *ioc) +static void get_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); @@ -248,7 +248,8 @@ void ioc_clear_queue(struct request_queue *q) __ioc_clear_queue(&icq_list); } -int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) +static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, + int node) { struct io_context *ioc; int ret; @@ -397,8 +398,8 @@ EXPORT_SYMBOL(ioc_lookup_icq); * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. 
*/ -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask) +static struct io_cq *ioc_create_icq(struct io_context *ioc, + struct request_queue *q, gfp_t gfp_mask) { struct elevator_type *et = q->elevator->type; struct io_cq *icq; @@ -441,6 +442,36 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, return icq; } +struct io_cq *ioc_find_get_icq(struct request_queue *q) +{ + struct io_context *ioc; + struct io_cq *icq; + + /* create task io_context, if we don't have one already */ + if (unlikely(!current->io_context)) + create_task_io_context(current, GFP_ATOMIC, q->node); + + /* + * May not have an IO context if it's a passthrough request + */ + ioc = current->io_context; + if (!ioc) + return NULL; + + spin_lock_irq(&q->queue_lock); + icq = ioc_lookup_icq(ioc, q); + spin_unlock_irq(&q->queue_lock); + + if (!icq) { + icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + if (!icq) + return NULL; + } + get_io_context(icq->ioc); + return icq; +} +EXPORT_SYMBOL_GPL(ioc_find_get_icq); + static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index b942b38000e5..0d7257848f7e 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -18,37 +18,6 @@ #include "blk-mq-tag.h" #include "blk-wbt.h" -void blk_mq_sched_assign_ioc(struct request *rq) -{ - struct request_queue *q = rq->q; - struct io_context *ioc; - struct io_cq *icq; - - /* create task io_context, if we don't have one already */ - if (unlikely(!current->io_context)) - create_task_io_context(current, GFP_ATOMIC, q->node); - - /* - * May not have an IO context if it's a passthrough request - */ - ioc = current->io_context; - if (!ioc) - return; - - spin_lock_irq(&q->queue_lock); - icq = ioc_lookup_icq(ioc, q); - spin_unlock_irq(&q->queue_lock); - - if (!icq) { - icq = ioc_create_icq(ioc, q, GFP_ATOMIC); - if (!icq) - return; - } - get_io_context(icq->ioc); - rq->elv.icq = icq; -} -EXPORT_SYMBOL_GPL(blk_mq_sched_assign_ioc); - /* * Mark a hardware queue as needing a restart. For shared queues, maintain * a count of how many hardware queues are marked for restart. 
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 25d1034952b6..025013972453 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -8,8 +8,6 @@ #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) -void blk_mq_sched_assign_ioc(struct request *rq); - bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk.h b/block/blk.h index a57c84654d0a..187cb2654ffd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -363,14 +363,10 @@ static inline unsigned int bio_aligned_discard_max_sectors( /* * Internal io_context interface */ -void get_io_context(struct io_context *ioc); +struct io_cq *ioc_find_get_icq(struct request_queue *q); struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask); void ioc_clear_queue(struct request_queue *q); -int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, From 222ee581b84582dc472d5395b77d7e0cb5268d1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:11 +0100 Subject: [PATCH 060/132] block: move the remaining elv.icq handling to the I/O scheduler After the prepare side has been moved to the only I/O scheduler that cares, do the same for the cleanup and the NULL initialization. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-9-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 12 +++++++++++- block/blk-ioc.c | 1 + block/blk-mq.c | 14 +++----------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2d484d3f7f22..8295b0f96cbf 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6569,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq) rq->elv.priv[1] = NULL; } +static void bfq_finish_request(struct request *rq) +{ + bfq_finish_requeue_request(rq); + + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } +} + /* * Removes the association between the current task and bfqq, assuming * that bic points to the bfq iocontext of the task. 
@@ -7388,7 +7398,7 @@ static struct elevator_type iosched_bfq_mq = { .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .requeue_request = bfq_finish_requeue_request, - .finish_request = bfq_finish_requeue_request, + .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f4f84a2072be..3ba15c867dfa 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -167,6 +167,7 @@ void put_io_context(struct io_context *ioc) if (free_ioc) kmem_cache_free(iocontext_cachep, ioc); } +EXPORT_SYMBOL_GPL(put_io_context); /** * put_io_context_active - put active reference on ioc diff --git a/block/blk-mq.c b/block/blk-mq.c index 143a8edf6300..3e67662f7801 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -400,7 +400,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, if (rq->rq_flags & RQF_ELV) { struct elevator_queue *e = data->q->elevator; - rq->elv.icq = NULL; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); @@ -631,16 +630,9 @@ void blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - if (rq->rq_flags & RQF_ELVPRIV) { - struct elevator_queue *e = q->elevator; - - if (e->type->ops.finish_request) - e->type->ops.finish_request(rq); - if (rq->elv.icq) { - put_io_context(rq->elv.icq->ioc); - rq->elv.icq = NULL; - } - } + if ((rq->rq_flags & RQF_ELVPRIV) && + q->elevator->type->ops.finish_request) + q->elevator->type->ops.finish_request(rq); if (rq->rq_flags & RQF_MQ_INFLIGHT) __blk_mq_dec_active_requests(hctx); From 50569c24be61eafb3efa06e2a3ccd447f75ae1b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:12 +0100 Subject: [PATCH 061/132] block: remove get_io_context_active Fold it into it's only caller, and remove a lof of the debug checks that are not needed. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-10-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 8 +++----- include/linux/iocontext.h | 16 ---------------- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 3ba15c867dfa..cc4eb2ba87f7 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -173,7 +173,7 @@ EXPORT_SYMBOL_GPL(put_io_context); * put_io_context_active - put active reference on ioc * @ioc: ioc of interest * - * Undo get_io_context_active(). If active reference reaches zero after + * Put an active reference to an ioc. If active reference reaches zero after * put, @ioc can never issue further IOs and ioscheds are notified. 
*/ static void put_io_context_active(struct io_context *ioc) @@ -333,11 +333,9 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - get_io_context_active(ioc); - - WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); + atomic_long_inc(&ioc->refcount); + atomic_inc(&ioc->active_ref); atomic_inc(&ioc->nr_tasks); - tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 3ba45953d522..c1229fbd6691 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -113,22 +113,6 @@ struct io_context { struct work_struct release_work; }; -/** - * get_io_context_active - get active reference on ioc - * @ioc: ioc of interest - * - * Only iocs with active reference can issue new IOs. This function - * acquires an active reference on @ioc. The caller must already have an - * active reference on @ioc. - */ -static inline void get_io_context_active(struct io_context *ioc) -{ - WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0); - WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0); - atomic_long_inc(&ioc->refcount); - atomic_inc(&ioc->active_ref); -} - struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); From a0f14d8baaca3e2f3e57bdb062eb476175c90e83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:13 +0100 Subject: [PATCH 062/132] block: factor out a alloc_io_context helper Factor out a helper that just allocate an I/O context. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-11-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index cc4eb2ba87f7..b42fbb82d5c0 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -249,18 +249,15 @@ void ioc_clear_queue(struct request_queue *q) __ioc_clear_queue(&icq_list); } -static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, - int node) +static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; - int ret; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) - return -ENOMEM; + return NULL; - /* initialize */ atomic_long_set(&ioc->refcount, 1); atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); @@ -268,6 +265,18 @@ static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); + return ioc; +} + +static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, + int node) +{ + struct io_context *ioc; + int ret; + + ioc = alloc_io_context(gfp_flags, node); + if (!ioc) + return -ENOMEM; /* * Try to install. ioc shouldn't be installed if someone else From 8ffc13680eac16a1eec86275b65fc6f0e27a30d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:14 +0100 Subject: [PATCH 063/132] block: use alloc_io_context in __copy_io In __copy_io we know that the newly allocate task_struct does not have an I/O context yet and is not exiting. So just allocate the I/O context struct and install it directly. There is no need to lock the task either as it is just being created. 
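A minimal sketch of why no locking is needed, using hypothetical demo_* types rather than the kernel structures: the new task is still private to the forking thread, so the context can be fully initialised first and then published with a plain store that no other observer can see half-done.

#include <stdlib.h>

struct demo_ioc {
	int refcount;
	unsigned short ioprio;
};

struct demo_task {
	struct demo_ioc *io_context;	/* NULL until set up below */
};

static int demo_copy_io(struct demo_task *child, unsigned short parent_ioprio)
{
	struct demo_ioc *ioc = calloc(1, sizeof(*ioc));

	if (!ioc)
		return -1;
	ioc->refcount = 1;
	ioc->ioprio = parent_ioprio;	/* only the priority value is inherited */
	child->io_context = ioc;	/* publish last; the child is not yet visible */
	return 0;
}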
Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-12-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index b42fbb82d5c0..f06d1040442c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -336,7 +336,6 @@ struct io_context *get_task_io_context(struct task_struct *task, int __copy_io(unsigned long clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; - struct io_context *new_ioc; /* * Share io context with parent, if CLONE_IO is set @@ -347,12 +346,10 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) atomic_inc(&ioc->nr_tasks); tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) + tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); + if (!tsk->io_context) return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); + tsk->io_context->ioprio = ioc->ioprio; } return 0; From d538ea4cb8e7241af8091eee30483fabf64444a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:15 +0100 Subject: [PATCH 064/132] block: return the io_context from create_task_io_context Grab a reference to the newly allocated or existing io_context in create_task_io_context and return it. This simplifies the callers and removes the need for double lookups. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-13-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 66 ++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f06d1040442c..5bfe810496fc 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -268,15 +268,14 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) return ioc; } -static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, - int node) +static struct io_context *create_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node) { struct io_context *ioc; - int ret; ioc = alloc_io_context(gfp_flags, node); if (!ioc) - return -ENOMEM; + return NULL; /* * Try to install. ioc shouldn't be installed if someone else @@ -292,11 +291,11 @@ static int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, else kmem_cache_free(iocontext_cachep, ioc); - ret = task->io_context ? 
0 : -EBUSY; - + ioc = task->io_context; + if (ioc) + get_io_context(ioc); task_unlock(task); - - return ret; + return ioc; } /** @@ -319,18 +318,15 @@ struct io_context *get_task_io_context(struct task_struct *task, might_sleep_if(gfpflags_allow_blocking(gfp_flags)); - do { - task_lock(task); - ioc = task->io_context; - if (likely(ioc)) { - get_io_context(ioc); - task_unlock(task); - return ioc; - } + task_lock(task); + ioc = task->io_context; + if (unlikely(!ioc)) { task_unlock(task); - } while (!create_task_io_context(task, gfp_flags, node)); - - return NULL; + return create_task_io_context(task, gfp_flags, node); + } + get_io_context(ioc); + task_unlock(task); + return ioc; } int __copy_io(unsigned long clone_flags, struct task_struct *tsk) @@ -449,30 +445,28 @@ static struct io_cq *ioc_create_icq(struct io_context *ioc, struct io_cq *ioc_find_get_icq(struct request_queue *q) { - struct io_context *ioc; - struct io_cq *icq; + struct io_context *ioc = current->io_context; + struct io_cq *icq = NULL; - /* create task io_context, if we don't have one already */ - if (unlikely(!current->io_context)) - create_task_io_context(current, GFP_ATOMIC, q->node); + if (unlikely(!ioc)) { + ioc = create_task_io_context(current, GFP_ATOMIC, q->node); + if (!ioc) + return NULL; + } else { + get_io_context(ioc); - /* - * May not have an IO context if it's a passthrough request - */ - ioc = current->io_context; - if (!ioc) - return NULL; - - spin_lock_irq(&q->queue_lock); - icq = ioc_lookup_icq(ioc, q); - spin_unlock_irq(&q->queue_lock); + spin_lock_irq(&q->queue_lock); + icq = ioc_lookup_icq(ioc, q); + spin_unlock_irq(&q->queue_lock); + } if (!icq) { icq = ioc_create_icq(ioc, q, GFP_ATOMIC); - if (!icq) + if (!icq) { + put_io_context(ioc); return NULL; + } } - get_io_context(icq->ioc); return icq; } EXPORT_SYMBOL_GPL(ioc_find_get_icq); From 18b74c4dcad8150e855755697d4d594506e3de78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:16 +0100 Subject: [PATCH 065/132] block: simplify ioc_create_icq Remove the ioc and gfp_mask argument, which are hard coded by the caller. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-14-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 5bfe810496fc..c56648f7cad4 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -389,9 +389,7 @@ EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq - * @ioc: io_context of interest * @q: request_queue of interest - * @gfp_mask: allocation mask * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. @@ -399,19 +397,19 @@ EXPORT_SYMBOL(ioc_lookup_icq); * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. 
*/ -static struct io_cq *ioc_create_icq(struct io_context *ioc, - struct request_queue *q, gfp_t gfp_mask) +static struct io_cq *ioc_create_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ - icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, + icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO, q->node); if (!icq) return NULL; - if (radix_tree_maybe_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } @@ -461,7 +459,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) } if (!icq) { - icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + icq = ioc_create_icq(q); if (!icq) { put_io_context(ioc); return NULL; From eca5892a5d616d39185d652820931f21cab2f190 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 12:58:17 +0100 Subject: [PATCH 066/132] block: simplify ioc_lookup_icq Remove the ioc argument as it always points to current->io_context. Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211126115817.2087431-15-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bfq-iosched.c | 2 +- block/blk-ioc.c | 8 ++++---- block/blk.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 8295b0f96cbf..0c612a911696 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -444,7 +444,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) return NULL; spin_lock_irqsave(&q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(current->io_context, q)); + icq = icq_to_bic(ioc_lookup_icq(q)); spin_unlock_irqrestore(&q->queue_lock, flags); return icq; diff --git a/block/blk-ioc.c b/block/blk-ioc.c index c56648f7cad4..536fb496ad76 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -353,14 +353,14 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) /** * ioc_lookup_icq - lookup io_cq from ioc - * @ioc: the associated io_context * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * with @q->queue_lock held. 
*/ -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) +struct io_cq *ioc_lookup_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct io_cq *icq; lockdep_assert_held(&q->queue_lock); @@ -430,7 +430,7 @@ static struct io_cq *ioc_create_icq(struct request_queue *q) et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); - icq = ioc_lookup_icq(ioc, q); + icq = ioc_lookup_icq(q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } @@ -454,7 +454,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) get_io_context(ioc); spin_lock_irq(&q->queue_lock); - icq = ioc_lookup_icq(ioc, q); + icq = ioc_lookup_icq(q); spin_unlock_irq(&q->queue_lock); } diff --git a/block/blk.h b/block/blk.h index 187cb2654ffd..3be0fdf76c9a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -364,7 +364,7 @@ static inline unsigned int bio_aligned_discard_max_sectors( * Internal io_context interface */ struct io_cq *ioc_find_get_icq(struct request_queue *q); -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); +struct io_cq *ioc_lookup_icq(struct request_queue *q); void ioc_clear_queue(struct request_queue *q); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW From af22fef3e7a51cbd339814a0e196086e2bb2aa26 Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@googlemail.com> Date: Fri, 26 Nov 2021 23:06:52 +0000 Subject: [PATCH 067/132] block: Remove redundant initialization of variable ret The variable ret is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Link: https://lore.kernel.org/r/20211126230652.1175636-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bdev.c b/block/bdev.c index e9ada04e71be..587645231d60 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -665,7 +665,7 @@ static void blkdev_flush_mapping(struct block_device *bdev) static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; - int ret = 0; + int ret; if (disk->fops->open) { ret = disk->fops->open(bdev, mode); From 82baa324dc41005ab8afc5afee5bfe9cc54f12c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 13:17:58 +0100 Subject: [PATCH 068/132] mtd_blkdevs: remove the sector out of range check in do_blktrans_request The block layer already performs this check, no need to duplicate it in the driver. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com> Reviewed-by: Martin K. 
Petersen <martin.petersen@oracle.com> Link: https://lore.kernel.org/r/20211126121802.2090656-2-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/mtd/mtd_blkdevs.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 66f81d42fe77..113f86df7603 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -54,17 +54,11 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, block = blk_rq_pos(req) << 9 >> tr->blkshift; nsect = blk_rq_cur_bytes(req) >> tr->blkshift; - if (req_op(req) == REQ_OP_FLUSH) { + switch (req_op(req)) { + case REQ_OP_FLUSH: if (tr->flush(dev)) return BLK_STS_IOERR; return BLK_STS_OK; - } - - if (blk_rq_pos(req) + blk_rq_cur_sectors(req) > - get_capacity(req->rq_disk)) - return BLK_STS_IOERR; - - switch (req_op(req)) { case REQ_OP_DISCARD: if (tr->discard(dev, block, nsect)) return BLK_STS_IOERR; From 79bb1dbd12005f2143670a9a4f13d91e64725717 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 13:17:59 +0100 Subject: [PATCH 069/132] block: don't check ->rq_disk in merges There is a 1:1 relationship between request_queues and gendisks now, so no need for these extra checks. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Link: https://lore.kernel.org/r/20211126121802.2090656-3-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-merge.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index e07f5a1ae86e..4de34a332c9f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -777,8 +777,7 @@ static struct request *attempt_merge(struct request_queue *q, if (req_op(req) != req_op(next)) return NULL; - if (rq_data_dir(req) != rq_data_dir(next) - || req->rq_disk != next->rq_disk) + if (rq_data_dir(req) != rq_data_dir(next)) return NULL; if (req_op(req) == REQ_OP_WRITE_SAME && @@ -905,10 +904,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; - /* must be same device */ - if (rq->rq_disk != bio->bi_bdev->bd_disk) - return false; - /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; From f3fa33acca9f0058157214800f68b10d8e71ab7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 13:18:00 +0100 Subject: [PATCH 070/132] block: remove the ->rq_disk field in struct request Just use the disk attached to the request_queue instead. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Reviewed-by: Martin K. 
Petersen <martin.petersen@oracle.com> Link: https://lore.kernel.org/r/20211126121802.2090656-4-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-flush.c | 3 +-- block/blk-mq.c | 14 ++++++-------- block/blk.h | 2 +- drivers/block/amiflop.c | 2 +- drivers/block/ataflop.c | 6 +++--- drivers/block/floppy.c | 6 +++--- drivers/block/null_blk/trace.h | 2 +- drivers/block/paride/pcd.c | 2 +- drivers/block/paride/pd.c | 4 ++-- drivers/block/paride/pf.c | 4 ++-- drivers/block/rnbd/rnbd-clt.c | 4 ++-- drivers/block/sunvdc.c | 2 +- drivers/md/dm-mpath.c | 1 - drivers/mmc/core/block.c | 2 +- drivers/nvme/host/fault_inject.c | 2 +- drivers/nvme/host/trace.h | 6 +++--- drivers/scsi/scsi_lib.c | 3 ++- drivers/scsi/scsi_logging.c | 4 +++- drivers/scsi/sd.c | 24 ++++++++++++------------ drivers/scsi/sd_zbc.c | 8 ++++---- drivers/scsi/sr.c | 4 ++-- drivers/scsi/virtio_scsi.c | 2 +- drivers/usb/storage/transport.c | 2 +- include/linux/blk-mq.h | 4 ---- include/scsi/scsi_cmnd.h | 2 +- include/scsi/scsi_device.h | 4 ++-- include/trace/events/block.h | 8 ++++---- kernel/trace/blktrace.c | 2 +- 28 files changed, 62 insertions(+), 67 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 902e80e48e4a..fd5187a0898d 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -145,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct block_device *part = rq->rq_disk->part0; + struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); @@ -339,7 +339,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; - flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one diff --git a/block/blk-mq.c b/block/blk-mq.c index 3e67662f7801..f1abfd2e24f7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -377,7 +377,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; - rq->rq_disk = NULL; rq->part = NULL; #ifdef CONFIG_BLK_RQ_ALLOC_TIME rq->alloc_time_ns = alloc_time_ns; @@ -659,7 +658,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, - rq->rq_disk ? rq->rq_disk->disk_name : "?", + rq->q->disk ? rq->q->disk->disk_name : "?", (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", @@ -712,7 +711,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status) "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", blk_status_to_str(status), - req->rq_disk ? req->rq_disk->disk_name : "?", + req->q->disk ? 
req->q->disk->disk_name : "?", blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), req->cmd_flags & ~REQ_OP_MASK, req->nr_phys_segments, @@ -853,8 +852,8 @@ static void __blk_account_io_start(struct request *rq) /* passthrough requests can hold bios that do not have ->bi_bdev set */ if (rq->bio && rq->bio->bi_bdev) rq->part = rq->bio->bi_bdev; - else - rq->part = rq->rq_disk->part0; + else if (rq->q->disk) + rq->part = rq->q->disk->part0; part_stat_lock(); update_io_ticks(rq->part, jiffies, false); @@ -1172,7 +1171,6 @@ void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); - rq->rq_disk = bd_disk; rq->end_io = done; blk_account_io_start(rq); @@ -2902,8 +2900,8 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * if (ret != BLK_STS_OK) return ret; - if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) + if (rq->q->disk && + should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) diff --git a/block/blk.h b/block/blk.h index 3be0fdf76c9a..a55d82c3d1c2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -324,7 +324,7 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk; + return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk; } void update_io_ticks(struct block_device *part, unsigned long now, bool end); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 1eec5113d0b5..5a566f2fd533 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1505,7 +1505,7 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; - struct amiga_floppy_struct *floppy = rq->rq_disk->private_data; + struct amiga_floppy_struct *floppy = rq->q->disk->private_data; blk_status_t err; if (!spin_trylock_irq(&amiflop_lock)) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index f3ff9babdb5c..5d819a466e2f 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1502,7 +1502,7 @@ static void setup_req_params( int drive ) static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { - struct atari_floppy_struct *floppy = bd->rq->rq_disk->private_data; + struct atari_floppy_struct *floppy = bd->rq->q->disk->private_data; int drive = floppy - unit; int type = floppy->type; @@ -1538,7 +1538,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, if (!UDT) { Probing = 1; UDT = atari_disk_type + StartDiskType[DriveType]; - set_capacity(bd->rq->rq_disk, UDT->blocks); + set_capacity(bd->rq->q->disk, UDT->blocks); UD.autoprobe = 1; } } @@ -1558,7 +1558,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, } type = minor2disktype[type].index; UDT = &atari_disk_type[type]; - set_capacity(bd->rq->rq_disk, UDT->blocks); + set_capacity(bd->rq->q->disk, UDT->blocks); UD.autoprobe = 0; } diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 7f0a60c4079f..0c638de25023 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2259,7 +2259,7 @@ static int do_format(int drive, struct format_descr *tmp_format_req) static void floppy_end_request(struct request *req, blk_status_t error) { unsigned int nr_sectors = current_count_sectors; - unsigned int drive = (unsigned long)req->rq_disk->private_data; + unsigned int drive = (unsigned 
long)req->q->disk->private_data; /* current_count_sectors can be zero if transfer failed */ if (error) @@ -2550,7 +2550,7 @@ static int make_raw_rw_request(void) if (WARN(max_buffer_sectors == 0, "VFS: Block I/O scheduled on unopened device\n")) return 0; - set_fdc((long)current_req->rq_disk->private_data); + set_fdc((long)current_req->q->disk->private_data); raw_cmd = &default_raw_cmd; raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; @@ -2792,7 +2792,7 @@ do_request: return; } } - drive = (long)current_req->rq_disk->private_data; + drive = (long)current_req->q->disk->private_data; set_fdc(drive); reschedule_timeout(current_drive, "redo fd request"); diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h index ce3b430e88c5..86d6c12c603c 100644 --- a/drivers/block/null_blk/trace.h +++ b/drivers/block/null_blk/trace.h @@ -44,7 +44,7 @@ TRACE_EVENT(nullb_zone_op, __entry->op = req_op(cmd->rq); __entry->zone_no = zone_no; __entry->zone_cond = zone_cond; - __assign_disk_name(__entry->disk, cmd->rq->rq_disk); + __assign_disk_name(__entry->disk, cmd->rq->q->disk); ), TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", __print_disk_name(__entry->disk), diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 255fd3d4b8a8..f462ad67931a 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -690,7 +690,7 @@ static void pcd_request(void) if (!pcd_req && !set_next_request()) return; - cd = pcd_req->rq_disk->private_data; + cd = pcd_req->q->disk->private_data; if (cd != pcd_current) pcd_bufblk = -1; pcd_current = cd; diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index fba865058a17..4f8cce510562 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -430,7 +430,7 @@ static void run_fsm(void) int stop = 0; if (!phase) { - pd_current = pd_req->rq_disk->private_data; + pd_current = pd_req->q->disk->private_data; pi_current = pd_current->pi; phase = do_pd_io_start; } @@ -492,7 +492,7 @@ static enum action do_pd_io_start(void) case REQ_OP_WRITE: pd_block = blk_rq_pos(pd_req); pd_count = blk_rq_cur_sectors(pd_req); - if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) + if (pd_block + pd_count > get_capacity(pd_req->q->disk)) return Fail; pd_run = blk_rq_sectors(pd_req); pd_buf = bio_data(pd_req->bio); diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index b84a6448a4f7..292e9a4ce1b9 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -746,12 +746,12 @@ repeat: if (!pf_req && !set_next_request()) return; - pf_current = pf_req->rq_disk->private_data; + pf_current = pf_req->q->disk->private_data; pf_block = blk_rq_pos(pf_req); pf_run = blk_rq_sectors(pf_req); pf_count = blk_rq_cur_sectors(pf_req); - if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { + if (pf_block + pf_count > get_capacity(pf_req->q->disk)) { pf_end_request(BLK_STS_IOERR); goto repeat; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 2df0657cdf00..67a8edbaa1fd 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -393,7 +393,7 @@ static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu) static void rnbd_softirq_done_fn(struct request *rq) { - struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_dev *dev = rq->q->disk->private_data; struct rnbd_clt_session *sess = dev->sess; struct rnbd_iu *iu; @@ -1133,7 +1133,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, 
const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; - struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_dev *dev = rq->q->disk->private_data; struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); int err; blk_status_t ret = BLK_STS_IOERR; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 2157936de623..146d85d80e0e 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -462,7 +462,7 @@ static int __vdc_tx_trigger(struct vdc_port *port) static int __send_request(struct request *req) { - struct vdc_port *port = req->rq_disk->private_data; + struct vdc_port *port = req->q->disk->private_data; struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; struct scatterlist sg[MAX_RING_COOKIES]; struct vdc_req_entry *rqe; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 90dc9cc48881..f4719b65e5e3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -550,7 +550,6 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_REQUEUE; } clone->bio = clone->biotail = NULL; - clone->rq_disk = bdev->bd_disk; clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; *__clone = clone; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 635b79899b9f..dc094f73d335 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1837,7 +1837,7 @@ static void mmc_blk_mq_rw_recovery(struct mmc_queue *mq, struct request *req) /* Reset if the card is in a bad state */ if (!mmc_host_is_spi(mq->card->host) && err && mmc_blk_reset(md, card->host, type)) { - pr_err("%s: recovery failed!\n", req->rq_disk->disk_name); + pr_err("%s: recovery failed!\n", req->q->disk->disk_name); mqrq->retries = MMC_NO_RETRIES; return; } diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index 1352159733b0..83d2e6860d38 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -56,7 +56,7 @@ void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject) void nvme_should_fail(struct request *req) { - struct gendisk *disk = req->rq_disk; + struct gendisk *disk = req->q->disk; struct nvme_fault_inject *fault_inject = NULL; u16 status; diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 35bac7a25422..b5f85259461a 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -68,7 +68,7 @@ TRACE_EVENT(nvme_setup_cmd, __entry->nsid = le32_to_cpu(cmd->common.nsid); __entry->metadata = !!blk_integrity_rq(req); __entry->fctype = cmd->fabrics.fctype; - __assign_disk_name(__entry->disk, req->rq_disk); + __assign_disk_name(__entry->disk, req->q->disk); memcpy(__entry->cdw10, &cmd->common.cdw10, sizeof(__entry->cdw10)); ), @@ -103,7 +103,7 @@ TRACE_EVENT(nvme_complete_rq, __entry->retries = nvme_req(req)->retries; __entry->flags = nvme_req(req)->flags; __entry->status = nvme_req(req)->status; - __assign_disk_name(__entry->disk, req->rq_disk); + __assign_disk_name(__entry->disk, req->q->disk); ), TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%#llx, retries=%u, flags=0x%x, status=%#x", __entry->ctrl_id, __print_disk_name(__entry->disk), @@ -153,7 +153,7 @@ TRACE_EVENT(nvme_sq, ), TP_fast_assign( __entry->ctrl_id = nvme_req(req)->ctrl->instance; - __assign_disk_name(__entry->disk, req->rq_disk); + __assign_disk_name(__entry->disk, req->q->disk); __entry->qid = nvme_req_qid(req); __entry->sq_head = le16_to_cpu(sq_head); __entry->sq_tail = sq_tail; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 
5e8b5ecb3245..c23cf8e7b3c3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -543,8 +543,9 @@ static bool scsi_end_request(struct request *req, blk_status_t error, if (blk_update_request(req, error, bytes)) return true; + // XXX: if (blk_queue_add_random(q)) - add_disk_randomness(req->rq_disk); + add_disk_randomness(req->q->disk); if (!blk_rq_is_passthrough(req)) { WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); diff --git a/drivers/scsi/scsi_logging.c b/drivers/scsi/scsi_logging.c index ed9572252a42..1f8f80b2dbfc 100644 --- a/drivers/scsi/scsi_logging.c +++ b/drivers/scsi/scsi_logging.c @@ -30,7 +30,9 @@ static inline const char *scmd_name(const struct scsi_cmnd *scmd) { struct request *rq = scsi_cmd_to_rq((struct scsi_cmnd *)scmd); - return rq->rq_disk ? rq->rq_disk->disk_name : NULL; + if (!rq->q->disk) + return NULL; + return rq->q->disk->disk_name; } static size_t sdev_format_header(char *logbuf, size_t logbuf_len, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index bba1f5dafd38..8181857ddf53 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -872,7 +872,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int data_len = 24; @@ -908,7 +908,7 @@ static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; @@ -940,7 +940,7 @@ static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; @@ -971,7 +971,7 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); @@ -1068,7 +1068,7 @@ static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); struct bio *bio = rq->bio; u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); @@ -1116,7 +1116,7 @@ static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); /* flush requests don't perform I/O, zero the S/G table */ memset(&cmd->sdb, 0, sizeof(cmd->sdb)); @@ -1215,7 
+1215,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq)); sector_t threshold; unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); @@ -1236,7 +1236,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) goto fail; } - if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->rq_disk)) { + if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->q->disk)) { scmd_printk(KERN_ERR, cmd, "access beyond end of device\n"); goto fail; } @@ -1331,7 +1331,7 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd) switch (req_op(rq)) { case REQ_OP_DISCARD: - switch (scsi_disk(rq->rq_disk)->provisioning_mode) { + switch (scsi_disk(rq->q->disk)->provisioning_mode) { case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: @@ -1917,7 +1917,7 @@ static const struct block_device_operations sd_fops = { **/ static void sd_eh_reset(struct scsi_cmnd *scmd) { - struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->rq_disk); + struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); /* New SCSI EH run, reset gate variable */ sdkp->ignore_medium_access_errors = false; @@ -1937,7 +1937,7 @@ static void sd_eh_reset(struct scsi_cmnd *scmd) **/ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { - struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->rq_disk); + struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); struct scsi_device *sdev = scmd->device; if (!scsi_device_online(sdev) || @@ -2034,7 +2034,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) unsigned int resid; struct scsi_sense_hdr sshdr; struct request *req = scsi_cmd_to_rq(SCpnt); - struct scsi_disk *sdkp = scsi_disk(req->rq_disk); + struct scsi_disk *sdkp = scsi_disk(req->q->disk); int sense_valid = 0; int sense_deferred = 0; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index ed06798983f8..65bfd1e170da 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -244,7 +244,7 @@ out: static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t sector = blk_rq_pos(rq); if (!sd_is_zoned(sdkp)) @@ -322,7 +322,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, unsigned int nr_blocks) { struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); unsigned int wp_offset, zno = blk_rq_zone_no(rq); unsigned long flags; blk_status_t ret; @@ -388,7 +388,7 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, { struct request *rq = scsi_cmd_to_rq(cmd); sector_t sector = blk_rq_pos(rq); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t block = sectors_to_logical(sdkp->device, sector); blk_status_t ret; @@ -443,7 +443,7 @@ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, { int result = cmd->result; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); unsigned int zno = blk_rq_zone_no(rq); enum req_opf op = req_op(rq); unsigned long flags; diff --git 
a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 411e2b01966e..7db595c08b20 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -335,7 +335,7 @@ static int sr_done(struct scsi_cmnd *SCpnt) int block_sectors = 0; long error_sector; struct request *rq = scsi_cmd_to_rq(SCpnt); - struct scsi_cd *cd = scsi_cd(rq->rq_disk); + struct scsi_cd *cd = scsi_cd(rq->q->disk); #ifdef DEBUG scmd_printk(KERN_INFO, SCpnt, "done: %x\n", result); @@ -402,7 +402,7 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) ret = scsi_alloc_sgtables(SCpnt); if (ret != BLK_STS_OK) return ret; - cd = scsi_cd(rq->rq_disk); + cd = scsi_cd(rq->q->disk); SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, "Doing sr request, block = %d\n", block)); diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 28e1d98ae102..65c642b24ecf 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -528,7 +528,7 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev, if (!rq || !scsi_prot_sg_count(sc)) return; - bi = blk_get_integrity(rq->rq_disk); + bi = blk_get_integrity(rq->q->disk); if (sc->sc_data_direction == DMA_TO_DEVICE) cmd_pi->pi_bytesout = cpu_to_virtio32(vdev, diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index 4c5a0a49035f..1928b3918242 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -551,7 +551,7 @@ static void last_sector_hacks(struct us_data *us, struct scsi_cmnd *srb) /* Did this command access the last sector? */ sector = (srb->cmnd[2] << 24) | (srb->cmnd[3] << 16) | (srb->cmnd[4] << 8) | (srb->cmnd[5]); - disk = scsi_cmd_to_rq(srb)->rq_disk; + disk = scsi_cmd_to_rq(srb)->q->disk; if (!disk) goto done; sdkp = scsi_disk(disk); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d952c3442261..ede7bef8880a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -100,7 +100,6 @@ struct request { struct request *rq_next; }; - struct gendisk *rq_disk; struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. 
*/ @@ -890,9 +889,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); - - if (bio->bi_bdev) - rq->rq_disk = bio->bi_bdev->bd_disk; } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 477a800a9543..6794d7322cbd 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -164,7 +164,7 @@ static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); - return *(struct scsi_driver **)rq->rq_disk->private_data; + return *(struct scsi_driver **)rq->q->disk->private_data; } void scsi_done(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index d1c6fc83b1e3..ab7557d84f75 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -275,9 +275,9 @@ scmd_printk(const char *, const struct scsi_cmnd *, const char *, ...); do { \ struct request *__rq = scsi_cmd_to_rq((scmd)); \ \ - if (__rq->rq_disk) \ sdev_dbg((scmd)->device, "[%s] " fmt, \ - __rq->rq_disk->disk_name, ##a); \ + if (__rq->q->disk) \ sdev_dbg((scmd)->device, "[%s] " fmt, \ + __rq->q->disk->disk_name, ##a); \ else \ sdev_dbg((scmd)->device, fmt, ##a); \ } while (0) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index a95daa4d4caa..27170e40e8c9 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -85,7 +85,7 @@ TRACE_EVENT(block_rq_requeue, ), TP_fast_assign( - __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); @@ -128,7 +128,7 @@ TRACE_EVENT(block_rq_complete, ), TP_fast_assign( - __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); @@ -161,7 +161,7 @@ DECLARE_EVENT_CLASS(block_rq, ), TP_fast_assign( - __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); @@ -512,7 +512,7 @@ TRACE_EVENT(block_rq_remap, ), TP_fast_assign( - __entry->dev = disk_devt(rq->rq_disk); + __entry->dev = disk_devt(rq->q->disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1183c88634aa..431e41bc4c23 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1045,7 +1045,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, } r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.device_to = cpu_to_be32(disk_devt(rq->q->disk)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), From b84ba30b6c7a75babdf73b83bc3c7b59b944501a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 13:18:01 +0100 Subject: [PATCH 071/132] block: remove the gendisk argument to blk_execute_rq Remove the gendisk argument to blk_execute_rq and blk_execute_rq_nowait given that it is unused now.
Also convert the boolean at_head parameter to actually use the bool type while touching the prototype. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Link: https://lore.kernel.org/r/20211126121802.2090656-5-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 10 +++------- block/bsg-lib.c | 2 +- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/paride/pd.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/sx8.c | 4 ++-- drivers/block/virtio_blk.c | 2 +- drivers/mmc/core/block.c | 10 +++++----- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/pci.c | 7 +++---- drivers/nvme/target/passthru.c | 3 +-- drivers/scsi/scsi_bsg.c | 2 +- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/scsi_ioctl.c | 4 ++-- drivers/scsi/scsi_lib.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/scsi/sr.c | 2 +- drivers/scsi/st.c | 2 +- drivers/scsi/ufs/ufshpb.c | 4 ++-- drivers/target/target_core_pscsi.c | 2 +- include/linux/blk-mq.h | 7 +++---- 21 files changed, 35 insertions(+), 42 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f1abfd2e24f7..ecfc47fad236 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1153,7 +1153,6 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution - * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue * @done: I/O completion handler @@ -1165,8 +1164,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) * Note: * This function will invoke @done directly if the queue is dead. */ -void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, - int at_head, rq_end_io_fn *done) +void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done) { WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -1204,7 +1202,6 @@ static void blk_rq_poll_completion(struct request *rq, struct completion *wait) /** * blk_execute_rq - insert a request into queue for execution - * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue * @@ -1213,14 +1210,13 @@ static void blk_rq_poll_completion(struct request *rq, struct completion *wait) * for execution and wait for completion. * Return: The blk_status_t result provided to blk_mq_end_request(). 
*/ -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, - int at_head) +blk_status_t blk_execute_rq(struct request *rq, bool at_head) { DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; rq->end_io_data = &wait; - blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); + blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 10aa378702fa..acfe1357bf6c 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -92,7 +92,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, goto out_unmap_bidi_rq; bio = rq->bio; - blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); /* * The assignments below don't make much sense, but are kept for diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index c91b9010c1a6..30f471021a40 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1015,7 +1015,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, rq->timeout = timeout; /* insert request and run queue */ - blk_execute_rq(NULL, rq, true); + blk_execute_rq(rq, true); if (int_cmd->status) { dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n", diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 4f8cce510562..3637c38c72f9 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -781,7 +781,7 @@ static int pd_special_command(struct pd_unit *disk, req = blk_mq_rq_to_pdu(rq); req->func = func; - blk_execute_rq(disk->gd, rq, 0); + blk_execute_rq(rq, false); blk_mq_free_request(rq); return 0; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3af0499857ec..887c98d61684 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -722,7 +722,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * if (cgc->quiet) rq->rq_flags |= RQF_QUIET; - blk_execute_rq(pd->bdev->bd_disk, rq, 0); + blk_execute_rq(rq, false); if (scsi_req(rq)->result) ret = -EIO; out: diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index d1676fe0da1a..b361583944b9 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -540,7 +540,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(NULL, rq, true, NULL); + blk_execute_rq_nowait(rq, true, NULL); return 0; @@ -579,7 +579,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(NULL, rq, true, NULL); + blk_execute_rq_nowait(rq, true, NULL); return 0; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index cfa303fa7318..c3dc3cd7a779 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -384,7 +384,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) if (err) goto out; - blk_execute_rq(vblk->disk, req, false); + blk_execute_rq(req, false); err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req))); out: blk_mq_free_request(req); diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index dc094f73d335..ef8f45fa2cee 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c 
@@ -264,7 +264,7 @@ static ssize_t power_ro_lock_store(struct device *dev, goto out_put; } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); ret = req_to_mmc_queue_req(req)->drv_op_result; blk_mq_free_request(req); @@ -657,7 +657,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idatas; req_to_mmc_queue_req(req)->ioc_count = 1; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata); blk_mq_free_request(req); @@ -726,7 +726,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idata; req_to_mmc_queue_req(req)->ioc_count = num_of_cmds; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; /* copy to user if data and response */ @@ -2743,7 +2743,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val) if (IS_ERR(req)) return PTR_ERR(req); req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); ret = req_to_mmc_queue_req(req)->drv_op_result; if (ret >= 0) { *val = ret; @@ -2782,7 +2782,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp) } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD; req_to_mmc_queue_req(req)->drv_op_data = &ext_csd; - blk_execute_rq(NULL, req, 0); + blk_execute_rq(req, false); err = req_to_mmc_queue_req(req)->drv_op_result; blk_mq_free_request(req); if (err) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4c63564adeaa..f82c098b1a61 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1056,7 +1056,7 @@ static int nvme_execute_rq(struct gendisk *disk, struct request *rq, { blk_status_t status; - status = blk_execute_rq(disk, rq, at_head); + status = blk_execute_rq(rq, at_head); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) return -EINTR; if (nvme_req(rq)->status) @@ -1283,7 +1283,7 @@ static void nvme_keep_alive_work(struct work_struct *work) rq->timeout = ctrl->kato * HZ; rq->end_io_data = ctrl; - blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io); + blk_execute_rq_nowait(rq, false, nvme_keep_alive_end_io); } static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ca2ee806d74b..8637538f3fd5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1371,7 +1371,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) } abort_req->end_io_data = NULL; - blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio); + blk_execute_rq_nowait(abort_req, false, abort_endio); /* * The aborted req will be completed on receiving the abort req. @@ -2416,9 +2416,8 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); - blk_execute_rq_nowait(NULL, req, false, - opcode == nvme_admin_delete_cq ? - nvme_del_cq_end : nvme_del_queue_end); + blk_execute_rq_nowait(req, false, opcode == nvme_admin_delete_cq ? 
+ nvme_del_cq_end : nvme_del_queue_end); return 0; } diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index f0efb3537989..9e5b89ae29df 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -284,8 +284,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) schedule_work(&req->p.work); } else { rq->end_io_data = req; - blk_execute_rq_nowait(ns ? ns->disk : NULL, rq, 0, - nvmet_passthru_req_done); + blk_execute_rq_nowait(rq, false, nvmet_passthru_req_done); } if (ns) diff --git a/drivers/scsi/scsi_bsg.c b/drivers/scsi/scsi_bsg.c index 081b84bb7985..b7a464383cc0 100644 --- a/drivers/scsi/scsi_bsg.c +++ b/drivers/scsi/scsi_bsg.c @@ -60,7 +60,7 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, goto out_free_cmd; bio = rq->bio; - blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); /* * fill in all the output members diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 2371edbc3af4..3eae2392ef15 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2040,7 +2040,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->timeout = 10 * HZ; rq->retries = 5; - blk_execute_rq_nowait(NULL, req, 1, eh_lock_door_done); + blk_execute_rq_nowait(req, true, eh_lock_door_done); } /** diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index 400df3354cd6..340ba0ad6e70 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -483,7 +483,7 @@ static int sg_io(struct scsi_device *sdev, struct gendisk *disk, start_time = jiffies; - blk_execute_rq(disk, rq, at_head); + blk_execute_rq(rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); @@ -620,7 +620,7 @@ static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, goto error; } - blk_execute_rq(disk, rq, 0); + blk_execute_rq(rq, false); err = req->result & 0xff; /* only 8 bit SCSI status */ if (err) { diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index c23cf8e7b3c3..35e381f6d371 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -241,7 +241,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, /* * head injection *required* here otherwise quiesce won't work */ - blk_execute_rq(NULL, req, 1); + blk_execute_rq(req, true); /* * Some devices (USB mass-storage in particular) may transfer diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 141099ab9092..6c8ffad88f9e 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -833,7 +833,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, srp->rq->timeout = timeout; kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). 
*/ - blk_execute_rq_nowait(NULL, srp->rq, at_head, sg_rq_end_io); + blk_execute_rq_nowait(srp->rq, at_head, sg_rq_end_io); return 0; } diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 7db595c08b20..c29589815468 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -994,7 +994,7 @@ static int sr_read_cdda_bpc(struct cdrom_device_info *cdi, void __user *ubuf, rq->timeout = 60 * HZ; bio = rq->bio; - blk_execute_rq(disk, rq, 0); + blk_execute_rq(rq, false); if (scsi_req(rq)->result) { struct scsi_sense_hdr sshdr; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index c2d5608f6b1a..06acc7db7755 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -581,7 +581,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, rq->retries = retries; req->end_io_data = SRpnt; - blk_execute_rq_nowait(NULL, req, 1, st_scsi_execute_end); + blk_execute_rq_nowait(req, true, st_scsi_execute_end); return 0; } diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/scsi/ufs/ufshpb.c index ded5ba9b1466..13cd21204bf9 100644 --- a/drivers/scsi/ufs/ufshpb.c +++ b/drivers/scsi/ufs/ufshpb.c @@ -677,7 +677,7 @@ static void ufshpb_execute_umap_req(struct ufshpb_lu *hpb, ufshpb_set_unmap_cmd(rq->cmd, rgn); rq->cmd_len = HPB_WRITE_BUFFER_CMD_LENGTH; - blk_execute_rq_nowait(NULL, req, 1, ufshpb_umap_req_compl_fn); + blk_execute_rq_nowait(req, true, ufshpb_umap_req_compl_fn); hpb->stats.umap_req_cnt++; } @@ -719,7 +719,7 @@ static int ufshpb_execute_map_req(struct ufshpb_lu *hpb, map_req->rb.srgn_idx, mem_size); rq->cmd_len = HPB_READ_BUFFER_CMD_LENGTH; - blk_execute_rq_nowait(NULL, req, 1, ufshpb_map_req_compl_fn); + blk_execute_rq_nowait(req, true, ufshpb_map_req_compl_fn); hpb->stats.map_req_cnt++; return 0; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 7fa57fb57bf2..807d06ecadee 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1005,7 +1005,7 @@ pscsi_execute_cmd(struct se_cmd *cmd) req->timeout = PS_TIMEOUT_OTHER; scsi_req(req)->retries = PS_RETRY; - blk_execute_rq_nowait(NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG), + blk_execute_rq_nowait(req, cmd->sam_task_attr == TCM_HEAD_TAG, pscsi_req_done); return 0; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ede7bef8880a..1b87b7c8bbff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -924,10 +924,9 @@ int blk_rq_unmap_user(struct bio *); int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); int blk_rq_append_bio(struct request *rq, struct bio *bio); -void blk_execute_rq_nowait(struct gendisk *, struct request *, int, - rq_end_io_fn *); -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, - int at_head); +void blk_execute_rq_nowait(struct request *rq, bool at_head, + rq_end_io_fn *end_io); +blk_status_t blk_execute_rq(struct request *rq, bool at_head); struct req_iterator { struct bvec_iter iter; From a30e3441325ba4011ddf125932cda21ca820c0bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 26 Nov 2021 13:18:02 +0100 Subject: [PATCH 072/132] scsi: remove the gendisk argument to scsi_ioctl Now that blk_execute_rq does not take a gendisk argument there is no need to pass it through the scsi_ioctl callchain either. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Reviewed-by: Martin K. 
Petersen <martin.petersen@oracle.com> Link: https://lore.kernel.org/r/20211126121802.2090656-6-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/scsi/ch.c | 2 +- drivers/scsi/scsi_ioctl.c | 39 +++++++++++++++------------------------ drivers/scsi/sd.c | 2 +- drivers/scsi/sg.c | 4 ++-- drivers/scsi/sr.c | 5 ++--- drivers/scsi/st.c | 2 +- include/scsi/scsi_ioctl.h | 4 ++-- 7 files changed, 24 insertions(+), 34 deletions(-) diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c index 27012908b586..6fa300daa31e 100644 --- a/drivers/scsi/ch.c +++ b/drivers/scsi/ch.c @@ -877,7 +877,7 @@ static long ch_ioctl(struct file *file, } default: - return scsi_ioctl(ch->device, NULL, file->f_mode, cmd, argp); + return scsi_ioctl(ch->device, file->f_mode, cmd, argp); } } diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index 340ba0ad6e70..e13fd380deb6 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -408,8 +408,7 @@ static int scsi_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, return ret; } -static int sg_io(struct scsi_device *sdev, struct gendisk *disk, - struct sg_io_hdr *hdr, fmode_t mode) +static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, fmode_t mode) { unsigned long start_time; ssize_t ret = 0; @@ -499,19 +498,12 @@ out_put_request: /** * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl * @q: request queue to send scsi commands down - * @disk: gendisk to operate on (option) * @mode: mode used to open the file through which the ioctl has been * submitted * @sic: userspace structure describing the command to perform * * Send down the scsi command described by @sic to the device below - * the request queue @q. If @file is non-NULL it's used to perform - * fine-grained permission checks that allow users to send down - * non-destructive SCSI commands. If the caller has a struct gendisk - * available it should be passed in as @disk to allow the low level - * driver to use the information contained in it. A non-NULL @disk - * is only allowed if the caller knows that the low level driver doesn't - * need it (e.g. in the scsi subsystem). + * the request queue @q. * * Notes: * - This interface is deprecated - users should use the SG_IO @@ -530,8 +522,8 @@ out_put_request: * Positive numbers returned are the compacted SCSI error codes (4 * bytes in one int) where the lowest byte is the SCSI status. 
*/ -static int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, - fmode_t mode, struct scsi_ioctl_command __user *sic) +static int sg_scsi_ioctl(struct request_queue *q, fmode_t mode, + struct scsi_ioctl_command __user *sic) { enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */ struct request *rq; @@ -806,8 +798,8 @@ static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc, return 0; } -static int scsi_cdrom_send_packet(struct scsi_device *sdev, struct gendisk *disk, - fmode_t mode, void __user *arg) +static int scsi_cdrom_send_packet(struct scsi_device *sdev, fmode_t mode, + void __user *arg) { struct cdrom_generic_command cgc; struct sg_io_hdr hdr; @@ -847,7 +839,7 @@ static int scsi_cdrom_send_packet(struct scsi_device *sdev, struct gendisk *disk hdr.cmdp = ((struct cdrom_generic_command __user *) arg)->cmd; hdr.cmd_len = sizeof(cgc.cmd); - err = sg_io(sdev, disk, &hdr, mode); + err = sg_io(sdev, &hdr, mode); if (err == -EFAULT) return -EFAULT; @@ -862,8 +854,8 @@ static int scsi_cdrom_send_packet(struct scsi_device *sdev, struct gendisk *disk return err; } -static int scsi_ioctl_sg_io(struct scsi_device *sdev, struct gendisk *disk, - fmode_t mode, void __user *argp) +static int scsi_ioctl_sg_io(struct scsi_device *sdev, fmode_t mode, + void __user *argp) { struct sg_io_hdr hdr; int error; @@ -871,7 +863,7 @@ static int scsi_ioctl_sg_io(struct scsi_device *sdev, struct gendisk *disk, error = get_sg_io_hdr(&hdr, argp); if (error) return error; - error = sg_io(sdev, disk, &hdr, mode); + error = sg_io(sdev, &hdr, mode); if (error == -EFAULT) return error; if (put_sg_io_hdr(&hdr, argp)) @@ -882,7 +874,6 @@ static int scsi_ioctl_sg_io(struct scsi_device *sdev, struct gendisk *disk, /** * scsi_ioctl - Dispatch ioctl to scsi device * @sdev: scsi device receiving ioctl - * @disk: disk receiving the ioctl * @mode: mode the block/char device is opened with * @cmd: which ioctl is it * @arg: data associated with ioctl @@ -891,8 +882,8 @@ static int scsi_ioctl_sg_io(struct scsi_device *sdev, struct gendisk *disk, * does not take a major/minor number as the dev field. Rather, it takes * a pointer to a &struct scsi_device. 
*/ -int scsi_ioctl(struct scsi_device *sdev, struct gendisk *disk, fmode_t mode, - int cmd, void __user *arg) +int scsi_ioctl(struct scsi_device *sdev, fmode_t mode, int cmd, + void __user *arg) { struct request_queue *q = sdev->request_queue; struct scsi_sense_hdr sense_hdr; @@ -927,11 +918,11 @@ int scsi_ioctl(struct scsi_device *sdev, struct gendisk *disk, fmode_t mode, case SG_EMULATED_HOST: return sg_emulated_host(q, arg); case SG_IO: - return scsi_ioctl_sg_io(sdev, disk, mode, arg); + return scsi_ioctl_sg_io(sdev, mode, arg); case SCSI_IOCTL_SEND_COMMAND: - return sg_scsi_ioctl(q, disk, mode, arg); + return sg_scsi_ioctl(q, mode, arg); case CDROM_SEND_PACKET: - return scsi_cdrom_send_packet(sdev, disk, mode, arg); + return scsi_cdrom_send_packet(sdev, mode, arg); case CDROMCLOSETRAY: return scsi_send_start_stop(sdev, 3); case CDROMEJECT: diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 8181857ddf53..5ddb8e053a8e 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1574,7 +1574,7 @@ static int sd_ioctl(struct block_device *bdev, fmode_t mode, if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); - return scsi_ioctl(sdp, disk, mode, cmd, p); + return scsi_ioctl(sdp, mode, cmd, p); } static void set_media_not_present(struct scsi_disk *sdkp) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 6c8ffad88f9e..ad12b3261845 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1109,7 +1109,7 @@ sg_ioctl_common(struct file *filp, Sg_device *sdp, Sg_fd *sfp, case SCSI_IOCTL_SEND_COMMAND: if (atomic_read(&sdp->detaching)) return -ENODEV; - return scsi_ioctl(sdp->device, NULL, filp->f_mode, cmd_in, p); + return scsi_ioctl(sdp->device, filp->f_mode, cmd_in, p); case SG_SET_DEBUG: result = get_user(val, ip); if (result) @@ -1165,7 +1165,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) ret = sg_ioctl_common(filp, sdp, sfp, cmd_in, p); if (ret != -ENOIOCTLCMD) return ret; - return scsi_ioctl(sdp->device, NULL, filp->f_mode, cmd_in, p); + return scsi_ioctl(sdp->device, filp->f_mode, cmd_in, p); } static __poll_t diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index c29589815468..14c122839c40 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -561,8 +561,7 @@ static void sr_block_release(struct gendisk *disk, fmode_t mode) static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { - struct gendisk *disk = bdev->bd_disk; - struct scsi_cd *cd = scsi_cd(disk); + struct scsi_cd *cd = scsi_cd(bdev->bd_disk); struct scsi_device *sdev = cd->device; void __user *argp = (void __user *)arg; int ret; @@ -584,7 +583,7 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (ret != -ENOSYS) goto put; } - ret = scsi_ioctl(sdev, disk, mode, cmd, argp); + ret = scsi_ioctl(sdev, mode, cmd, argp); put: scsi_autopm_put_device(sdev); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 06acc7db7755..e869e90e05af 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3829,7 +3829,7 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) break; } - retval = scsi_ioctl(STp->device, NULL, file->f_mode, cmd_in, p); + retval = scsi_ioctl(STp->device, file->f_mode, cmd_in, p); if (!retval && cmd_in == SCSI_IOCTL_STOP_UNIT) { /* unload */ STp->rew_at_close = 0; diff --git a/include/scsi/scsi_ioctl.h b/include/scsi/scsi_ioctl.h index d2cb9aeaf1f1..beac64e38b87 100644 --- a/include/scsi/scsi_ioctl.h +++ b/include/scsi/scsi_ioctl.h @@ -45,8 +45,8 @@ typedef 
struct scsi_fctargaddress { int scsi_ioctl_block_when_processing_errors(struct scsi_device *sdev, int cmd, bool ndelay); -int scsi_ioctl(struct scsi_device *sdev, struct gendisk *disk, fmode_t mode, - int cmd, void __user *arg); +int scsi_ioctl(struct scsi_device *sdev, fmode_t mode, int cmd, + void __user *arg); int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp); int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp); bool scsi_cmd_allowed(unsigned char *cmd, fmode_t mode); From 8a7518931baa8ea023700987f3db31cb0a80610b Mon Sep 17 00:00:00 2001 From: Ye Bin <yebin10@huawei.com> Date: Mon, 29 Nov 2021 09:26:59 +0800 Subject: [PATCH 073/132] block: Fix fsync always failed if once failed We ran tests with error fault injection on a v4.19 based kernel, and after some time we found that sync /dev/sda always failed. [root@localhost] sync /dev/sda sync: error syncing '/dev/sda': Input/output error The scsi log is as follows: [19069.812296] sd 0:0:0:0: [sda] tag#64 Send: scmd 0x00000000d03a0b6b [19069.812302] sd 0:0:0:0: [sda] tag#64 CDB: Synchronize Cache(10) 35 00 00 00 00 00 00 00 00 00 [19069.812533] sd 0:0:0:0: [sda] tag#64 Done: SUCCESS Result: hostbyte=DID_OK driverbyte=DRIVER_OK [19069.812536] sd 0:0:0:0: [sda] tag#64 CDB: Synchronize Cache(10) 35 00 00 00 00 00 00 00 00 00 [19069.812539] sd 0:0:0:0: [sda] tag#64 scsi host busy 1 failed 0 [19069.812542] sd 0:0:0:0: Notifying upper driver of completion (result 0) [19069.812546] sd 0:0:0:0: [sda] tag#64 sd_done: completed 0 of 0 bytes [19069.812549] sd 0:0:0:0: [sda] tag#64 0 sectors total, 0 bytes done. [19069.812564] print_req_error: I/O error, dev sda, sector 0 The ftrace log is as follows: rep-306069 [007] .... 19654.923315: block_bio_queue: 8,0 FWS 0 + 0 [rep] rep-306069 [007] .... 19654.923333: block_getrq: 8,0 FWS 0 + 0 [rep] kworker/7:1H-250 [007] .... 19654.923352: block_rq_issue: 8,0 FF 0 () 0 + 0 [kworker/7:1H] <idle>-0 [007] ..s. 19654.923562: block_rq_complete: 8,0 FF () 18446744073709551615 + 0 [0] <idle>-0 [007] d.s. 19654.923576: block_rq_complete: 8,0 WS () 0 + 0 [-5] Commit 8d6996630c03 introduced 'fq->rq_status', which is only updated while the 'flush_rq' reference count is non-zero. Once a flush request fails and an error code is recorded in 'fq->rq_status', there may be no further chance to update 'fq->rq_status', so every subsequent fsync fails as well. To address this issue, reset 'fq->rq_status' to BLK_STS_OK after the error code has been returned to the upper layer. Fixes: 8d6996630c03 ("block: fix null pointer dereference in blk_mq_rq_timed_out()") Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20211129012659.1553733-1-yebin10@huawei.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-flush.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index fd5187a0898d..f78bb39e589e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -242,8 +242,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) * avoiding use-after-free. 
*/ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); - if (fq->rq_status != BLK_STS_OK) + if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; + fq->rq_status = BLK_STS_OK; + } if (!q->elevator) { flush_rq->tag = BLK_MQ_NO_TAG; From 18d78171c061889a9a43152f60d6a27a10fc7656 Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Thu, 2 Dec 2021 17:07:16 +0800 Subject: [PATCH 074/132] blk-mq: check q->poll_stat in queue_poll_stat_show Without checking q->poll_stat in queue_poll_stat_show(), a kernel panic may be caused if q->poll_stat isn't allocated. Fixes: 48b5c1fbcd8c ("block: only allocate poll_stats if there's a user of them") Signed-off-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20211202090716.3292244-1-ming.lei@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq-debugfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f27dca3a45e..3a790eb4995c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -30,6 +30,9 @@ static int queue_poll_stat_show(void *data, struct seq_file *m) struct request_queue *q = data; int bucket; + if (!q->poll_stat) + return 0; + for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); print_stat(m, &q->poll_stat[2 * bucket]); From 373b5416b4b03ebda5d8f0605b81eff0dc76ebcf Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Thu, 2 Dec 2021 12:42:58 -0700 Subject: [PATCH 075/132] block: get rid of useless goto and label in blk_mq_get_new_requests() The expected case is returning a request, so just check for success and return the request rather than having an error label. Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ecfc47fad236..ca33cb755c5f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2720,11 +2720,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, } rq = __blk_mq_alloc_requests(&data); - if (!rq) - goto fail; - return rq; - -fail: + if (rq) + return rq; rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); From a08ed9aae8a3d2321ef378d6581cc87a3fb75b44 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Thu, 2 Dec 2021 12:43:46 -0700 Subject: [PATCH 076/132] block: fix double bio queue when merging in cached request path When we attempt to merge off the cached request path, we return NULL if successful. This makes the caller believe that it should allocate a new request, and hence we end up with the bio both merged and associated with a new request. This, predictably, leads to all sorts of crashes. Pass in a pointer to the bio pointer, and clear it for the merge case. Then the caller knows that the bio is already queued, and no new requests need to get allocated.
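To make the new calling convention concrete, here is a minimal stand-alone sketch of the contract (the struct and function names are invented for illustration and this is not kernel code): when the callee consumes the bio by merging it, it clears the caller's bio pointer, so a NULL return no longer implies that a new request must be allocated.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct bio { int opf; };
struct request { int tag; };

static struct request cached_rq = { .tag = 1 };

/*
 * Return a cached request, or NULL.  When the bio was consumed by a merge,
 * also clear the caller's bio pointer so the caller can tell the two NULL
 * cases apart.
 */
static struct request *get_cached_request(struct bio **bio, bool merged)
{
	if (merged) {
		*bio = NULL;	/* bio is already queued elsewhere */
		return NULL;
	}
	return &cached_rq;
}

int main(void)
{
	struct bio b = { 0 };
	struct bio *bio = &b;
	struct request *rq = get_cached_request(&bio, true);

	if (!rq) {
		if (!bio) {
			puts("bio merged, do not allocate a request");
			return 0;
		}
		puts("allocate a new request for this bio");
	}
	return 0;
}
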
Fixes: 5b13bc8a3fd5 ("blk-mq: cleanup request allocation") Reviewed-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ca33cb755c5f..fc4520e992b1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2731,7 +2731,7 @@ queue_exit: } static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio *bio, unsigned int nsegs) + struct blk_plug *plug, struct bio **bio, unsigned int nsegs) { struct request *rq; @@ -2741,19 +2741,21 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!rq || rq->q != q) return NULL; - if (unlikely(!submit_bio_checks(bio))) + if (unlikely(!submit_bio_checks(*bio))) return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { + *bio = NULL; return NULL; - if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) + } + if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) return NULL; - rq->cmd_flags = bio->bi_opf; + rq->cmd_flags = (*bio)->bi_opf; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); - rq_qos_throttle(q, bio); + rq_qos_throttle(q, *bio); return rq; } @@ -2789,8 +2791,10 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - rq = blk_mq_get_cached_request(q, plug, bio, nr_segs); + rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); if (!rq) { + if (!bio) + return; rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) return; From 4bdcd1dd4d2f973b1a89fb20ba720d879e9e506b Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Thu, 28 Oct 2021 08:47:05 -0600 Subject: [PATCH 077/132] mm: move filemap_range_needs_writeback() into header No functional changes in this patch, just in preparation for efficiently calling this light function from the block O_DIRECT handling. 
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- fs/iomap/direct-io.c | 1 + include/linux/fs.h | 2 -- include/linux/pagemap.h | 29 +++++++++++++++++++++++++++++ mm/filemap.c | 32 +++----------------------------- 4 files changed, 33 insertions(+), 31 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b4dc51063d36..03ea367df19a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/iomap.h> #include <linux/backing-dev.h> #include <linux/uio.h> diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..6b8dc1a78df6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2847,8 +2847,6 @@ static inline int filemap_fdatawait(struct address_space *mapping) extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); -extern bool filemap_range_needs_writeback(struct address_space *, - loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 605246452305..274a0710f2c5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -963,6 +963,35 @@ static inline int add_to_page_cache(struct page *page, int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp); +bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte); + +/** + * filemap_range_needs_writeback - check if range potentially needs writeback + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. Used by O_DIRECT + * read/write with IOCB_NOWAIT, to see if the caller needs to do + * filemap_write_and_wait_range() before proceeding. + * + * Return: %true if the caller should do filemap_write_and_wait_range() before + * doing O_DIRECT to a page in this range, %false otherwise. + */ +static inline bool filemap_range_needs_writeback(struct address_space *mapping, + loff_t start_byte, + loff_t end_byte) +{ + if (!mapping->nrpages) + return false; + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + return false; + return filemap_range_has_writeback(mapping, start_byte, end_byte); +} + /** * struct readahead_control - Describes a readahead request. 
* diff --git a/mm/filemap.c b/mm/filemap.c index daa0e23a6ee6..655c9eec06b3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -646,8 +646,8 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } -static bool filemap_range_has_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) +bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; @@ -667,34 +667,8 @@ static bool filemap_range_has_writeback(struct address_space *mapping, } rcu_read_unlock(); return page != NULL; - } - -/** - * filemap_range_needs_writeback - check if range potentially needs writeback - * @mapping: address space within which to check - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Find at least one page in the range supplied, usually used to check if - * direct writing in this range will trigger a writeback. Used by O_DIRECT - * read/write with IOCB_NOWAIT, to see if the caller needs to do - * filemap_write_and_wait_range() before proceeding. - * - * Return: %true if the caller should do filemap_write_and_wait_range() before - * doing O_DIRECT to a page in this range, %false otherwise. - */ -bool filemap_range_needs_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) -{ - if (!mapping_needs_writeback(mapping)) - return false; - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - return false; - return filemap_range_has_writeback(mapping, start_byte, end_byte); -} -EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); +EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range From ceaa762527f41a431b552bc000de4b626d2d8cb7 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Thu, 28 Oct 2021 08:57:09 -0600 Subject: [PATCH 078/132] block: move direct_IO into our own read_iter handler Don't call into generic_file_read_iter() if we know it's O_DIRECT, just set it up ourselves and call our own handler. This avoids an indirect call for O_DIRECT. Fall back to filemap_read() if we fail. 
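From userspace the change is only visible through the existing O_DIRECT semantics. As a rough illustration (the device path, buffer size, and error handling below are placeholders, not part of this patch), an O_DIRECT read issued with RWF_NOWAIT is expected to fail with EAGAIN while the range still needs writeback, and otherwise to go straight to blkdev_direct_IO():

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "/dev/sda";	/* placeholder */
	int fd = open(dev, O_RDONLY | O_DIRECT);
	struct iovec iov;
	void *buf;
	ssize_t ret;

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	iov.iov_base = buf;
	iov.iov_len = 4096;

	/*
	 * RWF_NOWAIT: return EAGAIN instead of blocking, e.g. while the
	 * range still has dirty page cache that needs writeback.
	 */
	ret = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (ret < 0)
		printf("nowait O_DIRECT read: %s\n", strerror(errno));
	else
		printf("read %zd bytes via O_DIRECT\n", ret);

	close(fd);
	free(buf);
	return 0;
}
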
Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/fops.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/block/fops.c b/block/fops.c index ad732a36f9b3..4d0e220f379e 100644 --- a/block/fops.c +++ b/block/fops.c @@ -566,21 +566,48 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct block_device *bdev = iocb->ki_filp->private_data; loff_t size = bdev_nr_bytes(bdev); + size_t count = iov_iter_count(to); loff_t pos = iocb->ki_pos; size_t shorted = 0; - ssize_t ret; + ssize_t ret = 0; - if (unlikely(pos + iov_iter_count(to) > size)) { + if (unlikely(pos + count > size)) { if (pos >= size) return 0; size -= pos; - if (iov_iter_count(to) > size) { - shorted = iov_iter_count(to) - size; + if (count > size) { + shorted = count - size; iov_iter_truncate(to, size); } } - ret = generic_file_read_iter(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) + return -EAGAIN; + } else { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + return ret; + } + + file_accessed(iocb->ki_filp); + + ret = blkdev_direct_IO(iocb, to); + if (ret >= 0) { + iocb->ki_pos += ret; + count -= ret; + } + if (ret < 0 || !count) + return ret; + } + + ret = filemap_read(iocb, to, ret); if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); From 0a467d0fdd9594fbb449ebc93852533332c528fd Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Thu, 14 Oct 2021 14:39:59 -0600 Subject: [PATCH 079/132] block: switch to atomic_t for request references refcount_t is not as expensive as it used to be, but it's still more expensive than the io_uring method of using atomic_t and just checking for potential over/underflow. This borrows that same implementation, which in turn is based on the mm implementation from Linus. 
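The borrowed over/underflow check works by treating the counter as unsigned, so both 0 (one put too many) and values just below UINT_MAX (a wrapped counter) fall into a single range test. A stand-alone C11 illustration of the idea, independent of the kernel helpers added below and with names invented for the example, looks like this:

#include <stdatomic.h>
#include <stdio.h>

/*
 * True if the counter is 0 or has wrapped close to overflow: with the
 * unsigned add, the values 0 and 0xffffff81..0xffffffff all land in
 * the range [0, 127].
 */
#define ref_zero_or_close_to_overflow(v) \
	(((unsigned int)(v) + 127u) <= 127u)

int main(void)
{
	atomic_int ref = 1;

	printf("ref=1  suspicious? %d\n",
	       ref_zero_or_close_to_overflow(atomic_load(&ref)));
	atomic_store(&ref, 0);
	printf("ref=0  suspicious? %d\n",
	       ref_zero_or_close_to_overflow(atomic_load(&ref)));
	atomic_store(&ref, -1);	/* underflow after one put too many */
	printf("ref=-1 suspicious? %d\n",
	       ref_zero_or_close_to_overflow(atomic_load(&ref)));
	return 0;
}
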
Reviewed-by: Keith Busch <kbusch@kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-flush.c | 4 ++-- block/blk-mq-tag.c | 2 +- block/blk-mq.c | 12 ++++++------ block/blk.h | 31 +++++++++++++++++++++++++++++++ include/linux/blk-mq.h | 2 +- 5 files changed, 41 insertions(+), 10 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index f78bb39e589e..e4df894189ce 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -229,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (!refcount_dec_and_test(&flush_rq->ref)) { + if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return; @@ -349,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, * and READ flush_rq->end_io */ smp_wmb(); - refcount_set(&flush_rq->ref, 1); + req_ref_set(flush_rq, 1); blk_flush_queue_rq(flush_rq, false); } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 995336abee33..380e2dd31bfc 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -228,7 +228,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; - if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref)) + if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; spin_unlock_irqrestore(&tags->lock, flags); return rq; diff --git a/block/blk-mq.c b/block/blk-mq.c index fc4520e992b1..8c7cab75229e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -394,7 +394,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); - refcount_set(&rq->ref, 1); + req_ref_set(rq, 1); if (rq->rq_flags & RQF_ELV) { struct elevator_queue *e = data->q->elevator; @@ -642,7 +642,7 @@ void blk_mq_free_request(struct request *rq) rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); - if (refcount_dec_and_test(&rq->ref)) + if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -938,7 +938,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) rq_qos_done(rq->q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); - if (!refcount_dec_and_test(&rq->ref)) + if (!req_ref_put_and_test(rq)) continue; blk_crypto_free_request(rq); @@ -1401,7 +1401,7 @@ void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) + else if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } @@ -3049,7 +3049,7 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { - WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } @@ -3383,7 +3383,7 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, if (!tags) return; - WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); diff --git a/block/blk.h b/block/blk.h index a55d82c3d1c2..24d8b333bb03 100644 --- a/block/blk.h +++ b/block/blk.h @@ -461,4 +461,35 @@ static inline bool should_fail_request(struct block_device *part, } #endif /* CONFIG_FAIL_MAKE_REQUEST */ +/* + * Optimized request 
reference counting. Ideally we'd make timeouts be more + * clever, as that's the only reason we need references at all... But until + * this happens, this is faster than using refcount_t. Also see: + * + * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count") + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct request *req) +{ + return atomic_inc_not_zero(&req->ref); +} + +static inline bool req_ref_put_and_test(struct request *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->ref); +} + +static inline void req_ref_set(struct request *req, int value) +{ + atomic_set(&req->ref, value); +} + +static inline int req_ref_read(struct request *req) +{ + return atomic_read(&req->ref); +} + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1b87b7c8bbff..561beb5be7ec 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -139,7 +139,7 @@ struct request { unsigned short ioprio; enum mq_rq_state state; - refcount_t ref; + atomic_t ref; unsigned long deadline; From 2a904d00855f94cb85751e45fa494f225d44ae0d Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Fri, 3 Dec 2021 21:15:31 +0800 Subject: [PATCH 080/132] blk-mq: remove hctx_lock and hctx_unlock Remove hctx_lock and hctx_unlock, and add one helper of blk_mq_run_dispatch_ops() to run code block defined in dispatch_ops with rcu/srcu read held. Compared with hctx_lock()/hctx_unlock(): 1) remove 2 branch to 1, so we just need to check (hctx->flags & BLK_MQ_F_BLOCKING) once when running one dispatch_ops 2) srcu_idx needn't to be touched in case of non-blocking 3) might_sleep_if() can be moved to the blocking branch Also put the added blk_mq_run_dispatch_ops() in private header, so that the following patch can use it out of blk-mq.c. Signed-off-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20211203131534.3668411-2-ming.lei@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 57 +++++++++----------------------------------------- block/blk-mq.h | 16 ++++++++++++++ 2 files changed, 26 insertions(+), 47 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8c7cab75229e..494da31dc1a5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1071,26 +1071,6 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) - __releases(hctx->srcu) -{ - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) - rcu_read_unlock(); - else - srcu_read_unlock(hctx->srcu, srcu_idx); -} - -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) - __acquires(hctx->srcu) -{ - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - /* shut up gcc false positive */ - *srcu_idx = 0; - rcu_read_lock(); - } else - *srcu_idx = srcu_read_lock(hctx->srcu); -} - /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started @@ -1947,19 +1927,13 @@ out: */ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { - int srcu_idx; - /* * We can't run the queue inline with ints disabled. Ensure that * we catch bad users of this early. 
*/ WARN_ON_ONCE(in_interrupt()); - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - blk_mq_sched_dispatch_requests(hctx); - hctx_unlock(hctx, srcu_idx); + blk_mq_run_dispatch_ops(hctx, blk_mq_sched_dispatch_requests(hctx)); } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) @@ -2071,7 +2045,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - int srcu_idx; bool need_run; /* @@ -2082,10 +2055,9 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ - hctx_lock(hctx, &srcu_idx); - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx); - hctx_unlock(hctx, srcu_idx); + blk_mq_run_dispatch_ops(hctx, + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx)); if (need_run) __blk_mq_delay_run_hw_queue(hctx, async, 0); @@ -2488,32 +2460,22 @@ insert: static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { - blk_status_t ret; - int srcu_idx; + blk_status_t ret = + __blk_mq_try_issue_directly(hctx, rq, false, true); - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - - ret = __blk_mq_try_issue_directly(hctx, rq, false, true); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); - - hctx_unlock(hctx, srcu_idx); } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { blk_status_t ret; - int srcu_idx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, true, last); - hctx_unlock(hctx, srcu_idx); - + blk_mq_run_dispatch_ops(hctx, + ret = __blk_mq_try_issue_directly(hctx, rq, true, last)); return ret; } @@ -2826,7 +2788,8 @@ void blk_mq_submit_bio(struct bio *bio) (q->nr_hw_queues == 1 || !is_sync))) blk_mq_sched_insert_request(rq, false, true, true); else - blk_mq_try_issue_directly(rq->mq_hctx, rq); + blk_mq_run_dispatch_ops(rq->mq_hctx, + blk_mq_try_issue_directly(rq->mq_hctx, rq)); } /** diff --git a/block/blk-mq.h b/block/blk-mq.h index d516c7a46f57..e4c396204928 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -374,5 +374,21 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return __blk_mq_active_requests(hctx) < depth; } +/* run the code block in @dispatch_ops with rcu/srcu read lock held */ +#define blk_mq_run_dispatch_ops(hctx, dispatch_ops) \ +do { \ + if (!((hctx)->flags & BLK_MQ_F_BLOCKING)) { \ + rcu_read_lock(); \ + (dispatch_ops); \ + rcu_read_unlock(); \ + } else { \ + int srcu_idx; \ + \ + might_sleep(); \ + srcu_idx = srcu_read_lock((hctx)->srcu); \ + (dispatch_ops); \ + srcu_read_unlock((hctx)->srcu, srcu_idx); \ + } \ +} while (0) #endif From 704b914f15fb7daaf517e3acc4bed472b50ca19e Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Fri, 3 Dec 2021 21:15:32 +0800 Subject: [PATCH 081/132] blk-mq: move srcu from blk_mq_hw_ctx to request_queue In case of BLK_MQ_F_BLOCKING, per-hctx srcu is used to protect dispatch critical area. However, this srcu instance stays at the end of hctx, and it often takes standalone cacheline, often cold. 
Inside srcu_read_lock() and srcu_read_unlock(), WRITE is always done on the indirect percpu variable which is allocated from heap instead of being embedded, srcu->srcu_idx is read only in srcu_read_lock(). It doesn't matter if srcu structure stays in hctx or request queue. So switch to per-request-queue srcu for protecting dispatch, and this way simplifies quiesce a lot, not mention quiesce is always done on the request queue wide. Signed-off-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20211203131534.3668411-3-ming.lei@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 27 ++++++++++++++++++++++----- block/blk-mq-sysfs.c | 2 -- block/blk-mq.c | 37 ++++++++----------------------------- block/blk-mq.h | 4 ++-- block/blk-sysfs.c | 3 ++- block/blk.h | 10 +++++++++- block/genhd.c | 2 +- include/linux/blk-mq.h | 8 -------- include/linux/blkdev.h | 9 +++++++++ 9 files changed, 53 insertions(+), 49 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index b0660c9df852..10619fd83c1b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -66,6 +66,7 @@ DEFINE_IDA(blk_queue_ida); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; +struct kmem_cache *blk_requestq_srcu_cachep; /* * Controlling structure to kblockd @@ -437,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) { struct request_queue *q; int ret; - q = kmem_cache_alloc_node(blk_requestq_cachep, - GFP_KERNEL | __GFP_ZERO, node_id); + q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu), + GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; + if (alloc_srcu) { + blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q); + if (init_srcu_struct(q->srcu) != 0) + goto fail_q; + } + q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) - goto fail_q; + goto fail_srcu; ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); if (ret) @@ -508,8 +515,11 @@ fail_split: bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_srcu: + if (alloc_srcu) + cleanup_srcu_struct(q->srcu); fail_q: - kmem_cache_free(blk_requestq_cachep, q); + kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q); return NULL; } @@ -1301,6 +1311,9 @@ int __init blk_dev_init(void) sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); + BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu), + __alignof__(struct request_queue)) != + sizeof(struct request_queue)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", @@ -1311,6 +1324,10 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu", + sizeof(struct request_queue) + + sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL); + blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 253c857cba47..674786574075 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->srcu); 
blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); diff --git a/block/blk-mq.c b/block/blk-mq.c index 494da31dc1a5..6a2c2704454e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -260,17 +260,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); */ void blk_mq_wait_quiesce_done(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; - bool rcu = false; - - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->srcu); - else - rcu = true; - } - if (rcu) + if (blk_queue_has_srcu(q)) + synchronize_srcu(q->srcu); + else synchronize_rcu(); } EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); @@ -3400,20 +3392,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, } } -static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) -{ - int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), - __alignof__(struct blk_mq_hw_ctx)) != - sizeof(struct blk_mq_hw_ctx)); - - if (tag_set->flags & BLK_MQ_F_BLOCKING) - hw_ctx_size += sizeof(struct srcu_struct); - - return hw_ctx_size; -} - static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) @@ -3451,7 +3429,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; - hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node); + hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; @@ -3493,8 +3471,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, if (!hctx->fq) goto free_bitmap; - if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->srcu); blk_mq_hctx_kobj_init(hctx); return hctx; @@ -3830,7 +3806,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node); + q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING); if (!q) return ERR_PTR(-ENOMEM); q->queuedata = queuedata; @@ -3979,6 +3955,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { + WARN_ON_ONCE(blk_queue_has_srcu(q) != + !!(set->flags & BLK_MQ_F_BLOCKING)); + /* mark the queue as mq asap */ q->mq_ops = set->ops; diff --git a/block/blk-mq.h b/block/blk-mq.h index e4c396204928..792f0b29c6eb 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -385,9 +385,9 @@ do { \ int srcu_idx; \ \ might_sleep(); \ - srcu_idx = srcu_read_lock((hctx)->srcu); \ + srcu_idx = srcu_read_lock((hctx)->queue->srcu); \ (dispatch_ops); \ - srcu_read_unlock((hctx)->srcu, srcu_idx); \ + srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \ } \ } while (0) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4622da4bb992..3e6357321225 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -735,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); - kmem_cache_free(blk_requestq_cachep, q); + + kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); } /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. 
*/ diff --git a/block/blk.h b/block/blk.h index 24d8b333bb03..7ccb7c7d86b3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -27,6 +27,7 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *blk_requestq_srcu_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; @@ -424,7 +425,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); -struct request_queue *blk_alloc_queue(int node_id); +static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu) +{ + if (srcu) + return blk_requestq_srcu_cachep; + return blk_requestq_cachep; +} +struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu); + int disk_scan_partitions(struct gendisk *disk, fmode_t mode); int disk_alloc_events(struct gendisk *disk); diff --git a/block/genhd.c b/block/genhd.c index 5179a4f00fba..3c139a1b6f04 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1338,7 +1338,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node); + q = blk_alloc_queue(node, false); if (!q) return NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 561beb5be7ec..ecdc049b52fa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -4,7 +4,6 @@ #include <linux/blkdev.h> #include <linux/sbitmap.h> -#include <linux/srcu.h> #include <linux/lockdep.h> #include <linux/scatterlist.h> #include <linux/prefetch.h> @@ -375,13 +374,6 @@ struct blk_mq_hw_ctx { * q->unused_hctx_list. */ struct list_head hctx_list; - - /** - * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is - * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also - * blk_mq_hw_ctx_size(). - */ - struct srcu_struct srcu[]; }; /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0a4416ef4fbf..c80cfaefc0a8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -16,6 +16,7 @@ #include <linux/percpu-refcount.h> #include <linux/blkzoned.h> #include <linux/sbitmap.h> +#include <linux/srcu.h> struct module; struct request_queue; @@ -373,11 +374,18 @@ struct request_queue { * devices that do not have multiple independent access ranges. */ struct blk_independent_access_ranges *ia_ranges; + + /** + * @srcu: Sleepable RCU. Use as lock when type of the request queue + * is blocking (BLK_MQ_F_BLOCKING). 
Must be the last member + */ + struct srcu_struct srcu[]; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ +#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ @@ -415,6 +423,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) +#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags) #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) From bcc330f42f442a98d61f153d16c0b6487461ee81 Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Fri, 3 Dec 2021 21:15:33 +0800 Subject: [PATCH 082/132] blk-mq: pass request queue to blk_mq_run_dispatch_ops We have switched to allocate srcu into request queue, so it is fine to pass request queue to blk_mq_run_dispatch_ops(). Signed-off-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20211203131534.3668411-4-ming.lei@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 9 +++++---- block/blk-mq.h | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 6a2c2704454e..24c65bb8719b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1925,7 +1925,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ WARN_ON_ONCE(in_interrupt()); - blk_mq_run_dispatch_ops(hctx, blk_mq_sched_dispatch_requests(hctx)); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) @@ -2047,7 +2048,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. 
*/ - blk_mq_run_dispatch_ops(hctx, + blk_mq_run_dispatch_ops(hctx->queue, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); @@ -2466,7 +2467,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) blk_status_t ret; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - blk_mq_run_dispatch_ops(hctx, + blk_mq_run_dispatch_ops(rq->q, ret = __blk_mq_try_issue_directly(hctx, rq, true, last)); return ret; } @@ -2780,7 +2781,7 @@ void blk_mq_submit_bio(struct bio *bio) (q->nr_hw_queues == 1 || !is_sync))) blk_mq_sched_insert_request(rq, false, true, true); else - blk_mq_run_dispatch_ops(rq->mq_hctx, + blk_mq_run_dispatch_ops(rq->q, blk_mq_try_issue_directly(rq->mq_hctx, rq)); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 792f0b29c6eb..d62004e2d531 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -375,9 +375,9 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ -#define blk_mq_run_dispatch_ops(hctx, dispatch_ops) \ +#define blk_mq_run_dispatch_ops(q, dispatch_ops) \ do { \ - if (!((hctx)->flags & BLK_MQ_F_BLOCKING)) { \ + if (!blk_queue_has_srcu(q)) { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ @@ -385,9 +385,9 @@ do { \ int srcu_idx; \ \ might_sleep(); \ - srcu_idx = srcu_read_lock((hctx)->queue->srcu); \ + srcu_idx = srcu_read_lock((q)->srcu); \ (dispatch_ops); \ - srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \ + srcu_read_unlock((q)->srcu, srcu_idx); \ } \ } while (0) From 4cafe86c9267f9dd5819df946ba8c038ba958370 Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Fri, 3 Dec 2021 21:15:34 +0800 Subject: [PATCH 083/132] blk-mq: run dispatch lock once in case of issuing from list It isn't necessary to call blk_mq_run_dispatch_ops() once for issuing single request directly, and enough to do it one time when issuing from whole list. Signed-off-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20211203131534.3668411-5-ming.lei@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq-sched.c | 3 ++- block/blk-mq.c | 14 ++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0d7257848f7e..55488ba97823 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -475,7 +475,8 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, * us one extra enqueue & dequeue to sw queue. 
*/ if (!hctx->dispatch_busy && !run_queue_async) { - blk_mq_try_issue_list_directly(hctx, list); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 24c65bb8719b..22ec21aa0c22 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2464,12 +2464,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { - blk_status_t ret; - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - blk_mq_run_dispatch_ops(rq->q, - ret = __blk_mq_try_issue_directly(hctx, rq, true, last)); - return ret; + return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last); } static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) @@ -2526,7 +2521,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - blk_mq_plug_issue_direct(plug, false); + blk_mq_run_dispatch_ops(plug->mq_list->q, + blk_mq_plug_issue_direct(plug, false)); if (rq_list_empty(plug->mq_list)) return; } @@ -2867,7 +2863,9 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - return blk_mq_request_issue_directly(rq, true); + blk_mq_run_dispatch_ops(rq->q, + ret = blk_mq_request_issue_directly(rq, true)); + return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); From 41adf531e390e7969f00a560b8971cbf42f5a6da Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Mon, 6 Dec 2021 19:12:13 +0800 Subject: [PATCH 084/132] blk-mq: don't run might_sleep() if the operation needn't blocking The operation protected via blk_mq_run_dispatch_ops() in blk_mq_run_hw_queue won't sleep, so don't run might_sleep() for it. Reported-and-tested-by: Marek Szyprowski <m.szyprowski@samsung.com> Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 2 +- block/blk-mq.h | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 22ec21aa0c22..706e9a836fe6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2048,7 +2048,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. 
*/ - blk_mq_run_dispatch_ops(hctx->queue, + __blk_mq_run_dispatch_ops(hctx->queue, false, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); diff --git a/block/blk-mq.h b/block/blk-mq.h index d62004e2d531..948791ea2a3e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -375,7 +375,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ -#define blk_mq_run_dispatch_ops(q, dispatch_ops) \ +#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if (!blk_queue_has_srcu(q)) { \ rcu_read_lock(); \ @@ -384,11 +384,14 @@ do { \ } else { \ int srcu_idx; \ \ - might_sleep(); \ + might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock((q)->srcu); \ (dispatch_ops); \ srcu_read_unlock((q)->srcu, srcu_idx); \ } \ } while (0) +#define blk_mq_run_dispatch_ops(q, dispatch_ops) \ + __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ + #endif From 73f3760eddc9bc32c207fff06537f98f94bef451 Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Mon, 6 Dec 2021 11:33:50 +0800 Subject: [PATCH 085/132] blk-mq: don't use plug->mq_list->q directly in blk_mq_run_dispatch_ops() blk_mq_run_dispatch_ops() is defined as one macro, and plug->mq_list will be changed when running 'dispatch_ops', so add one local variable for holding request queue. Reported-and-tested-by: Yi Zhang <yi.zhang@redhat.com> Fixes: 4cafe86c9267 ("blk-mq: run dispatch lock once in case of issuing from list") Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 706e9a836fe6..0bf3523dd1f5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2521,7 +2521,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - blk_mq_run_dispatch_ops(plug->mq_list->q, + struct request_queue *q = rq_list_peek(&plug->mq_list)->q; + + blk_mq_run_dispatch_ops(q, blk_mq_plug_issue_direct(plug, false)); if (rq_list_empty(plug->mq_list)) return; From 8ab30a331946c34e4ba022c44df8624acea1c74e Mon Sep 17 00:00:00 2001 From: John Garry <john.garry@huawei.com> Date: Mon, 6 Dec 2021 20:49:48 +0800 Subject: [PATCH 086/132] blk-mq: Drop busy_iter_fn blk_mq_hw_ctx argument The only user of blk_mq_hw_ctx blk_mq_hw_ctx argument is blk_mq_rq_inflight(). Function blk_mq_rq_inflight() uses the hctx to find the associated request queue to match against the request. However this same check is already done in caller bt_iter(), so drop this check. With that change there are no more users of busy_iter_fn blk_mq_hw_ctx argument, so drop the argument. 
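A minimal sketch of an iterator callback written against the new prototype, using made-up foo_ names rather than anything from this series:

static bool foo_count_started(struct request *rq, void *priv, bool reserved)
{
	unsigned int *nr_started = priv;

	/* no hctx argument any more; bt_iter() has already matched the hctx */
	if (blk_mq_request_started(rq))
		(*nr_started)++;

	return true;	/* keep iterating */
}

An in-block caller would then pass it straight to blk_mq_queue_tag_busy_iter(q, foo_count_started, &nr_started).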
Reviewed-by Hannes Reinecke <hare@suse.de> Signed-off-by: John Garry <john.garry@huawei.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> Tested-by: Kashyap Desai <kashyap.desai@broadcom.com> Link: https://lore.kernel.org/r/1638794990-137490-2-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq-tag.c | 2 +- block/blk-mq.c | 17 ++++++++--------- include/linux/blk-mq.h | 3 +-- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 380e2dd31bfc..d3cf91d764d5 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -254,7 +254,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) return true; if (rq->q == hctx->queue && rq->mq_hctx == hctx) - ret = iter_data->fn(hctx, rq, iter_data->data, reserved); + ret = iter_data->fn(rq, iter_data->data, reserved); blk_mq_put_rq_ref(rq); return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 0bf3523dd1f5..103c0f58853c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -127,8 +127,7 @@ struct mq_inflight { unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, +static bool blk_mq_check_inflight(struct request *rq, void *priv, bool reserved) { struct mq_inflight *mi = priv; @@ -1308,14 +1307,15 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, - void *priv, bool reserved) +static bool blk_mq_rq_inflight(struct request *rq, void *priv, + bool reserved) { /* - * If we find a request that isn't idle and the queue matches, - * we know the queue is busy. Return false to stop the iteration. + * If we find a request that isn't idle we know the queue is busy + * as it's checked in the iter. + * Return false to stop the iteration. */ - if (blk_mq_request_started(rq) && rq->q == hctx->queue) { + if (blk_mq_request_started(rq)) { bool *busy = priv; *busy = true; @@ -1377,8 +1377,7 @@ void blk_mq_put_rq_ref(struct request *rq) __blk_mq_free_request(rq); } -static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, bool reserved) +static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved) { unsigned long *next = priv; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ecdc049b52fa..17ebf29e42d8 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -470,8 +470,7 @@ struct blk_mq_queue_data { bool last; }; -typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, - bool); +typedef bool (busy_iter_fn)(struct request *, void *, bool); typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); /** From fc39f8d2d1c10ac04976b0a247865bb0cec4dd88 Mon Sep 17 00:00:00 2001 From: John Garry <john.garry@huawei.com> Date: Mon, 6 Dec 2021 20:49:49 +0800 Subject: [PATCH 087/132] blk-mq: Delete busy_iter_fn Typedefs busy_iter_fn and busy_tag_iter_fn are now identical, so delete busy_iter_fn to reduce duplication. It would be nicer to delete busy_tag_iter_fn, as the name busy_iter_fn is less specific. However busy_tag_iter_fn is used in many different parts of the tree, unlike busy_iter_fn which is just use in block/, so just take the straightforward path now, so that we could rename later treewide. 
Signed-off-by: John Garry <john.garry@huawei.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Tested-by: Kashyap Desai <kashyap.desai@broadcom.com> Link: https://lore.kernel.org/r/1638794990-137490-3-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq-tag.c | 6 +++--- block/blk-mq-tag.h | 2 +- include/linux/blk-mq.h | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d3cf91d764d5..58b80d4b7a07 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -215,7 +215,7 @@ void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) struct bt_iter_data { struct blk_mq_hw_ctx *hctx; - busy_iter_fn *fn; + busy_tag_iter_fn *fn; void *data; bool reserved; }; @@ -274,7 +274,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * bitmap_tags member of struct blk_mq_tags. */ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - busy_iter_fn *fn, void *data, bool reserved) + busy_tag_iter_fn *fn, void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, @@ -457,7 +457,7 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { struct blk_mq_hw_ctx *hctx; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index df787b5a23bd..5668e28be0b7 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -28,7 +28,7 @@ extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 17ebf29e42d8..772f8f921526 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -470,7 +470,6 @@ struct blk_mq_queue_data { bool last; }; -typedef bool (busy_iter_fn)(struct request *, void *, bool); typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); /** From fea9f92f1748083cb82049ed503be30c3d3a9b69 Mon Sep 17 00:00:00 2001 From: John Garry <john.garry@huawei.com> Date: Mon, 6 Dec 2021 20:49:50 +0800 Subject: [PATCH 088/132] blk-mq: Optimise blk_mq_queue_tag_busy_iter() for shared tags Kashyap reports high CPU usage in blk_mq_queue_tag_busy_iter() and callees using megaraid SAS RAID card since moving to shared tags [0]. Previously, when shared tags was shared sbitmap, this function was less than optimum since we would iter through all tags for all hctx's, yet only ever match upto tagset depth number of rqs. Since the change to shared tags, things are even less efficient if we have parallel callers of blk_mq_queue_tag_busy_iter(). This is because in bt_iter() -> blk_mq_find_and_get_req() there would be more contention on accessing each request ref and tags->lock since they are now shared among all HW queues. Optimise by having separate calls to bt_for_each() for when we're using shared tags. 
In this case no longer pass a hctx, as it is no longer relevant, and teach bt_iter() about this. Ming suggested something along the lines of this change, apart from a different implementation. [0] https://lore.kernel.org/linux-block/e4e92abbe9d52bcba6b8cc6c91c442cc@mail.gmail.com/ Signed-off-by: John Garry <john.garry@huawei.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Ming Lei <ming.lei@redhat.com> Reported-and-tested-by: Kashyap Desai <kashyap.desai@broadcom.com> Fixes: e155b0c238b2 ("blk-mq: Use shared tags for shared sbitmap support") Link: https://lore.kernel.org/r/1638794990-137490-4-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq-tag.c | 59 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 58b80d4b7a07..e55a6834c9a6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -215,6 +215,7 @@ void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) struct bt_iter_data { struct blk_mq_hw_ctx *hctx; + struct request_queue *q; busy_tag_iter_fn *fn; void *data; bool reserved; @@ -238,11 +239,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; - struct blk_mq_tags *tags = hctx->tags; + struct request_queue *q = iter_data->q; + struct blk_mq_tag_set *set = q->tag_set; bool reserved = iter_data->reserved; + struct blk_mq_tags *tags; struct request *rq; bool ret = true; + if (blk_mq_is_shared_tags(set->flags)) + tags = set->shared_tags; + else + tags = hctx->tags; + if (!reserved) bitnr += tags->nr_reserved_tags; /* @@ -253,7 +261,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) if (!rq) return true; - if (rq->q == hctx->queue && rq->mq_hctx == hctx) + if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) ret = iter_data->fn(rq, iter_data->data, reserved); blk_mq_put_rq_ref(rq); return ret; @@ -262,6 +270,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. + * @q: Request queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request @@ -273,14 +282,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ -static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - busy_tag_iter_fn *fn, void *data, bool reserved) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, + struct sbitmap_queue *bt, busy_tag_iter_fn *fn, + void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, + .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); @@ -460,9 +471,6 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { - struct blk_mq_hw_ctx *hctx; - int i; - /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * while the queue is frozen. 
So we can use q_usage_counter to avoid @@ -471,19 +479,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, if (!percpu_ref_tryget(&q->q_usage_counter)) return; - queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_tags *tags = hctx->tags; - - /* - * If no software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + struct blk_mq_tags *tags = q->tag_set->shared_tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); + bt_for_each(NULL, q, bresv, fn, priv, true); + bt_for_each(NULL, q, btags, fn, priv, false); + } else { + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; + + /* + * If no software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, q, bresv, fn, priv, true); + bt_for_each(hctx, q, btags, fn, priv, false); + } } blk_queue_exit(q); } From 17f81f9d4b41d57e474975c3a5ca2a2c4c01e2ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Mon, 6 Dec 2021 08:04:09 +0100 Subject: [PATCH 089/132] mtd_blkdevs: don't scan partitions for plain mtdblock mtdblock / mtdblock_ro set part_bits to 0 and thus nevever scanned partitions. Restore that behavior by setting the GENHD_FL_NO_PART flag. Fixes: 1ebe2e5f9d68e94c ("block: remove GENHD_FL_EXT_DEVT") Reported-by: Geert Uytterhoeven <geert+renesas@glider.be> Signed-off-by: Christoph Hellwig <hch@lst.de> Tested-by: Geert Uytterhoeven <geert+renesas@glider.be> Link: https://lore.kernel.org/r/20211206070409.2836165-1-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/mtd/mtd_blkdevs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 113f86df7603..243f28a3206b 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -346,7 +346,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) gd->minors = 1 << tr->part_bits; gd->fops = &mtd_block_ops; - if (tr->part_bits) + if (tr->part_bits) { if (new->devnum < 26) snprintf(gd->disk_name, sizeof(gd->disk_name), "%s%c", tr->name, 'a' + new->devnum); @@ -355,9 +355,11 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) "%s%c%c", tr->name, 'a' - 1 + new->devnum / 26, 'a' + new->devnum % 26); - else + } else { snprintf(gd->disk_name, sizeof(gd->disk_name), "%s%d", tr->name, new->devnum); + gd->flags |= GENHD_FL_NO_PART; + } set_capacity(gd, ((u64)new->size * tr->blksize) >> 9); From 0ba4566cd8a4e645b542e6ddbe3dd26c85ad2408 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" <willy@infradead.org> Date: Mon, 13 Dec 2021 17:11:13 +0000 Subject: [PATCH 090/132] bdev: Improve lookup_bdev documentation Add a Context section and rewrite the rest to be clearer. 
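As a usage reminder for the documented contract, a typical call site looks roughly like the sketch below; the foo_ helper and the printed message are illustrative only:

static int foo_resolve_bdev(const char *path)
{
	dev_t dev;
	int err;

	err = lookup_bdev(path, &dev);	/* may sleep; returns 0 or a negative errno */
	if (err)
		return err;

	pr_info("%s is device %u:%u\n", path, MAJOR(dev), MINOR(dev));
	return 0;
}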
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Nikolay Borisov <nborisov@suse.com> Link: https://lore.kernel.org/r/20211213171113.3097631-1-willy@infradead.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bdev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 587645231d60..8bf93a19041b 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -955,15 +955,15 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) EXPORT_SYMBOL(blkdev_put); /** - * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device - * @dev: return value of the block device's dev_t + * lookup_bdev() - Look up a struct block_device by name. + * @pathname: Name of the block device in the filesystem. + * @dev: Pointer to the block device's dev_t, if found. * * Lookup the block device's dev_t at @pathname in the current - * namespace if possible and return it by @dev. + * namespace if possible and return it in @dev. * - * RETURNS: - * 0 if succeeded, errno otherwise. + * Context: May sleep. + * Return: 0 if succeeded, negative errno otherwise. */ int lookup_bdev(const char *pathname, dev_t *dev) { From 68497092bde9f53e35cafeb52fa9a267ebe0d9b1 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Tue, 14 Dec 2021 17:23:05 -0700 Subject: [PATCH 091/132] block: make queue stat accounting a reference kyber turns on IO statistics when it is loaded on a queue, which means that even if kyber is then later unloaded, we're still stuck with stats enabled on the queue. Change the account enabled from a bool to an int, and pair the enable call with the equivalent disable call. This ensures that stats gets turned off again appropriately. 
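The accounting field now behaves like a plain reference count; an illustrative sequence (ignoring stat callbacks, which also set the flag):

	blk_stat_enable_accounting(q);	/* 0 -> 1, QUEUE_FLAG_STATS is set */
	blk_stat_enable_accounting(q);	/* 1 -> 2, flag already set */
	blk_stat_disable_accounting(q);	/* 2 -> 1, flag stays set */
	blk_stat_disable_accounting(q);	/* 1 -> 0, QUEUE_FLAG_STATS is cleared */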
Reviewed-by: Omar Sandoval <osandov@fb.com> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-stat.c | 21 ++++++++++++++++----- block/blk-stat.h | 1 + block/kyber-iosched.c | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/block/blk-stat.c b/block/blk-stat.c index efb2a80db906..2ea01b5c1aca 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -15,7 +15,7 @@ struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; - bool enable_accounting; + int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) @@ -161,7 +161,7 @@ void blk_stat_remove_callback(struct request_queue *q, spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); - if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) + if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); @@ -184,13 +184,24 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } +void blk_stat_disable_accounting(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); + if (!--q->stats->accounting) + blk_queue_flag_clear(QUEUE_FLAG_STATS, q); + spin_unlock_irqrestore(&q->stats->lock, flags); +} +EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); + void blk_stat_enable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); - q->stats->enable_accounting = true; - blk_queue_flag_set(QUEUE_FLAG_STATS, q); + if (!q->stats->accounting++) + blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); @@ -205,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void) INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); - stats->enable_accounting = false; + stats->accounting = 0; return stats; } diff --git a/block/blk-stat.h b/block/blk-stat.h index 58f029af49e5..17e1eb4ec7e2 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -70,6 +70,7 @@ void blk_stat_add(struct request *rq, u64 now); /* record time/size info in request but not add a callback */ void blk_stat_enable_accounting(struct request_queue *q); +void blk_stat_disable_accounting(struct request_queue *q); /** * blk_stat_alloc_callback() - Allocate a block statistics callback. diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index fdd74a4df56f..70ff2a599ef6 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -433,6 +433,7 @@ static void kyber_exit_sched(struct elevator_queue *e) int i; del_timer_sync(&kqd->timer); + blk_stat_disable_accounting(kqd->q); for (i = 0; i < KYBER_NUM_DOMAINS; i++) sbitmap_queue_free(&kqd->domain_tokens[i]); From 5581a5ddfe8d0ede1749bae1f7662fa739f83813 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Wed, 1 Dec 2021 15:01:51 -0700 Subject: [PATCH 092/132] block: add completion handler for fast path The batched completions only deal with non-partial requests anyway, and it doesn't deal with any requests that have errors. Add a completion handler that assumes it's a full request and that it's all being ended successfully. 
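The fast path is reached through blk_mq_end_request_batch(); a driver's batch completion callback is typically a thin wrapper around it, roughly as in this sketch (hypothetical foo_ driver, not part of this patch):

static void foo_complete_batch(struct io_comp_batch *iob)
{
	/* per-request driver teardown (DMA unmap etc.) would go here */
	blk_mq_end_request_batch(iob);
}

static void foo_complete_one(struct request *req, struct io_comp_batch *iob,
			     int error)
{
	/* requests with errors are not batched and fall back to the regular path */
	if (!blk_mq_add_to_batch(req, iob, error, foo_complete_batch))
		blk_mq_complete_request(req);
}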
Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 103c0f58853c..75154cc788db 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -709,6 +709,47 @@ static void blk_print_req_error(struct request *req, blk_status_t status) IOPRIO_PRIO_CLASS(req->ioprio)); } +/* + * Fully end IO on a request. Does not support partial completions, or + * errors. + */ +static void blk_complete_request(struct request *req) +{ + const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; + int total_bytes = blk_rq_bytes(req); + struct bio *bio = req->bio; + + trace_block_rq_complete(req, BLK_STS_OK, total_bytes); + + if (!bio) + return; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) + req->q->integrity.profile->complete_fn(req, total_bytes); +#endif + + blk_account_io_completion(req, total_bytes); + + do { + struct bio *next = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + if (!is_flush) + bio_endio(bio); + bio = next; + } while (bio); + + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + req->bio = NULL; + req->__data_len = 0; +} + /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed @@ -922,7 +963,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) prefetch(rq->bio); prefetch(rq->rq_next); - blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq)); + blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); From fcade2ce06ffebee5c2f6629ddbf2086c0f5ba5a Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Wed, 1 Dec 2021 16:19:18 -0700 Subject: [PATCH 093/132] block: use singly linked list for bio cache Pointless to maintain a head/tail for the list, as we never need to access the tail. Entries are always LIFO for cache hotness reasons. 
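The resulting free list is the textbook singly linked LIFO, sketched below for clarity; both operations touch only the head, which is why no tail pointer is kept:

	/* put: the just-freed bio becomes the new head (still cache hot) */
	bio->bi_next = cache->free_list;
	cache->free_list = bio;

	/* get: allocation takes the same head entry */
	bio = cache->free_list;
	if (bio)
		cache->free_list = bio->bi_next;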
Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/bio.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index 15ab0d6d1c06..6fadc977cd7f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -26,7 +26,7 @@ #include "blk-rq-qos.h" struct bio_alloc_cache { - struct bio_list free_list; + struct bio *free_list; unsigned int nr; }; @@ -630,7 +630,8 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int i = 0; struct bio *bio; - while ((bio = bio_list_pop(&cache->free_list)) != NULL) { + while ((bio = cache->free_list) != NULL) { + cache->free_list = bio->bi_next; cache->nr--; bio_free(bio); if (++i == nr) @@ -689,7 +690,8 @@ void bio_put(struct bio *bio) bio_uninit(bio); cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); - bio_list_add_head(&cache->free_list, bio); + bio->bi_next = cache->free_list; + cache->free_list = bio; if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); put_cpu(); @@ -1704,8 +1706,9 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); cache = per_cpu_ptr(bs->cache, get_cpu()); - bio = bio_list_pop(&cache->free_list); - if (bio) { + if (cache->free_list) { + bio = cache->free_list; + cache->free_list = bio->bi_next; cache->nr--; put_cpu(); bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs); From 3c67d44de787dff288d7f2a51c372b22f7356db6 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Fri, 3 Dec 2021 06:48:53 -0700 Subject: [PATCH 094/132] block: add mq_ops->queue_rqs hook If we have a list of requests in our plug list, send it to the driver in one go, if possible. The driver must set mq_ops->queue_rqs() to support this, if not the usual one-by-one path is used. Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 26 +++++++++++++++++++++++--- include/linux/blk-mq.h | 8 ++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 75154cc788db..51991232824a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2553,6 +2553,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct blk_mq_hw_ctx *this_hctx; struct blk_mq_ctx *this_ctx; + struct request *rq; unsigned int depth; LIST_HEAD(list); @@ -2561,7 +2562,28 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - struct request_queue *q = rq_list_peek(&plug->mq_list)->q; + struct request_queue *q; + + rq = rq_list_peek(&plug->mq_list); + q = rq->q; + + /* + * Peek first request and see if we have a ->queue_rqs() hook. + * If we do, we can dispatch the whole plug list in one go. We + * already know at this point that all requests belong to the + * same queue, caller must ensure that's the case. + * + * Since we pass off the full list to the driver at this point, + * we do not increment the active request count for the queue. + * Bypass shared tags for now because of that. 
+ */ + if (q->mq_ops->queue_rqs && + !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + blk_mq_run_dispatch_ops(q, + q->mq_ops->queue_rqs(&plug->mq_list)); + if (rq_list_empty(plug->mq_list)) + return; + } blk_mq_run_dispatch_ops(q, blk_mq_plug_issue_direct(plug, false)); @@ -2573,8 +2595,6 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) this_ctx = NULL; depth = 0; do { - struct request *rq; - rq = rq_list_pop(&plug->mq_list); if (!this_hctx) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 772f8f921526..550996cf419c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -492,6 +492,14 @@ struct blk_mq_ops { */ void (*commit_rqs)(struct blk_mq_hw_ctx *); + /** + * @queue_rqs: Queue a list of new requests. Driver is guaranteed + * that each request belongs to the same queue. If the driver doesn't + * empty the @rqlist completely, then the rest will be queued + * individually by the block layer upon return. + */ + void (*queue_rqs)(struct request **rqlist); + /** * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the From 3233b94cf842984ea7e208d5be1ad2f2af02d495 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Fri, 29 Oct 2021 14:32:44 -0600 Subject: [PATCH 095/132] nvme: split command copy into a helper We'll need it for batched submit as well. Since we now have a copy helper, get rid of the nvme_submit_cmd() wrapper. Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/nvme/host/pci.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8637538f3fd5..2009f8c047a2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -500,22 +500,13 @@ static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) nvmeq->last_sq_tail = nvmeq->sq_tail; } -/** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell - * @nvmeq: The queue to use - * @cmd: The command to send - * @write_sq: whether to write to the SQ doorbell - */ -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, - bool write_sq) +static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd) { - spin_lock(&nvmeq->sq_lock); memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), - cmd, sizeof(*cmd)); + absolute_pointer(cmd), sizeof(*cmd)); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; - nvme_write_sq_db(nvmeq, write_sq); - spin_unlock(&nvmeq->sq_lock); } static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) @@ -957,7 +948,10 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } blk_mq_start_request(req); - nvme_submit_cmd(nvmeq, cmnd, bd->last); + spin_lock(&nvmeq->sq_lock); + nvme_sq_copy_cmd(nvmeq, &iod->cmd); + nvme_write_sq_db(nvmeq, bd->last); + spin_unlock(&nvmeq->sq_lock); return BLK_STS_OK; out_unmap_data: nvme_unmap_data(dev, req); @@ -1140,7 +1134,11 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) c.common.opcode = nvme_admin_async_event; c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; - nvme_submit_cmd(nvmeq, &c, true); + + spin_lock(&nvmeq->sq_lock); + nvme_sq_copy_cmd(nvmeq, &c); + nvme_write_sq_db(nvmeq, true); + spin_unlock(&nvmeq->sq_lock); } static int 
adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) From 62451a2b2e7ea17c4a547ada6a5deebf8787a27a Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Fri, 29 Oct 2021 14:34:11 -0600 Subject: [PATCH 096/132] nvme: separate command prep and issue Add a nvme_prep_rq() helper to setup a command, and nvme_queue_rq() is adapted to use this helper. Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/nvme/host/pci.c | 65 +++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2009f8c047a2..081abbe04f29 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -903,24 +903,52 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, return BLK_STS_OK; } +static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + blk_status_t ret; + + iod->aborted = 0; + iod->npages = -1; + iod->nents = 0; + + ret = nvme_setup_cmd(req->q->queuedata, req); + if (ret) + return ret; + + if (blk_rq_nr_phys_segments(req)) { + ret = nvme_map_data(dev, req, &iod->cmd); + if (ret) + goto out_free_cmd; + } + + if (blk_integrity_rq(req)) { + ret = nvme_map_metadata(dev, req, &iod->cmd); + if (ret) + goto out_unmap_data; + } + + blk_mq_start_request(req); + return BLK_STS_OK; +out_unmap_data: + nvme_unmap_data(dev, req); +out_free_cmd: + nvme_cleanup_cmd(req); + return ret; +} + /* * NOTE: ns is NULL when called on the admin queue. */ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { - struct nvme_ns *ns = hctx->queue->queuedata; struct nvme_queue *nvmeq = hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_command *cmnd = &iod->cmd; blk_status_t ret; - iod->aborted = 0; - iod->npages = -1; - iod->nents = 0; - /* * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue. @@ -928,36 +956,17 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; - if (!nvme_check_ready(&dev->ctrl, req, true)) + if (unlikely(!nvme_check_ready(&dev->ctrl, req, true))) return nvme_fail_nonready_command(&dev->ctrl, req); - ret = nvme_setup_cmd(ns, req); - if (ret) + ret = nvme_prep_rq(dev, req); + if (unlikely(ret)) return ret; - - if (blk_rq_nr_phys_segments(req)) { - ret = nvme_map_data(dev, req, cmnd); - if (ret) - goto out_free_cmd; - } - - if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req, cmnd); - if (ret) - goto out_unmap_data; - } - - blk_mq_start_request(req); spin_lock(&nvmeq->sq_lock); nvme_sq_copy_cmd(nvmeq, &iod->cmd); nvme_write_sq_db(nvmeq, bd->last); spin_unlock(&nvmeq->sq_lock); return BLK_STS_OK; -out_unmap_data: - nvme_unmap_data(dev, req); -out_free_cmd: - nvme_cleanup_cmd(req); - return ret; } static __always_inline void nvme_pci_unmap_rq(struct request *req) From d62cbcf62f2f4bf933d18113bcda45e18cc890f6 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Thu, 18 Nov 2021 08:37:30 -0700 Subject: [PATCH 097/132] nvme: add support for mq_ops->queue_rqs() This enables the block layer to send us a full plug list of requests that need submitting. 
The block layer guarantees that they all belong to the same queue, but we do have to check the hardware queue mapping for each request. If errors are encountered, leave them in the passed in list. Then the block layer will handle them individually. This is good for about a 4% improvement in peak performance, taking us from 9.6M to 10M IOPS/core. Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Keith Busch <kbusch@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/nvme/host/pci.c | 59 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 081abbe04f29..50deb8b69c40 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -969,6 +969,64 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist) +{ + spin_lock(&nvmeq->sq_lock); + while (!rq_list_empty(*rqlist)) { + struct request *req = rq_list_pop(rqlist); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + nvme_sq_copy_cmd(nvmeq, &iod->cmd); + } + nvme_write_sq_db(nvmeq, true); + spin_unlock(&nvmeq->sq_lock); +} + +static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req) +{ + /* + * We should not need to do this, but we're still using this to + * ensure we can drain requests on a dying queue. + */ + if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) + return false; + if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true))) + return false; + + req->mq_hctx->tags->rqs[req->tag] = req; + return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK; +} + +static void nvme_queue_rqs(struct request **rqlist) +{ + struct request *req = rq_list_peek(rqlist), *prev = NULL; + struct request *requeue_list = NULL; + + do { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + + if (!nvme_prep_rq_batch(nvmeq, req)) { + /* detach 'req' and add to remainder list */ + if (prev) + prev->rq_next = req->rq_next; + rq_list_add(&requeue_list, req); + } else { + prev = req; + } + + req = rq_list_next(req); + if (!req || (prev && req->mq_hctx != prev->mq_hctx)) { + /* detach rest of list, and submit */ + if (prev) + prev->rq_next = NULL; + nvme_submit_cmds(nvmeq, rqlist); + *rqlist = req; + } + } while (req); + + *rqlist = requeue_list; +} + static __always_inline void nvme_pci_unmap_rq(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1670,6 +1728,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { static const struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, + .queue_rqs = nvme_queue_rqs, .complete = nvme_pci_complete_rq, .commit_rqs = nvme_commit_rqs, .init_hctx = nvme_init_hctx, From 8a2ba1785c5803d59a63b6320ff54fd4a37a41ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:21 +0100 Subject: [PATCH 098/132] block: remove the nr_task field from struct io_context Nothing ever looks at ->nr_tasks, so remove it. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-2-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 3 --- include/linux/iocontext.h | 1 - 2 files changed, 4 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 536fb496ad76..96336c2134ef 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -207,7 +207,6 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - atomic_dec(&ioc->nr_tasks); put_io_context_active(ioc); } @@ -259,7 +258,6 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) return NULL; atomic_long_set(&ioc->refcount, 1); - atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); @@ -339,7 +337,6 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_IO) { atomic_long_inc(&ioc->refcount); atomic_inc(&ioc->active_ref); - atomic_inc(&ioc->nr_tasks); tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index c1229fbd6691..82c7f4f5f4f5 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -99,7 +99,6 @@ struct io_cq { struct io_context { atomic_long_t refcount; atomic_t active_ref; - atomic_t nr_tasks; /* all the fields below are protected by this lock */ spinlock_t lock; From 0aed2f162bbc7853fe91c0d70492ea73c4e9cb07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:22 +0100 Subject: [PATCH 099/132] block: simplify struct io_context refcounting Don't hold a reference to ->refcount for each active reference, but just one for all active references. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-3-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 96336c2134ef..9cde3906be3c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -180,10 +180,8 @@ static void put_io_context_active(struct io_context *ioc) { struct io_cq *icq; - if (!atomic_dec_and_test(&ioc->active_ref)) { - put_io_context(ioc); + if (!atomic_dec_and_test(&ioc->active_ref)) return; - } spin_lock_irq(&ioc->lock); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { @@ -335,7 +333,6 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - atomic_long_inc(&ioc->refcount); atomic_inc(&ioc->active_ref); tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { From 4be8a2eaff2e4473b6e8ad9a3857bc9b1e79c8ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:23 +0100 Subject: [PATCH 100/132] block: refactor put_iocontext_active Factor out a ioc_exit_icqs helper to tear down the icqs and the fold the rest of put_iocontext_active into exit_io_context. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-4-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9cde3906be3c..0380e33930e3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -54,6 +54,16 @@ static void ioc_exit_icq(struct io_cq *icq) icq->flags |= ICQ_EXITED; } +static void ioc_exit_icqs(struct io_context *ioc) +{ + struct io_cq *icq; + + spin_lock_irq(&ioc->lock); + hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) + ioc_exit_icq(icq); + spin_unlock_irq(&ioc->lock); +} + /* * Release an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. @@ -169,32 +179,6 @@ void put_io_context(struct io_context *ioc) } EXPORT_SYMBOL_GPL(put_io_context); -/** - * put_io_context_active - put active reference on ioc - * @ioc: ioc of interest - * - * Put an active reference to an ioc. If active reference reaches zero after - * put, @ioc can never issue further IOs and ioscheds are notified. - */ -static void put_io_context_active(struct io_context *ioc) -{ - struct io_cq *icq; - - if (!atomic_dec_and_test(&ioc->active_ref)) - return; - - spin_lock_irq(&ioc->lock); - hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { - if (icq->flags & ICQ_EXITED) - continue; - - ioc_exit_icq(icq); - } - spin_unlock_irq(&ioc->lock); - - put_io_context(ioc); -} - /* Called by the exiting task */ void exit_io_context(struct task_struct *task) { @@ -205,7 +189,10 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - put_io_context_active(ioc); + if (atomic_dec_and_test(&ioc->active_ref)) { + ioc_exit_icqs(ioc); + put_io_context(ioc); + } } static void __ioc_clear_queue(struct list_head *icq_list) From 8a20c0c7e0cea7eb0c32fd6b63ff514c9ac32b8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:24 +0100 Subject: [PATCH 101/132] block: remove the NULL ioc check in put_io_context No caller passes in a NULL pointer, so remove the check. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-5-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 0380e33930e3..04f3d2b0ca7d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -155,9 +155,6 @@ void put_io_context(struct io_context *ioc) unsigned long flags; bool free_ioc = false; - if (ioc == NULL) - return; - BUG_ON(atomic_long_read(&ioc->refcount) <= 0); /* From edf70ff5a1ed9769da35178454d743828061a6a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:25 +0100 Subject: [PATCH 102/132] block: refactor put_io_context Move the code to delay freeing the icqs into a separate helper. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-6-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 04f3d2b0ca7d..ca996214c10a 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -143,6 +143,24 @@ static void ioc_release_fn(struct work_struct *work) kmem_cache_free(iocontext_cachep, ioc); } +/* + * Releasing icqs requires reverse order double locking and we may already be + * holding a queue_lock. Do it asynchronously from a workqueue. + */ +static bool ioc_delay_free(struct io_context *ioc) +{ + unsigned long flags; + + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) { + queue_work(system_power_efficient_wq, &ioc->release_work); + spin_unlock_irqrestore(&ioc->lock, flags); + return true; + } + spin_unlock_irqrestore(&ioc->lock, flags); + return false; +} + /** * put_io_context - put a reference of io_context * @ioc: io_context to put @@ -152,26 +170,8 @@ static void ioc_release_fn(struct work_struct *work) */ void put_io_context(struct io_context *ioc) { - unsigned long flags; - bool free_ioc = false; - BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - - /* - * Releasing ioc requires reverse order double locking and we may - * already be holding a queue_lock. Do it asynchronously from wq. - */ - if (atomic_long_dec_and_test(&ioc->refcount)) { - spin_lock_irqsave(&ioc->lock, flags); - if (!hlist_empty(&ioc->icq_list)) - queue_work(system_power_efficient_wq, - &ioc->release_work); - else - free_ioc = true; - spin_unlock_irqrestore(&ioc->lock, flags); - } - - if (free_ioc) + if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc)) kmem_cache_free(iocontext_cachep, ioc); } EXPORT_SYMBOL_GPL(put_io_context); From 091abcb3efd71cb18e80c8f040d9e4a634d8906d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:26 +0100 Subject: [PATCH 103/132] block: cleanup ioc_clear_queue Fold __ioc_clear_queue into ioc_clear_queue and switch to always use plain _irq locking instead of the more expensive _irqsave that is not needed here. 
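For reference, the distinction the change relies on, shown as two hypothetical helpers (not part of the patch): the _irqsave variant records and restores the caller's interrupt state, so it is safe from any context, while the plain _irq variant unconditionally re-enables interrupts on unlock and is therefore only valid where interrupts are known to be enabled, as in queue teardown here.

#include <linux/spinlock.h>

/* Hypothetical helpers, for illustration only. */
static void lock_from_any_context(spinlock_t *lock)
{
        unsigned long flags;

        spin_lock_irqsave(lock, flags);         /* save the current IRQ state */
        /* ... critical section ... */
        spin_unlock_irqrestore(lock, flags);    /* restore the saved state */
}

static void lock_from_process_context(spinlock_t *lock)
{
        spin_lock_irq(lock);            /* assumes IRQs are currently enabled */
        /* ... critical section ... */
        spin_unlock_irq(lock);          /* unconditionally re-enables IRQs */
}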
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-7-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index ca996214c10a..f98a29ee8f36 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -192,27 +192,6 @@ void exit_io_context(struct task_struct *task) } } -static void __ioc_clear_queue(struct list_head *icq_list) -{ - unsigned long flags; - - rcu_read_lock(); - while (!list_empty(icq_list)) { - struct io_cq *icq = list_entry(icq_list->next, - struct io_cq, q_node); - struct io_context *ioc = icq->ioc; - - spin_lock_irqsave(&ioc->lock, flags); - if (icq->flags & ICQ_DESTROYED) { - spin_unlock_irqrestore(&ioc->lock, flags); - continue; - } - ioc_destroy_icq(icq); - spin_unlock_irqrestore(&ioc->lock, flags); - } - rcu_read_unlock(); -} - /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared @@ -227,7 +206,17 @@ void ioc_clear_queue(struct request_queue *q) list_splice_init(&q->icq_list, &icq_list); spin_unlock_irq(&q->queue_lock); - __ioc_clear_queue(&icq_list); + rcu_read_lock(); + while (!list_empty(&icq_list)) { + struct io_cq *icq = + list_entry(icq_list.next, struct io_cq, q_node); + + spin_lock_irq(&icq->ioc->lock); + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + spin_unlock_irq(&icq->ioc->lock); + } + rcu_read_unlock(); } static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) From a411cd3cfdc5bbd1329d5b33dbf39e2b5213969d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:27 +0100 Subject: [PATCH 104/132] block: move set_task_ioprio to blk-ioc.c Keep set_task_ioprio with the other low-level code that accesses the io_context structure. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-8-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 34 ++++++++++++++++++++++++++++++++-- block/ioprio.c | 32 -------------------------------- include/linux/iocontext.h | 2 -- 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f98a29ee8f36..c25ce2f3eb19 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -8,6 +8,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/security.h> #include <linux/sched/task.h> #include "blk.h" @@ -280,8 +281,8 @@ static struct io_context *create_task_io_context(struct task_struct *task, * This function always goes through task_lock() and it's better to use * %current->io_context + get_io_context() for %current. 
*/ -struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node) +static struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node) { struct io_context *ioc; @@ -298,6 +299,35 @@ struct io_context *get_task_io_context(struct task_struct *task, return ioc; } +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; + + rcu_read_lock(); + tcred = __task_cred(task); + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + err = security_task_setioprio(task, ioprio); + if (err) + return err; + + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc->ioprio = ioprio; + put_io_context(ioc); + } + + return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + int __copy_io(unsigned long clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; diff --git a/block/ioprio.c b/block/ioprio.c index 313c14a70bbd..e118f4bf2dc6 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -22,46 +22,14 @@ */ #include <linux/gfp.h> #include <linux/kernel.h> -#include <linux/export.h> #include <linux/ioprio.h> #include <linux/cred.h> #include <linux/blkdev.h> #include <linux/capability.h> -#include <linux/sched/user.h> -#include <linux/sched/task.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/pid_namespace.h> -int set_task_ioprio(struct task_struct *task, int ioprio) -{ - int err; - struct io_context *ioc; - const struct cred *cred = current_cred(), *tcred; - - rcu_read_lock(); - tcred = __task_cred(task); - if (!uid_eq(tcred->uid, cred->euid) && - !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); - - err = security_task_setioprio(task, ioprio); - if (err) - return err; - - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } - - return err; -} -EXPORT_SYMBOL_GPL(set_task_ioprio); - int ioprio_check_cap(int ioprio) { int class = IOPRIO_PRIO_CLASS(ioprio); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 82c7f4f5f4f5..648331f35fc6 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -116,8 +116,6 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node); int __copy_io(unsigned long clone_flags, struct task_struct *tsk); static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) { From 8472161b77c41d260c5ba0af6bf940269b297bb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:28 +0100 Subject: [PATCH 105/132] block: fold get_task_io_context into set_task_ioprio Fold get_task_io_context into its only caller, and simplify the code as no reference to the I/O context is required to just set the ioprio field. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-9-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 52 +++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index c25ce2f3eb19..1ba7cfedca2d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -268,41 +268,9 @@ static struct io_context *create_task_io_context(struct task_struct *task, return ioc; } -/** - * get_task_io_context - get io_context of a task - * @task: task of interest - * @gfp_flags: allocation flags, used if allocation is necessary - * @node: allocation node, used if allocation is necessary - * - * Return io_context of @task. If it doesn't exist, it is created with - * @gfp_flags and @node. The returned io_context has its reference count - * incremented. - * - * This function always goes through task_lock() and it's better to use - * %current->io_context + get_io_context() for %current. - */ -static struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node) -{ - struct io_context *ioc; - - might_sleep_if(gfpflags_allow_blocking(gfp_flags)); - - task_lock(task); - ioc = task->io_context; - if (unlikely(!ioc)) { - task_unlock(task); - return create_task_io_context(task, gfp_flags, node); - } - get_io_context(ioc); - task_unlock(task); - return ioc; -} - int set_task_ioprio(struct task_struct *task, int ioprio) { int err; - struct io_context *ioc; const struct cred *cred = current_cred(), *tcred; rcu_read_lock(); @@ -318,13 +286,21 @@ int set_task_ioprio(struct task_struct *task, int ioprio) if (err) return err; - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } + task_lock(task); + if (unlikely(!task->io_context)) { + struct io_context *ioc; - return err; + task_unlock(task); + ioc = create_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc->ioprio = ioprio; + put_io_context(ioc); + } + return 0; + } + task->io_context->ioprio = ioprio; + task_unlock(task); + return 0; } EXPORT_SYMBOL_GPL(set_task_ioprio); From 5fc11eebb4a98df5324a4de369bb5ab7f0007ff7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:29 +0100 Subject: [PATCH 106/132] block: open code create_task_io_context in set_task_ioprio The flow in set_task_ioprio can be simplified by simply open coding create_task_io_context, which removes a refcount roundtrip on the I/O context. 
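The open-coded flow boils down to a common idiom, sketched here as a hypothetical helper (names and error handling are illustrative; the PF_EXITING check added by a later patch in this series is omitted): allocate before taking task_lock(), then re-check under the lock and discard the allocation if another io_context was installed in the meantime.

/* Illustration only; alloc_io_context() and iocontext_cachep as in blk-ioc.c. */
static struct io_context *install_io_context(struct task_struct *task)
{
        struct io_context *ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE);

        if (!ioc)
                return NULL;

        task_lock(task);
        if (task->io_context) {
                kmem_cache_free(iocontext_cachep, ioc);    /* lost the race */
                ioc = task->io_context;
        } else {
                task->io_context = ioc;
        }
        task_unlock(task);
        return ioc;
}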
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-10-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1ba7cfedca2d..cff0e3bdae53 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -291,12 +291,18 @@ int set_task_ioprio(struct task_struct *task, int ioprio) struct io_context *ioc; task_unlock(task); - ioc = create_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); + + ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE); + if (!ioc) + return -ENOMEM; + + task_lock(task); + if (task->io_context || (task->flags & PF_EXITING)) { + kmem_cache_free(iocontext_cachep, ioc); + ioc = task->io_context; + } else { + task->io_context = ioc; } - return 0; } task->io_context->ioprio = ioprio; task_unlock(task); From 90b627f5426ce144cdd4ea585d1f7812359a1a6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:30 +0100 Subject: [PATCH 107/132] block: fold create_task_io_context into ioc_find_get_icq Fold create_task_io_context into the only remaining caller. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-11-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index cff0e3bdae53..dc7fb064fd5f 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -238,36 +238,6 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) return ioc; } -static struct io_context *create_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node) -{ - struct io_context *ioc; - - ioc = alloc_io_context(gfp_flags, node); - if (!ioc) - return NULL; - - /* - * Try to install. ioc shouldn't be installed if someone else - * already did or @task, which isn't %current, is exiting. Note - * that we need to allow ioc creation on exiting %current as exit - * path may issue IOs from e.g. exit_files(). The exit path is - * responsible for not issuing IO after exit_io_context(). - */ - task_lock(task); - if (!task->io_context && - (task == current || !(task->flags & PF_EXITING))) - task->io_context = ioc; - else - kmem_cache_free(iocontext_cachep, ioc); - - ioc = task->io_context; - if (ioc) - get_io_context(ioc); - task_unlock(task); - return ioc; -} - int set_task_ioprio(struct task_struct *task, int ioprio) { int err; @@ -426,9 +396,20 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) struct io_cq *icq = NULL; if (unlikely(!ioc)) { - ioc = create_task_io_context(current, GFP_ATOMIC, q->node); + ioc = alloc_io_context(GFP_ATOMIC, q->node); if (!ioc) return NULL; + + task_lock(current); + if (current->io_context) { + kmem_cache_free(iocontext_cachep, ioc); + ioc = current->io_context; + } else { + current->io_context = ioc; + } + + get_io_context(ioc); + task_unlock(current); } else { get_io_context(ioc); From 5ef1630586317e92c9ebd7b4ce48f393b7ff790f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 9 Dec 2021 07:31:31 +0100 Subject: [PATCH 108/132] block: only build the icq tracking code when needed Only bfq needs to code to track icq, so make it conditional. 
Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20211209063131.18537-12-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/Kconfig | 3 ++ block/Kconfig.iosched | 1 + block/blk-ioc.c | 68 +++++++++++++++++++++++---------------- block/blk.h | 6 ++++ include/linux/iocontext.h | 6 ++-- 5 files changed, 55 insertions(+), 29 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index c6ce41a5e5b2..d5d4197b7ed2 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -35,6 +35,9 @@ config BLK_CGROUP_RWSTAT config BLK_DEV_BSG_COMMON tristate +config BLK_ICQ + bool + config BLK_DEV_BSGLIB bool "Block layer SG support v4 helper lib" select BLK_DEV_BSG_COMMON diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 885fee86dfca..615516146086 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -18,6 +18,7 @@ config MQ_IOSCHED_KYBER config IOSCHED_BFQ tristate "BFQ I/O scheduler" + select BLK_ICQ help BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of of the device among all processes according to their weights, diff --git a/block/blk-ioc.c b/block/blk-ioc.c index dc7fb064fd5f..87bdc9ca8295 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -19,6 +19,7 @@ */ static struct kmem_cache *iocontext_cachep; +#ifdef CONFIG_BLK_ICQ /** * get_io_context - increment reference count to io_context * @ioc: io_context to get @@ -162,6 +163,42 @@ static bool ioc_delay_free(struct io_context *ioc) return false; } +/** + * ioc_clear_queue - break any ioc association with the specified queue + * @q: request_queue being cleared + * + * Walk @q->icq_list and exit all io_cq's. + */ +void ioc_clear_queue(struct request_queue *q) +{ + LIST_HEAD(icq_list); + + spin_lock_irq(&q->queue_lock); + list_splice_init(&q->icq_list, &icq_list); + spin_unlock_irq(&q->queue_lock); + + rcu_read_lock(); + while (!list_empty(&icq_list)) { + struct io_cq *icq = + list_entry(icq_list.next, struct io_cq, q_node); + + spin_lock_irq(&icq->ioc->lock); + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + spin_unlock_irq(&icq->ioc->lock); + } + rcu_read_unlock(); +} +#else /* CONFIG_BLK_ICQ */ +static inline void ioc_exit_icqs(struct io_context *ioc) +{ +} +static inline bool ioc_delay_free(struct io_context *ioc) +{ + return false; +} +#endif /* CONFIG_BLK_ICQ */ + /** * put_io_context - put a reference of io_context * @ioc: io_context to put @@ -193,33 +230,6 @@ void exit_io_context(struct task_struct *task) } } -/** - * ioc_clear_queue - break any ioc association with the specified queue - * @q: request_queue being cleared - * - * Walk @q->icq_list and exit all io_cq's. 
- */ -void ioc_clear_queue(struct request_queue *q) -{ - LIST_HEAD(icq_list); - - spin_lock_irq(&q->queue_lock); - list_splice_init(&q->icq_list, &icq_list); - spin_unlock_irq(&q->queue_lock); - - rcu_read_lock(); - while (!list_empty(&icq_list)) { - struct io_cq *icq = - list_entry(icq_list.next, struct io_cq, q_node); - - spin_lock_irq(&icq->ioc->lock); - if (!(icq->flags & ICQ_DESTROYED)) - ioc_destroy_icq(icq); - spin_unlock_irq(&icq->ioc->lock); - } - rcu_read_unlock(); -} - static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; @@ -231,10 +241,12 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) atomic_long_set(&ioc->refcount, 1); atomic_set(&ioc->active_ref, 1); +#ifdef CONFIG_BLK_ICQ spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); +#endif return ioc; } @@ -300,6 +312,7 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) return 0; } +#ifdef CONFIG_BLK_ICQ /** * ioc_lookup_icq - lookup io_cq from ioc * @q: the associated request_queue @@ -428,6 +441,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) return icq; } EXPORT_SYMBOL_GPL(ioc_find_get_icq); +#endif /* CONFIG_BLK_ICQ */ static int __init blk_ioc_init(void) { diff --git a/block/blk.h b/block/blk.h index 7ccb7c7d86b3..8bd43b3ad33d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -366,7 +366,13 @@ static inline unsigned int bio_aligned_discard_max_sectors( */ struct io_cq *ioc_find_get_icq(struct request_queue *q); struct io_cq *ioc_lookup_icq(struct request_queue *q); +#ifdef CONFIG_BLK_ICQ void ioc_clear_queue(struct request_queue *q); +#else +static inline void ioc_clear_queue(struct request_queue *q) +{ +} +#endif /* CONFIG_BLK_ICQ */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 648331f35fc6..14f7eaf1b443 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -100,16 +100,18 @@ struct io_context { atomic_long_t refcount; atomic_t active_ref; + unsigned short ioprio; + +#ifdef CONFIG_BLK_ICQ /* all the fields below are protected by this lock */ spinlock_t lock; - unsigned short ioprio; - struct radix_tree_root icq_tree; struct io_cq __rcu *icq_hint; struct hlist_head icq_list; struct work_struct release_work; +#endif /* CONFIG_BLK_ICQ */ }; struct task_struct; From 361c81dbc58c8aa230e1f2d556045fa7bc3eb4a3 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa <wander@redhat.com> Date: Mon, 20 Dec 2021 16:28:27 -0300 Subject: [PATCH 109/132] blktrace: switch trace spinlock to a raw spinlock The running_trace_lock protects running_trace_list and is acquired within the tracepoint which implies disabled preemption. The spinlock_t typed lock can not be acquired with disabled preemption on PREEMPT_RT because it becomes a sleeping lock. The runtime of the tracepoint depends on the number of entries in running_trace_list and has no limit. The blk-tracer is considered debug code and higher latencies here are okay. Make running_trace_lock a raw_spinlock_t. 
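For context, a hypothetical probe showing the constraint (the lock and function are invented): on PREEMPT_RT a spinlock_t is backed by a sleeping rtmutex and therefore must not be taken where preemption is already disabled, such as inside a tracepoint handler; a raw_spinlock_t keeps the non-sleeping behaviour on every configuration.

#include <linux/spinlock.h>

/* A spinlock_t here would turn into a sleeping rtmutex on PREEMPT_RT. */
static DEFINE_RAW_SPINLOCK(my_trace_lock);      /* hypothetical lock */

static void my_tracepoint_probe(void)           /* runs with preemption disabled */
{
        unsigned long flags;

        raw_spin_lock_irqsave(&my_trace_lock, flags);
        /* short, bounded work; the extra latency is acceptable for debug code */
        raw_spin_unlock_irqrestore(&my_trace_lock, flags);
}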
Signed-off-by: Wander Lairson Costa <wander@redhat.com> Link: https://lore.kernel.org/r/20211220192827.38297-1-wander@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- kernel/trace/blktrace.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 431e41bc4c23..af68a67179b4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -34,7 +34,7 @@ static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; static LIST_HEAD(running_trace_list); -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -121,12 +121,12 @@ static void trace_note_tsk(struct task_struct *tsk) struct blk_trace *bt; tsk->btrace_seq = blktrace_seq; - spin_lock_irqsave(&running_trace_lock, flags); + raw_spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm), 0); } - spin_unlock_irqrestore(&running_trace_lock, flags); + raw_spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -666,9 +666,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_add(&bt->running_list, &running_trace_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -676,9 +676,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -1608,9 +1608,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); } From 518579a9af10a4b7b952a8366566fdcc7cfce3ca Mon Sep 17 00:00:00 2001 From: Keith Busch <kbusch@kernel.org> Date: Mon, 20 Dec 2021 12:59:19 -0800 Subject: [PATCH 110/132] blk-mq: blk-mq: check quiesce state before queue_rqs The low level drivers don't expect to see new requests after a successful quiesce completes. Check the queue quiesce state within the rcu protected area prior to calling the driver's queue_rqs(). 
Fixes: 3c67d44de787 ("block: add mq_ops->queue_rqs hook") Signed-off-by: Keith Busch <kbusch@kernel.org> Link: https://lore.kernel.org/r/20211220205919.180191-1-kbusch@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-mq.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 51991232824a..0d7c9d3e0329 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2549,6 +2549,14 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) blk_mq_commit_rqs(hctx, &queued, from_schedule); } +static void __blk_mq_flush_plug_list(struct request_queue *q, + struct blk_plug *plug) +{ + if (blk_queue_quiesced(q)) + return; + q->mq_ops->queue_rqs(&plug->mq_list); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct blk_mq_hw_ctx *this_hctx; @@ -2580,7 +2588,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) if (q->mq_ops->queue_rqs && !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { blk_mq_run_dispatch_ops(q, - q->mq_ops->queue_rqs(&plug->mq_list)); + __blk_mq_flush_plug_list(q, plug)); if (rq_list_empty(plug->mq_list)) return; } From a957b61254a7d59a6c14ee2ac2db20a62eb299a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe@kernel.dk> Date: Mon, 20 Dec 2021 20:32:24 -0700 Subject: [PATCH 111/132] block: fix error in handling dead task for ioprio setting Don't combine the task exiting and "already have io_context" case, we need to just abort if the task is marked as dead. Return -ESRCH, which is the documented value for ioprio_set() if the specified task could not be found. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Reported-by: syzbot+8836466a79f4175961b0@syzkaller.appspotmail.com Fixes: 5fc11eebb4a9 ("block: open code create_task_io_context in set_task_ioprio") Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 87bdc9ca8295..71c3a933cf16 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -279,7 +279,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio) return -ENOMEM; task_lock(task); - if (task->io_context || (task->flags & PF_EXITING)) { + if (task->flags & PF_EXITING) { + err = -ESRCH; + kmem_cache_free(iocontext_cachep, ioc); + goto out; + } + if (task->io_context) { kmem_cache_free(iocontext_cachep, ioc); ioc = task->io_context; } else { @@ -287,8 +292,9 @@ int set_task_ioprio(struct task_struct *task, int ioprio) } } task->io_context->ioprio = ioprio; +out: task_unlock(task); - return 0; + return err; } EXPORT_SYMBOL_GPL(set_task_ioprio); From 37e11c3616f6182b6bd7f95a04df035b43464f39 Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Tue, 21 Dec 2021 12:04:36 +0800 Subject: [PATCH 112/132] block: call blk_exit_queue() before freeing q->stats blk_stat_disable_accounting() is added in commit 68497092bde9 ("block: make queue stat accounting a reference"), and called in kyber_exit_sched(). So we have to free q->stats after elevator is unloaded from blk_exit_queue() in blk_release_queue(). Otherwise kernel panic is caused. 
Fixes: 68497092bde9 ("block: make queue stat accounting a reference") Signed-off-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20211221040436.1333880-1-ming.lei@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3e6357321225..e20eadfcf5c8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -791,11 +791,11 @@ static void blk_release_queue(struct kobject *kobj) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); + blk_exit_queue(q); + blk_free_queue_stats(q->stats); kfree(q->poll_stat); - blk_exit_queue(q); - blk_queue_free_zone_bitmaps(q); if (queue_is_mq(q)) From 99d8690aae4b2f0d1d90075de355ac087f820a66 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 21 Dec 2021 17:18:51 +0100 Subject: [PATCH 113/132] block: fix error unwinding in device_add_disk One device_add is called disk->ev will be freed by disk_release, so we should free it twice. Fix this by allocating disk->ev after device_add so that the extra local unwinding can be removed entirely. Based on an earlier patch from Tetsuo Handa. Reported-by: syzbot <syzbot+28a66a9fbc621c939000@syzkaller.appspotmail.com> Tested-by: syzbot <syzbot+28a66a9fbc621c939000@syzkaller.appspotmail.com> Fixes: 83cbce9574462c6b ("block: add error handling for device_add_disk / add_disk") Signed-off-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211221161851.788424-1-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/genhd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 3c139a1b6f04..603db5d6f10c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -442,10 +442,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, disk->first_minor = ret; } - ret = disk_alloc_events(disk); - if (ret) - goto out_free_ext_minor; - /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); @@ -456,7 +452,12 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); if (ret) - goto out_disk_release_events; + goto out_free_ext_minor; + + ret = disk_alloc_events(disk); + if (ret) + goto out_device_del; + if (!sysfs_deprecated) { ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); @@ -538,8 +539,6 @@ out_del_block_link: sysfs_remove_link(block_depr, dev_name(ddev)); out_device_del: device_del(ddev); -out_disk_release_events: - disk_release_events(disk); out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); From 37ae5a0f5287a52cf51242e76ccf198d02ffe495 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> Date: Sat, 18 Dec 2021 18:41:56 +0900 Subject: [PATCH 114/132] block: use "unsigned long" for blk_validate_block_size(). Since lo_simple_ioctl(LOOP_SET_BLOCK_SIZE) and ioctl(NBD_SET_BLKSIZE) pass user-controlled "unsigned long arg" to blk_validate_block_size(), "unsigned long" should be used for validation. 
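A hypothetical 64-bit example of the truncation being closed (the argument value is invented): with the old unsigned int prototype, a user-controlled value wider than 32 bits is silently chopped before the range and power-of-two checks run, so an invalid size can be accepted.

#include <linux/blkdev.h>

static int example_validate(void)       /* hypothetical; 64-bit kernel assumed */
{
        unsigned long arg = (1UL << 32) | 512;  /* user-controlled ioctl argument */

        /*
         * With "unsigned int bsize" the argument was truncated to 512 here
         * and wrongly accepted; with "unsigned long bsize" the full value
         * 0x100000200 exceeds PAGE_SIZE and -EINVAL is returned.
         */
        return blk_validate_block_size(arg);
}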
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/9ecbf057-4375-c2db-ab53-e4cc0dff953d@i-love.sakura.ne.jp Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c80cfaefc0a8..bb5fb7282e6e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -45,7 +45,7 @@ struct blk_crypto_profile; */ #define BLKCG_MAX_POLS 6 -static inline int blk_validate_block_size(unsigned int bsize) +static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) return -EINVAL; From e338924bd05d6e71574bc13e310c89e10e49a8a5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> Date: Fri, 17 Dec 2021 23:51:25 +0900 Subject: [PATCH 115/132] block: check minor range in device_add_disk() ioctl(fd, LOOP_CTL_ADD, 1048576) causes sysfs: cannot create duplicate filename '/dev/block/7:0' message because such request is treated as if ioctl(fd, LOOP_CTL_ADD, 0) due to MINORMASK == 1048575. Verify that all minor numbers for that device fit in the minor range. Reported-by: wangyangbo <wangyangbo@uniontech.com> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/b1b19379-23ee-5379-0eb5-94bf5f79f1b4@i-love.sakura.ne.jp Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/genhd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index 603db5d6f10c..626c8406f21a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -431,6 +431,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, DISK_MAX_PARTS); disk->minors = DISK_MAX_PARTS; } + if (disk->first_minor + disk->minors > MINORMASK + 1) + return -EINVAL; } else { if (WARN_ON(disk->minors)) return -EINVAL; From 6fd3c510ee4b37f2f9fe3d3cafbfa459e15c5e11 Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Wed, 22 Dec 2021 13:15:32 -0800 Subject: [PATCH 116/132] bio.h: fix kernel-doc warnings Fix all kernel-doc warnings in <linux/bio.h>: include/linux/bio.h:136: warning: Function parameter or member 'nbytes' not described in 'bio_advance' include/linux/bio.h:136: warning: Excess function parameter 'bytes' description in 'bio_advance' include/linux/bio.h:391: warning: No description found for return value of 'bio_next_split' Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: linux-block@vger.kernel.org Link: https://lore.kernel.org/r/20211222211532.24060-1-rdunlap@infradead.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/bio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index fe6bdfbbef66..0a41efe02208 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -124,7 +124,7 @@ void __bio_advance(struct bio *, unsigned bytes); /** * bio_advance - increment/complete a bio by some number of bytes * @bio: bio to advance - * @bytes: number of bytes to complete + * @nbytes: number of bytes to complete * * This updates bi_sector, bi_size and bi_idx; if the number of bytes to * complete doesn't align with a bvec boundary, then bv_len and bv_offset will @@ -332,7 +332,7 @@ extern struct bio *bio_split(struct bio *bio, int sectors, * 
@gfp: gfp mask * @bs: bio set to allocate from * - * Returns a bio representing the next @sectors of @bio - if the bio is smaller + * Return: a bio representing the next @sectors of @bio - if the bio is smaller * than @sectors, returns the original bio unchanged. */ static inline struct bio *bio_next_split(struct bio *bio, int sectors, From a16c7246368db8935652c805bc446928d0e1c0aa Mon Sep 17 00:00:00 2001 From: Keith Busch <kbusch@kernel.org> Date: Wed, 22 Dec 2021 13:52:39 -0800 Subject: [PATCH 117/132] block: remove unnecessary trailing '\' While harmless, the blank line is certainly not intended to be part of the rq_list_for_each() macro. Remove it. Signed-off-by: Keith Busch <kbusch@kernel.org> Link: https://lore.kernel.org/r/20211222215239.1768164-1-kbusch@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bb5fb7282e6e..22746b2d6825 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1363,7 +1363,7 @@ struct io_comp_batch { }) #define rq_list_for_each(listptr, pos) \ - for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \ + for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) From 669a064625fa3a06ddf8a4ac1f35b7436b99f133 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn <lukas.bulwahn@gmail.com> Date: Thu, 23 Dec 2021 13:53:00 +0100 Subject: [PATCH 118/132] block: drop needless assignment in set_task_ioprio() Commit 5fc11eebb4a9 ("block: open code create_task_io_context in set_task_ioprio") introduces a needless assignment 'ioc = task->io_context', as the local variable ioc is not further used before returning. Even after the further fix, commit a957b61254a7 ("block: fix error in handling dead task for ioprio setting"), the assignment still remains needless. Drop this needless assignment in set_task_ioprio(). This code smell was identified with 'make clang-analyzer'. Fixes: 5fc11eebb4a9 ("block: open code create_task_io_context in set_task_ioprio") Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20211223125300.20691-1-lukas.bulwahn@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-ioc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 71c3a933cf16..11f49f78db32 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -284,12 +284,10 @@ int set_task_ioprio(struct task_struct *task, int ioprio) kmem_cache_free(iocontext_cachep, ioc); goto out; } - if (task->io_context) { + if (task->io_context) kmem_cache_free(iocontext_cachep, ioc); - ioc = task->io_context; - } else { + else task->io_context = ioc; - } } task->io_context->ioprio = ioprio; out: From edce22e19bfa86efa2522d041d6367f2f099e8ed Mon Sep 17 00:00:00 2001 From: Keith Busch <kbusch@kernel.org> Date: Wed, 5 Jan 2022 09:05:15 -0800 Subject: [PATCH 119/132] block: move rq_list macros to blk-mq.h Move the request list macros to the header file that defines that struct they operate on. 
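For reference, a hypothetical caller of the macros being moved (rq_a, rq_b and my_issue() are invented): rq_list_add() pushes at the head of a NULL-terminated singly linked list, so the list drains most-recently-added first.

#include <linux/blk-mq.h>

void my_issue(struct request *rq);      /* hypothetical driver hook */

static void example_drain(struct request *rq_a, struct request *rq_b)
{
        struct request *rqlist = NULL;

        rq_list_add(&rqlist, rq_a);     /* list: rq_a */
        rq_list_add(&rqlist, rq_b);     /* list: rq_b -> rq_a */

        while (!rq_list_empty(rqlist))
                my_issue(rq_list_pop(&rqlist));
}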
Signed-off-by: Keith Busch <kbusch@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20220105170518.3181469-2-kbusch@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- fs/io_uring.c | 2 +- include/linux/blk-mq.h | 29 +++++++++++++++++++++++++++++ include/linux/blkdev.h | 29 ----------------------------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c4f217613f56..42bbbd34f45e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -57,7 +57,7 @@ #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/bvec.h> #include <linux/net.h> #include <net/sock.h> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 550996cf419c..bf64b94cd64e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -216,6 +216,35 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) +#define rq_list_add(listptr, rq) do { \ + (rq)->rq_next = *(listptr); \ + *(listptr) = rq; \ +} while (0) + +#define rq_list_pop(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) { \ + __req = *(listptr); \ + *(listptr) = __req->rq_next; \ + } \ + __req; \ +}) + +#define rq_list_peek(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) \ + __req = *(listptr); \ + __req; \ +}) + +#define rq_list_for_each(listptr, pos) \ + for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) + +#define rq_list_next(rq) (rq)->rq_next +#define rq_list_empty(list) ((list) == (struct request *) NULL) + enum blk_eh_timer_return { BLK_EH_DONE, /* drivers has completed the command */ BLK_EH_RESET_TIMER, /* reset timer and try again */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 22746b2d6825..9c95df26fc26 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1339,33 +1339,4 @@ struct io_comp_batch { #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } -#define rq_list_add(listptr, rq) do { \ - (rq)->rq_next = *(listptr); \ - *(listptr) = rq; \ -} while (0) - -#define rq_list_pop(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) { \ - __req = *(listptr); \ - *(listptr) = __req->rq_next; \ - } \ - __req; \ -}) - -#define rq_list_peek(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) \ - __req = *(listptr); \ - __req; \ -}) - -#define rq_list_for_each(listptr, pos) \ - for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) - -#define rq_list_next(rq) (rq)->rq_next -#define rq_list_empty(list) ((list) == (struct request *) NULL) - #endif /* _LINUX_BLKDEV_H */ From 3764fd05e1f89530e2ee5cbff0b638f2b1141b90 Mon Sep 17 00:00:00 2001 From: Keith Busch <kbusch@kernel.org> Date: Wed, 5 Jan 2022 09:05:16 -0800 Subject: [PATCH 120/132] block: introduce rq_list_for_each_safe macro While iterating a list, a particular request may need to be removed for special handling. Provide an iterator that can safely handle that. 
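A minimal usage sketch (the completion helper is hypothetical): because the next pointer is sampled before the loop body runs, the current request may be completed, freed, or re-linked onto another list without breaking the walk. Note that the iterator reads the first next pointer unconditionally, so the list must not be empty when it is entered.

#include <linux/blk-mq.h>

void my_complete(struct request *rq);   /* hypothetical; may free or re-link rq */

static void complete_all(struct request **rqlist)
{
        struct request *req, *next;

        if (rq_list_empty(*rqlist))     /* the iterator expects a non-empty list */
                return;

        rq_list_for_each_safe(rqlist, req, next)
                my_complete(req);

        *rqlist = NULL;                 /* the whole list was consumed */
}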
Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Keith Busch <kbusch@kernel.org> Link: https://lore.kernel.org/r/20220105170518.3181469-3-kbusch@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/blk-mq.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index bf64b94cd64e..1467f0fa2142 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -242,6 +242,10 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_list_for_each(listptr, pos) \ for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) +#define rq_list_for_each_safe(listptr, pos, nxt) \ + for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos); \ + pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL) + #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) From d2528be7a8b09af9796a270debd14101a72bb552 Mon Sep 17 00:00:00 2001 From: Keith Busch <kbusch@kernel.org> Date: Wed, 5 Jan 2022 09:05:17 -0800 Subject: [PATCH 121/132] block: introduce rq_list_move When iterating a list, a particular request may need to be moved for special handling. Provide a helper function to achieve that so drivers don't need to reimplement rqlist manipulation. Signed-off-by: Keith Busch <kbusch@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20220105170518.3181469-4-kbusch@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/blk-mq.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1467f0fa2142..f40a05ecca4a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -249,6 +249,23 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) +/** + * rq_list_move() - move a struct request from one list to another + * @src: The source list @rq is currently in + * @dst: The destination list that @rq will be appended to + * @rq: The request to move + * @prev: The request preceding @rq in @src (NULL if @rq is the head) + */ +static void inline rq_list_move(struct request **src, struct request **dst, + struct request *rq, struct request *prev) +{ + if (prev) + prev->rq_next = rq->rq_next; + else + *src = rq->rq_next; + rq_list_add(dst, rq); +} + enum blk_eh_timer_return { BLK_EH_DONE, /* drivers has completed the command */ BLK_EH_RESET_TIMER, /* reset timer and try again */ From 6bfec7992ec79b63fb07330ae97f3fb43120aa37 Mon Sep 17 00:00:00 2001 From: Keith Busch <kbusch@kernel.org> Date: Wed, 5 Jan 2022 09:05:18 -0800 Subject: [PATCH 122/132] nvme-pci: fix queue_rqs list splitting If command prep fails, current handling will orphan subsequent requests in the list. Consider a simple example: rqlist = [ 1 -> 2 ] When prep for request '1' fails, it will be appended to the 'requeue_list', leaving request '2' disconnected from the original rqlist and no longer tracked. Meanwhile, rqlist is still pointing to the failed request '1' and will attempt to submit the unprepped command. Fix this by updating the rqlist accordingly using the request list helper functions. 
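The generic shape of the fix, reduced to a hypothetical single-queue driver (the prep and submit hooks are invented, and the per-hardware-queue batching of the real driver is omitted): keeping prev lets a failing request be unlinked with rq_list_move() so the requests behind it stay reachable, instead of being orphaned as in the [1 -> 2] example above.

#include <linux/blk-mq.h>

bool my_prep(struct request *req);              /* hypothetical prep step */
void my_submit(struct request **rqlist);        /* hypothetical: issues and drains the list */

static void my_queue_rqs(struct request **rqlist)
{
        struct request *req, *next, *prev = NULL;
        struct request *requeue_list = NULL;

        /* the block layer only calls ->queue_rqs() with a non-empty list */
        rq_list_for_each_safe(rqlist, req, next) {
                if (!my_prep(req)) {
                        /* unlink req; everything after it stays on rqlist */
                        rq_list_move(rqlist, &requeue_list, req, prev);
                } else {
                        prev = req;
                }
        }

        my_submit(rqlist);              /* issue the successfully prepped requests */
        *rqlist = requeue_list;         /* hand the rest back for individual handling */
}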
Fixes: d62cbcf62f2f ("nvme: add support for mq_ops->queue_rqs()") Signed-off-by: Keith Busch <kbusch@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Link: https://lore.kernel.org/r/20220105170518.3181469-5-kbusch@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/nvme/host/pci.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 50deb8b69c40..d8585df2c2fd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -999,30 +999,30 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req) static void nvme_queue_rqs(struct request **rqlist) { - struct request *req = rq_list_peek(rqlist), *prev = NULL; + struct request *req, *next, *prev = NULL; struct request *requeue_list = NULL; - do { + rq_list_for_each_safe(rqlist, req, next) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; if (!nvme_prep_rq_batch(nvmeq, req)) { /* detach 'req' and add to remainder list */ - if (prev) - prev->rq_next = req->rq_next; - rq_list_add(&requeue_list, req); - } else { - prev = req; + rq_list_move(rqlist, &requeue_list, req, prev); + + req = prev; + if (!req) + continue; } - req = rq_list_next(req); - if (!req || (prev && req->mq_hctx != prev->mq_hctx)) { + if (!next || req->mq_hctx != next->mq_hctx) { /* detach rest of list, and submit */ - if (prev) - prev->rq_next = NULL; + req->rq_next = NULL; nvme_submit_cmds(nvmeq, rqlist); - *rqlist = req; - } - } while (req); + *rqlist = next; + prev = NULL; + } else + prev = req; + } *rqlist = requeue_list; } From 292c33c95defd0b814fec1fc8cd60d16556cf7b8 Mon Sep 17 00:00:00 2001 From: Yang Li <yang.lee@linux.alibaba.com> Date: Fri, 7 Jan 2022 08:52:28 +0800 Subject: [PATCH 123/132] block: fix old-style declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the 'inline' keyword to the front of 'void'. Remove a warning found by clang(make W=1 LLVM=1) ./include/linux/blk-mq.h:259:1: warning: ‘inline’ is not at beginning of declaration Reported-by: Abaci Robot <abaci@linux.alibaba.com> Signed-off-by: Yang Li <yang.lee@linux.alibaba.com> Link: https://lore.kernel.org/r/20220107005228.103927-1-yang.lee@linux.alibaba.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f40a05ecca4a..d319ffa59354 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -256,7 +256,7 @@ static inline unsigned short req_get_ioprio(struct request *req) * @rq: The request to move * @prev: The request preceding @rq in @src (NULL if @rq is the head) */ -static void inline rq_list_move(struct request **src, struct request **dst, +static inline void rq_list_move(struct request **src, struct request **dst, struct request *rq, struct request *prev) { if (prev) From 9d497e2941c30a060ba62d5485b3bc9d91ffb09e Mon Sep 17 00:00:00 2001 From: Ming Lei <ming.lei@redhat.com> Date: Tue, 4 Jan 2022 21:42:23 +0800 Subject: [PATCH 124/132] block: don't protect submit_bio_checks by q_usage_counter Commit cc9c884dd7f4 ("block: call submit_bio_checks under q_usage_counter") uses q_usage_counter to protect submit_bio_checks for avoiding IO after disk is deleted by del_gendisk(). 
Turns out the protection isn't necessary, because once blk_mq_freeze_queue_wait() in del_gendisk() returns: 1) all in-flight IO has been done 2) all new IO will be failed in __bio_queue_enter() because q_usage_counter is dead, and GD_DEAD is set 3) both disk and request queue instance are safe since caller of submit_bio() guarantees that the disk can't be closed. Once submit_bio_checks() needn't the protection of q_usage_counter, we can move submit_bio_checks before calling blk_mq_submit_bio() and ->submit_bio(). With this change, we needn't to throttle queue with holding one allocated request, then precise driver tag or request won't be wasted in throttling. Meantime we can unify the bio check for both bio based and request based driver. Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20220104134223.590803-1-ming.lei@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk> --- block/blk-core.c | 14 +++++++++----- block/blk-mq.c | 39 +++++++++++++-------------------------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 10619fd83c1b..97f8bc8d3a79 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -787,17 +787,21 @@ end_io: static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) { - if (unlikely(bio_queue_enter(bio) != 0)) - return; - if (submit_bio_checks(bio) && blk_crypto_bio_prep(&bio)) - disk->fops->submit_bio(bio); - blk_queue_exit(disk->queue); + if (blk_crypto_bio_prep(&bio)) { + if (likely(bio_queue_enter(bio) == 0)) { + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); + } + } } static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; + if (unlikely(!submit_bio_checks(bio))) + return; + if (!disk->fops->submit_bio) blk_mq_submit_bio(bio); else diff --git a/block/blk-mq.c b/block/blk-mq.c index 0d7c9d3e0329..a6d4780580fc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2714,26 +2714,18 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio, - unsigned int nsegs) + struct bio *bio) { struct blk_mq_alloc_data data = { .q = q, .nr_tags = 1, + .cmd_flags = bio->bi_opf, }; struct request *rq; if (unlikely(bio_queue_enter(bio))) return NULL; - if (unlikely(!submit_bio_checks(bio))) - goto queue_exit; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - goto queue_exit; - rq_qos_throttle(q, bio); - - /* ->bi_opf is finalized after submit_bio_checks() returns */ - data.cmd_flags = bio->bi_opf; if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; @@ -2746,13 +2738,12 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); -queue_exit: blk_queue_exit(q); return NULL; } static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio **bio, unsigned int nsegs) + struct blk_plug *plug, struct bio *bio) { struct request *rq; @@ -2762,21 +2753,14 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!rq || rq->q != q) return NULL; - if (unlikely(!submit_bio_checks(*bio))) + if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) return NULL; - if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { - *bio = NULL; - return NULL; - } - if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) - return NULL; - 
if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) return NULL; - rq->cmd_flags = (*bio)->bi_opf; + rq->cmd_flags = bio->bi_opf; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); - rq_qos_throttle(q, *bio); return rq; } @@ -2812,11 +2796,14 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) + return; + + rq_qos_throttle(q, bio); + + rq = blk_mq_get_cached_request(q, plug, bio); if (!rq) { - if (!bio) - return; - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + rq = blk_mq_get_new_requests(q, plug, bio); if (unlikely(!rq)) return; } From ae7a7a53498f452eb927cd4b4eed0bccded85ebf Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Wed, 8 Dec 2021 16:38:26 -0800 Subject: [PATCH 125/132] docs: sysfs-block: move to stable directory The block layer sysfs ABI is widely used by userspace software and is considered stable. Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Reviewed-by: Bart Van Assche <bvanassche@acm.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20211209003833.6396-2-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Documentation/ABI/{testing => stable}/sysfs-block | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Documentation/ABI/{testing => stable}/sysfs-block (100%) diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/stable/sysfs-block similarity index 100% rename from Documentation/ABI/testing/sysfs-block rename to Documentation/ABI/stable/sysfs-block From 07c9093c429361dd405499b1e433e4170b81551f Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Wed, 8 Dec 2021 16:38:27 -0800 Subject: [PATCH 126/132] docs: sysfs-block: sort alphabetically Sort the documentation for the files alphabetically by file path so that there is a logical order and it's clear where to add new files. With two small exceptions, this patch doesn't change the documentation itself and just reorders it: - In /sys/block/<disk>/<part>/stat, I replaced <part> with <partition> to be consistent with the other files. - The description for /sys/block/<disk>/<part>/stat referred to another file "above", which I reworded. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Reviewed-by: Bart Van Assche <bvanassche@acm.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20211209003833.6396-3-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Documentation/ABI/stable/sysfs-block | 657 ++++++++++++++------------- 1 file changed, 339 insertions(+), 318 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index b16b0c45a272..9febd53a5ebe 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -1,3 +1,342 @@ +What: /sys/block/<disk>/alignment_offset +Date: April 2009 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Storage devices may report a physical block size that is + bigger than the logical block size (for instance a drive + with 4KB physical sectors exposing 512-byte logical + blocks to the operating system). 
This parameter + indicates how many bytes the beginning of the device is + offset from the disk's natural alignment. + + +What: /sys/block/<disk>/discard_alignment +Date: May 2011 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Devices that support discard functionality may + internally allocate space in units that are bigger than + the exported logical block size. The discard_alignment + parameter indicates how many bytes the beginning of the + device is offset from the internal allocation unit's + natural alignment. + + +What: /sys/block/<disk>/diskseq +Date: February 2021 +Contact: Matteo Croce <mcroce@microsoft.com> +Description: + The /sys/block/<disk>/diskseq files reports the disk + sequence number, which is a monotonically increasing + number assigned to every drive. + Some devices, like the loop device, refresh such number + every time the backing file is changed. + The value type is 64 bit unsigned. + + +What: /sys/block/<disk>/inflight +Date: October 2009 +Contact: Jens Axboe <axboe@kernel.dk>, Nikanth Karthikesan <knikanth@suse.de> +Description: + Reports the number of I/O requests currently in progress + (pending / in flight) in a device driver. This can be less + than the number of requests queued in the block device queue. + The report contains 2 fields: one for read requests + and one for write requests. + The value type is unsigned int. + Cf. Documentation/block/stat.rst which contains a single value for + requests in flight. + This is related to nr_requests in Documentation/block/queue-sysfs.rst + and for SCSI device also its queue_depth. + + +What: /sys/block/<disk>/integrity/device_is_integrity_capable +Date: July 2014 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Indicates whether a storage device is capable of storing + integrity metadata. Set if the device is T10 PI-capable. + + +What: /sys/block/<disk>/integrity/format +Date: June 2008 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Metadata format for integrity capable block device. + E.g. T10-DIF-TYPE1-CRC. + + +What: /sys/block/<disk>/integrity/protection_interval_bytes +Date: July 2015 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Describes the number of data bytes which are protected + by one integrity tuple. Typically the device's logical + block size. + + +What: /sys/block/<disk>/integrity/read_verify +Date: June 2008 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Indicates whether the block layer should verify the + integrity of read requests serviced by devices that + support sending integrity metadata. + + +What: /sys/block/<disk>/integrity/tag_size +Date: June 2008 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Number of bytes of integrity tag space available per + 512 bytes of data. + + +What: /sys/block/<disk>/integrity/write_generate +Date: June 2008 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Indicates whether the block layer should automatically + generate checksums for write requests bound for + devices that support receiving integrity metadata. + + +What: /sys/block/<disk>/<partition>/alignment_offset +Date: April 2009 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Storage devices may report a physical block size that is + bigger than the logical block size (for instance a drive + with 4KB physical sectors exposing 512-byte logical + blocks to the operating system). 
This parameter + indicates how many bytes the beginning of the partition + is offset from the disk's natural alignment. + + +What: /sys/block/<disk>/<partition>/discard_alignment +Date: May 2011 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Devices that support discard functionality may + internally allocate space in units that are bigger than + the exported logical block size. The discard_alignment + parameter indicates how many bytes the beginning of the + partition is offset from the internal allocation unit's + natural alignment. + + +What: /sys/block/<disk>/<partition>/stat +Date: February 2008 +Contact: Jerome Marchand <jmarchan@redhat.com> +Description: + The /sys/block/<disk>/<partition>/stat files display the + I/O statistics of partition <partition>. The format is the + same as the format of /sys/block/<disk>/stat. + + +What: /sys/block/<disk>/queue/chunk_sectors +Date: September 2016 +Contact: Hannes Reinecke <hare@suse.com> +Description: + chunk_sectors has different meaning depending on the type + of the disk. For a RAID device (dm-raid), chunk_sectors + indicates the size in 512B sectors of the RAID volume + stripe segment. For a zoned block device, either + host-aware or host-managed, chunk_sectors indicates the + size in 512B sectors of the zones of the device, with + the eventual exception of the last zone of the device + which may be smaller. + + +What: /sys/block/<disk>/queue/discard_granularity +Date: May 2011 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Devices that support discard functionality may + internally allocate space using units that are bigger + than the logical block size. The discard_granularity + parameter indicates the size of the internal allocation + unit in bytes if reported by the device. Otherwise the + discard_granularity will be set to match the device's + physical block size. A discard_granularity of 0 means + that the device does not support discard functionality. + + +What: /sys/block/<disk>/queue/discard_max_bytes +Date: May 2011 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Devices that support discard functionality may have + internal limits on the number of bytes that can be + trimmed or unmapped in a single operation. Some storage + protocols also have inherent limits on the number of + blocks that can be described in a single command. The + discard_max_bytes parameter is set by the device driver + to the maximum number of bytes that can be discarded in + a single operation. Discard requests issued to the + device must not exceed this limit. A discard_max_bytes + value of 0 means that the device does not support + discard functionality. + + +What: /sys/block/<disk>/queue/discard_zeroes_data +Date: May 2011 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Will always return 0. Don't rely on any specific behavior + for discards, and don't read this file. + + +What: /sys/block/<disk>/queue/io_timeout +Date: November 2018 +Contact: Weiping Zhang <zhangweiping@didiglobal.com> +Description: + io_timeout is the request timeout in milliseconds. If a request + does not complete in this time then the block driver timeout + handler is invoked. That timeout handler can decide to retry + the request, to fail it or to start a device recovery strategy. + + +What: /sys/block/<disk>/queue/logical_block_size +Date: May 2009 +Contact: Martin K. 
Petersen <martin.petersen@oracle.com> +Description: + This is the smallest unit the storage device can + address. It is typically 512 bytes. + + +What: /sys/block/<disk>/queue/max_active_zones +Date: July 2020 +Contact: Niklas Cassel <niklas.cassel@wdc.com> +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, + is limited by this value. If this value is 0, there is no limit. + + +What: /sys/block/<disk>/queue/max_open_zones +Date: July 2020 +Contact: Niklas Cassel <niklas.cassel@wdc.com> +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, + is limited by this value. If this value is 0, there is no limit. + + +What: /sys/block/<disk>/queue/minimum_io_size +Date: April 2009 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Storage devices may report a granularity or preferred + minimum I/O size which is the smallest request the + device can perform without incurring a performance + penalty. For disk drives this is often the physical + block size. For RAID arrays it is often the stripe + chunk size. A properly aligned multiple of + minimum_io_size is the preferred request size for + workloads where a high number of I/O operations is + desired. + + +What: /sys/block/<disk>/queue/nomerges +Date: January 2010 +Contact: +Description: + Standard I/O elevator operations include attempts to + merge contiguous I/Os. For known random I/O loads these + attempts will always fail and result in extra cycles + being spent in the kernel. This allows one to turn off + this behavior on one of two ways: When set to 1, complex + merge checks are disabled, but the simple one-shot merges + with the previous I/O request are enabled. When set to 2, + all merge tries are disabled. The default value is 0 - + which enables all types of merge tries. + + +What: /sys/block/<disk>/queue/nr_zones +Date: November 2018 +Contact: Damien Le Moal <damien.lemoal@wdc.com> +Description: + nr_zones indicates the total number of zones of a zoned block + device ("host-aware" or "host-managed" zone model). For regular + block devices, the value is always 0. + + +What: /sys/block/<disk>/queue/optimal_io_size +Date: April 2009 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Storage devices may report an optimal I/O size, which is + the device's preferred unit for sustained I/O. This is + rarely reported for disk drives. For RAID arrays it is + usually the stripe width or the internal track size. A + properly aligned multiple of optimal_io_size is the + preferred request size for workloads where sustained + throughput is desired. If no optimal I/O size is + reported this file contains 0. + + +What: /sys/block/<disk>/queue/physical_block_size +Date: May 2009 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + This is the smallest unit a physical storage device can + write atomically. It is usually the same as the logical + block size but may be bigger. One example is SATA + drives with 4KB sectors that expose a 512-byte logical + block size to the operating system. For stacked block + devices the physical_block_size variable contains the + maximum physical_block_size of the component devices. + + +What: /sys/block/<disk>/queue/write_same_max_bytes +Date: January 2012 +Contact: Martin K. 
Petersen <martin.petersen@oracle.com> +Description: + Some devices support a write same operation in which a + single data block can be written to a range of several + contiguous blocks on storage. This can be used to wipe + areas on disk or to initialize drives in a RAID + configuration. write_same_max_bytes indicates how many + bytes can be written in a single write same command. If + write_same_max_bytes is 0, write same is not supported + by the device. + + +What: /sys/block/<disk>/queue/write_zeroes_max_bytes +Date: November 2016 +Contact: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> +Description: + Devices that support write zeroes operation in which a + single request can be issued to zero out the range of + contiguous blocks on storage without having any payload + in the request. This can be used to optimize writing zeroes + to the devices. write_zeroes_max_bytes indicates how many + bytes can be written in a single write zeroes command. If + write_zeroes_max_bytes is 0, write zeroes is not supported + by the device. + + +What: /sys/block/<disk>/queue/zoned +Date: September 2016 +Contact: Damien Le Moal <damien.lemoal@wdc.com> +Description: + zoned indicates if the device is a zoned block device + and the zone model of the device if it is indeed zoned. + The possible values indicated by zoned are "none" for + regular block devices and "host-aware" or "host-managed" + for zoned block devices. The characteristics of + host-aware and host-managed zoned block devices are + described in the ZBC (Zoned Block Commands) and ZAC + (Zoned Device ATA Command Set) standards. These standards + also define the "drive-managed" zone model. However, + since drive-managed zoned block devices do not support + zone commands, they will be treated as regular block + devices and zoned will report "none". + + What: /sys/block/<disk>/stat Date: February 2008 Contact: Jerome Marchand <jmarchan@redhat.com> @@ -26,321 +365,3 @@ Description: == ============================================== For more details refer Documentation/admin-guide/iostats.rst - - -What: /sys/block/<disk>/inflight -Date: October 2009 -Contact: Jens Axboe <axboe@kernel.dk>, Nikanth Karthikesan <knikanth@suse.de> -Description: - Reports the number of I/O requests currently in progress - (pending / in flight) in a device driver. This can be less - than the number of requests queued in the block device queue. - The report contains 2 fields: one for read requests - and one for write requests. - The value type is unsigned int. - Cf. Documentation/block/stat.rst which contains a single value for - requests in flight. - This is related to nr_requests in Documentation/block/queue-sysfs.rst - and for SCSI device also its queue_depth. - - -What: /sys/block/<disk>/diskseq -Date: February 2021 -Contact: Matteo Croce <mcroce@microsoft.com> -Description: - The /sys/block/<disk>/diskseq files reports the disk - sequence number, which is a monotonically increasing - number assigned to every drive. - Some devices, like the loop device, refresh such number - every time the backing file is changed. - The value type is 64 bit unsigned. - - -What: /sys/block/<disk>/<part>/stat -Date: February 2008 -Contact: Jerome Marchand <jmarchan@redhat.com> -Description: - The /sys/block/<disk>/<part>/stat files display the - I/O statistics of partition <part>. The format is the - same as the above-written /sys/block/<disk>/stat - format. - - -What: /sys/block/<disk>/integrity/format -Date: June 2008 -Contact: Martin K. 
Petersen <martin.petersen@oracle.com> -Description: - Metadata format for integrity capable block device. - E.g. T10-DIF-TYPE1-CRC. - - -What: /sys/block/<disk>/integrity/read_verify -Date: June 2008 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Indicates whether the block layer should verify the - integrity of read requests serviced by devices that - support sending integrity metadata. - - -What: /sys/block/<disk>/integrity/tag_size -Date: June 2008 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Number of bytes of integrity tag space available per - 512 bytes of data. - - -What: /sys/block/<disk>/integrity/device_is_integrity_capable -Date: July 2014 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Indicates whether a storage device is capable of storing - integrity metadata. Set if the device is T10 PI-capable. - -What: /sys/block/<disk>/integrity/protection_interval_bytes -Date: July 2015 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Describes the number of data bytes which are protected - by one integrity tuple. Typically the device's logical - block size. - -What: /sys/block/<disk>/integrity/write_generate -Date: June 2008 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Indicates whether the block layer should automatically - generate checksums for write requests bound for - devices that support receiving integrity metadata. - -What: /sys/block/<disk>/alignment_offset -Date: April 2009 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Storage devices may report a physical block size that is - bigger than the logical block size (for instance a drive - with 4KB physical sectors exposing 512-byte logical - blocks to the operating system). This parameter - indicates how many bytes the beginning of the device is - offset from the disk's natural alignment. - -What: /sys/block/<disk>/<partition>/alignment_offset -Date: April 2009 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Storage devices may report a physical block size that is - bigger than the logical block size (for instance a drive - with 4KB physical sectors exposing 512-byte logical - blocks to the operating system). This parameter - indicates how many bytes the beginning of the partition - is offset from the disk's natural alignment. - -What: /sys/block/<disk>/queue/logical_block_size -Date: May 2009 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - This is the smallest unit the storage device can - address. It is typically 512 bytes. - -What: /sys/block/<disk>/queue/physical_block_size -Date: May 2009 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - This is the smallest unit a physical storage device can - write atomically. It is usually the same as the logical - block size but may be bigger. One example is SATA - drives with 4KB sectors that expose a 512-byte logical - block size to the operating system. For stacked block - devices the physical_block_size variable contains the - maximum physical_block_size of the component devices. - -What: /sys/block/<disk>/queue/minimum_io_size -Date: April 2009 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Storage devices may report a granularity or preferred - minimum I/O size which is the smallest request the - device can perform without incurring a performance - penalty. For disk drives this is often the physical - block size. 
For RAID arrays it is often the stripe - chunk size. A properly aligned multiple of - minimum_io_size is the preferred request size for - workloads where a high number of I/O operations is - desired. - -What: /sys/block/<disk>/queue/optimal_io_size -Date: April 2009 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Storage devices may report an optimal I/O size, which is - the device's preferred unit for sustained I/O. This is - rarely reported for disk drives. For RAID arrays it is - usually the stripe width or the internal track size. A - properly aligned multiple of optimal_io_size is the - preferred request size for workloads where sustained - throughput is desired. If no optimal I/O size is - reported this file contains 0. - -What: /sys/block/<disk>/queue/nomerges -Date: January 2010 -Contact: -Description: - Standard I/O elevator operations include attempts to - merge contiguous I/Os. For known random I/O loads these - attempts will always fail and result in extra cycles - being spent in the kernel. This allows one to turn off - this behavior on one of two ways: When set to 1, complex - merge checks are disabled, but the simple one-shot merges - with the previous I/O request are enabled. When set to 2, - all merge tries are disabled. The default value is 0 - - which enables all types of merge tries. - -What: /sys/block/<disk>/discard_alignment -Date: May 2011 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Devices that support discard functionality may - internally allocate space in units that are bigger than - the exported logical block size. The discard_alignment - parameter indicates how many bytes the beginning of the - device is offset from the internal allocation unit's - natural alignment. - -What: /sys/block/<disk>/<partition>/discard_alignment -Date: May 2011 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Devices that support discard functionality may - internally allocate space in units that are bigger than - the exported logical block size. The discard_alignment - parameter indicates how many bytes the beginning of the - partition is offset from the internal allocation unit's - natural alignment. - -What: /sys/block/<disk>/queue/discard_granularity -Date: May 2011 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Devices that support discard functionality may - internally allocate space using units that are bigger - than the logical block size. The discard_granularity - parameter indicates the size of the internal allocation - unit in bytes if reported by the device. Otherwise the - discard_granularity will be set to match the device's - physical block size. A discard_granularity of 0 means - that the device does not support discard functionality. - -What: /sys/block/<disk>/queue/discard_max_bytes -Date: May 2011 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Devices that support discard functionality may have - internal limits on the number of bytes that can be - trimmed or unmapped in a single operation. Some storage - protocols also have inherent limits on the number of - blocks that can be described in a single command. The - discard_max_bytes parameter is set by the device driver - to the maximum number of bytes that can be discarded in - a single operation. Discard requests issued to the - device must not exceed this limit. A discard_max_bytes - value of 0 means that the device does not support - discard functionality. 
- -What: /sys/block/<disk>/queue/discard_zeroes_data -Date: May 2011 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Will always return 0. Don't rely on any specific behavior - for discards, and don't read this file. - -What: /sys/block/<disk>/queue/write_same_max_bytes -Date: January 2012 -Contact: Martin K. Petersen <martin.petersen@oracle.com> -Description: - Some devices support a write same operation in which a - single data block can be written to a range of several - contiguous blocks on storage. This can be used to wipe - areas on disk or to initialize drives in a RAID - configuration. write_same_max_bytes indicates how many - bytes can be written in a single write same command. If - write_same_max_bytes is 0, write same is not supported - by the device. - -What: /sys/block/<disk>/queue/write_zeroes_max_bytes -Date: November 2016 -Contact: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> -Description: - Devices that support write zeroes operation in which a - single request can be issued to zero out the range of - contiguous blocks on storage without having any payload - in the request. This can be used to optimize writing zeroes - to the devices. write_zeroes_max_bytes indicates how many - bytes can be written in a single write zeroes command. If - write_zeroes_max_bytes is 0, write zeroes is not supported - by the device. - -What: /sys/block/<disk>/queue/zoned -Date: September 2016 -Contact: Damien Le Moal <damien.lemoal@wdc.com> -Description: - zoned indicates if the device is a zoned block device - and the zone model of the device if it is indeed zoned. - The possible values indicated by zoned are "none" for - regular block devices and "host-aware" or "host-managed" - for zoned block devices. The characteristics of - host-aware and host-managed zoned block devices are - described in the ZBC (Zoned Block Commands) and ZAC - (Zoned Device ATA Command Set) standards. These standards - also define the "drive-managed" zone model. However, - since drive-managed zoned block devices do not support - zone commands, they will be treated as regular block - devices and zoned will report "none". - -What: /sys/block/<disk>/queue/nr_zones -Date: November 2018 -Contact: Damien Le Moal <damien.lemoal@wdc.com> -Description: - nr_zones indicates the total number of zones of a zoned block - device ("host-aware" or "host-managed" zone model). For regular - block devices, the value is always 0. - -What: /sys/block/<disk>/queue/max_active_zones -Date: July 2020 -Contact: Niklas Cassel <niklas.cassel@wdc.com> -Description: - For zoned block devices (zoned attribute indicating - "host-managed" or "host-aware"), the sum of zones belonging to - any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, - is limited by this value. If this value is 0, there is no limit. - -What: /sys/block/<disk>/queue/max_open_zones -Date: July 2020 -Contact: Niklas Cassel <niklas.cassel@wdc.com> -Description: - For zoned block devices (zoned attribute indicating - "host-managed" or "host-aware"), the sum of zones belonging to - any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, - is limited by this value. If this value is 0, there is no limit. - -What: /sys/block/<disk>/queue/chunk_sectors -Date: September 2016 -Contact: Hannes Reinecke <hare@suse.com> -Description: - chunk_sectors has different meaning depending on the type - of the disk. For a RAID device (dm-raid), chunk_sectors - indicates the size in 512B sectors of the RAID volume - stripe segment. 
For a zoned block device, either - host-aware or host-managed, chunk_sectors indicates the - size in 512B sectors of the zones of the device, with - the eventual exception of the last zone of the device - which may be smaller. - -What: /sys/block/<disk>/queue/io_timeout -Date: November 2018 -Contact: Weiping Zhang <zhangweiping@didiglobal.com> -Description: - io_timeout is the request timeout in milliseconds. If a request - does not complete in this time then the block driver timeout - handler is invoked. That timeout handler can decide to retry - the request, to fail it or to start a device recovery strategy. From 8b0551a74b4a9396a7f6ddb0c5f6f3c8465e9d45 Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Wed, 8 Dec 2021 16:38:28 -0800 Subject: [PATCH 127/132] docs: sysfs-block: add contact for nomerges The nomerges file was missing a "Contact" entry. Use linux-block. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Reviewed-by: Bart Van Assche <bvanassche@acm.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20211209003833.6396-4-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Documentation/ABI/stable/sysfs-block | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 9febd53a5ebe..c70fce6b76c1 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -242,7 +242,7 @@ Description: What: /sys/block/<disk>/queue/nomerges Date: January 2010 -Contact: +Contact: linux-block@vger.kernel.org Description: Standard I/O elevator operations include attempts to merge contiguous I/Os. For known random I/O loads these From 849ab826e10531f106846e8e9eeae8d00a198f6e Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Wed, 8 Dec 2021 16:38:29 -0800 Subject: [PATCH 128/132] docs: sysfs-block: fill in missing documentation from queue-sysfs.rst sysfs documentation is supposed to go in Documentation/ABI/. However, /sys/block/<disk>/queue/* are documented in Documentation/block/queue-sysfs.rst, and sometimes redundantly in Documentation/ABI/stable/sysfs-block too. Let's consolidate this documentation into Documentation/ABI/. Therefore, copy the relevant docs from queue-sysfs.rst into sysfs-block. This primarily means adding the 25 missing files that were documented in queue-sysfs.rst only, as well as mentioning the RO/RW status of files. Documentation/ABI/ requires "Date" and "Contact" fields. For the Date fields, I used the date of the commit which added support for each file. For the "Contact" fields, I used linux-block. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Martin K. 
Petersen <martin.petersen@oracle.com> Reviewed-by: Bart Van Assche <bvanassche@acm.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20211209003833.6396-5-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Documentation/ABI/stable/sysfs-block | 482 +++++++++++++++++++++------ 1 file changed, 381 insertions(+), 101 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index c70fce6b76c1..de3b86a3dfa5 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -46,7 +46,7 @@ Description: The value type is unsigned int. Cf. Documentation/block/stat.rst which contains a single value for requests in flight. - This is related to nr_requests in Documentation/block/queue-sysfs.rst + This is related to /sys/block/<disk>/queue/nr_requests and for SCSI device also its queue_depth. @@ -134,207 +134,487 @@ Description: same as the format of /sys/block/<disk>/stat. +What: /sys/block/<disk>/queue/add_random +Date: June 2010 +Contact: linux-block@vger.kernel.org +Description: + [RW] This file allows to turn off the disk entropy contribution. + Default value of this file is '1'(on). + + What: /sys/block/<disk>/queue/chunk_sectors Date: September 2016 Contact: Hannes Reinecke <hare@suse.com> Description: - chunk_sectors has different meaning depending on the type + [RO] chunk_sectors has different meaning depending on the type of the disk. For a RAID device (dm-raid), chunk_sectors - indicates the size in 512B sectors of the RAID volume - stripe segment. For a zoned block device, either - host-aware or host-managed, chunk_sectors indicates the - size in 512B sectors of the zones of the device, with - the eventual exception of the last zone of the device - which may be smaller. + indicates the size in 512B sectors of the RAID volume stripe + segment. For a zoned block device, either host-aware or + host-managed, chunk_sectors indicates the size in 512B sectors + of the zones of the device, with the eventual exception of the + last zone of the device which may be smaller. + + +What: /sys/block/<disk>/queue/dax +Date: June 2016 +Contact: linux-block@vger.kernel.org +Description: + [RO] This file indicates whether the device supports Direct + Access (DAX), used by CPU-addressable storage to bypass the + pagecache. It shows '1' if true, '0' if not. What: /sys/block/<disk>/queue/discard_granularity Date: May 2011 Contact: Martin K. Petersen <martin.petersen@oracle.com> Description: - Devices that support discard functionality may - internally allocate space using units that are bigger - than the logical block size. The discard_granularity - parameter indicates the size of the internal allocation - unit in bytes if reported by the device. Otherwise the - discard_granularity will be set to match the device's - physical block size. A discard_granularity of 0 means - that the device does not support discard functionality. + [RO] Devices that support discard functionality may internally + allocate space using units that are bigger than the logical + block size. The discard_granularity parameter indicates the size + of the internal allocation unit in bytes if reported by the + device. Otherwise the discard_granularity will be set to match + the device's physical block size. A discard_granularity of 0 + means that the device does not support discard functionality. What: /sys/block/<disk>/queue/discard_max_bytes Date: May 2011 Contact: Martin K. 
Petersen <martin.petersen@oracle.com> Description: - Devices that support discard functionality may have - internal limits on the number of bytes that can be - trimmed or unmapped in a single operation. Some storage - protocols also have inherent limits on the number of - blocks that can be described in a single command. The - discard_max_bytes parameter is set by the device driver - to the maximum number of bytes that can be discarded in - a single operation. Discard requests issued to the - device must not exceed this limit. A discard_max_bytes - value of 0 means that the device does not support - discard functionality. + [RW] While discard_max_hw_bytes is the hardware limit for the + device, this setting is the software limit. Some devices exhibit + large latencies when large discards are issued, setting this + value lower will make Linux issue smaller discards and + potentially help reduce latencies induced by large discard + operations. + + +What: /sys/block/<disk>/queue/discard_max_hw_bytes +Date: July 2015 +Contact: linux-block@vger.kernel.org +Description: + [RO] Devices that support discard functionality may have + internal limits on the number of bytes that can be trimmed or + unmapped in a single operation. The `discard_max_hw_bytes` + parameter is set by the device driver to the maximum number of + bytes that can be discarded in a single operation. Discard + requests issued to the device must not exceed this limit. A + `discard_max_hw_bytes` value of 0 means that the device does not + support discard functionality. What: /sys/block/<disk>/queue/discard_zeroes_data Date: May 2011 Contact: Martin K. Petersen <martin.petersen@oracle.com> Description: - Will always return 0. Don't rely on any specific behavior + [RO] Will always return 0. Don't rely on any specific behavior for discards, and don't read this file. +What: /sys/block/<disk>/queue/fua +Date: May 2018 +Contact: linux-block@vger.kernel.org +Description: + [RO] Whether or not the block driver supports the FUA flag for + write requests. FUA stands for Force Unit Access. If the FUA + flag is set that means that write requests must bypass the + volatile cache of the storage device. + + +What: /sys/block/<disk>/queue/hw_sector_size +Date: January 2008 +Contact: linux-block@vger.kernel.org +Description: + [RO] This is the hardware sector size of the device, in bytes. + + +What: /sys/block/<disk>/queue/independent_access_ranges/ +Date: October 2021 +Contact: linux-block@vger.kernel.org +Description: + [RO] The presence of this sub-directory of the + /sys/block/xxx/queue/ directory indicates that the device is + capable of executing requests targeting different sector ranges + in parallel. For instance, single LUN multi-actuator hard-disks + will have an independent_access_ranges directory if the device + correctly advertizes the sector ranges of its actuators. + + The independent_access_ranges directory contains one directory + per access range, with each range described using the sector + (RO) attribute file to indicate the first sector of the range + and the nr_sectors (RO) attribute file to indicate the total + number of sectors in the range starting from the first sector of + the range. 
For example, a dual-actuator hard-disk will have the + following independent_access_ranges entries.:: + + $ tree /sys/block/<disk>/queue/independent_access_ranges/ + /sys/block/<disk>/queue/independent_access_ranges/ + |-- 0 + | |-- nr_sectors + | `-- sector + `-- 1 + |-- nr_sectors + `-- sector + + The sector and nr_sectors attributes use 512B sector unit, + regardless of the actual block size of the device. Independent + access ranges do not overlap and include all sectors within the + device capacity. The access ranges are numbered in increasing + order of the range start sector, that is, the sector attribute + of range 0 always has the value 0. + + +What: /sys/block/<disk>/queue/io_poll +Date: November 2015 +Contact: linux-block@vger.kernel.org +Description: + [RW] When read, this file shows whether polling is enabled (1) + or disabled (0). Writing '0' to this file will disable polling + for this device. Writing any non-zero value will enable this + feature. + + +What: /sys/block/<disk>/queue/io_poll_delay +Date: November 2016 +Contact: linux-block@vger.kernel.org +Description: + [RW] If polling is enabled, this controls what kind of polling + will be performed. It defaults to -1, which is classic polling. + In this mode, the CPU will repeatedly ask for completions + without giving up any time. If set to 0, a hybrid polling mode + is used, where the kernel will attempt to make an educated guess + at when the IO will complete. Based on this guess, the kernel + will put the process issuing IO to sleep for an amount of time, + before entering a classic poll loop. This mode might be a little + slower than pure classic polling, but it will be more efficient. + If set to a value larger than 0, the kernel will put the process + issuing IO to sleep for this amount of microseconds before + entering classic polling. + + What: /sys/block/<disk>/queue/io_timeout Date: November 2018 Contact: Weiping Zhang <zhangweiping@didiglobal.com> Description: - io_timeout is the request timeout in milliseconds. If a request - does not complete in this time then the block driver timeout - handler is invoked. That timeout handler can decide to retry - the request, to fail it or to start a device recovery strategy. + [RW] io_timeout is the request timeout in milliseconds. If a + request does not complete in this time then the block driver + timeout handler is invoked. That timeout handler can decide to + retry the request, to fail it or to start a device recovery + strategy. + + +What: /sys/block/<disk>/queue/iostats +Date: January 2009 +Contact: linux-block@vger.kernel.org +Description: + [RW] This file is used to control (on/off) the iostats + accounting of the disk. What: /sys/block/<disk>/queue/logical_block_size Date: May 2009 Contact: Martin K. Petersen <martin.petersen@oracle.com> Description: - This is the smallest unit the storage device can - address. It is typically 512 bytes. + [RO] This is the smallest unit the storage device can address. + It is typically 512 bytes. What: /sys/block/<disk>/queue/max_active_zones Date: July 2020 Contact: Niklas Cassel <niklas.cassel@wdc.com> Description: - For zoned block devices (zoned attribute indicating + [RO] For zoned block devices (zoned attribute indicating "host-managed" or "host-aware"), the sum of zones belonging to any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value. If this value is 0, there is no limit. 
+ If the host attempts to exceed this limit, the driver should + report this error with BLK_STS_ZONE_ACTIVE_RESOURCE, which user + space may see as the EOVERFLOW errno. + + +What: /sys/block/<disk>/queue/max_discard_segments +Date: February 2017 +Contact: linux-block@vger.kernel.org +Description: + [RO] The maximum number of DMA scatter/gather entries in a + discard request. + + +What: /sys/block/<disk>/queue/max_hw_sectors_kb +Date: September 2004 +Contact: linux-block@vger.kernel.org +Description: + [RO] This is the maximum number of kilobytes supported in a + single data transfer. + + +What: /sys/block/<disk>/queue/max_integrity_segments +Date: September 2010 +Contact: linux-block@vger.kernel.org +Description: + [RO] Maximum number of elements in a DMA scatter/gather list + with integrity data that will be submitted by the block layer + core to the associated block driver. + What: /sys/block/<disk>/queue/max_open_zones Date: July 2020 Contact: Niklas Cassel <niklas.cassel@wdc.com> Description: - For zoned block devices (zoned attribute indicating + [RO] For zoned block devices (zoned attribute indicating "host-managed" or "host-aware"), the sum of zones belonging to - any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, - is limited by this value. If this value is 0, there is no limit. + any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, is + limited by this value. If this value is 0, there is no limit. + + +What: /sys/block/<disk>/queue/max_sectors_kb +Date: September 2004 +Contact: linux-block@vger.kernel.org +Description: + [RW] This is the maximum number of kilobytes that the block + layer will allow for a filesystem request. Must be smaller than + or equal to the maximum size allowed by the hardware. + + +What: /sys/block/<disk>/queue/max_segment_size +Date: March 2010 +Contact: linux-block@vger.kernel.org +Description: + [RO] Maximum size in bytes of a single element in a DMA + scatter/gather list. + + +What: /sys/block/<disk>/queue/max_segments +Date: March 2010 +Contact: linux-block@vger.kernel.org +Description: + [RO] Maximum number of elements in a DMA scatter/gather list + that is submitted to the associated block driver. What: /sys/block/<disk>/queue/minimum_io_size Date: April 2009 Contact: Martin K. Petersen <martin.petersen@oracle.com> Description: - Storage devices may report a granularity or preferred - minimum I/O size which is the smallest request the - device can perform without incurring a performance - penalty. For disk drives this is often the physical - block size. For RAID arrays it is often the stripe - chunk size. A properly aligned multiple of - minimum_io_size is the preferred request size for - workloads where a high number of I/O operations is - desired. + [RO] Storage devices may report a granularity or preferred + minimum I/O size which is the smallest request the device can + perform without incurring a performance penalty. For disk + drives this is often the physical block size. For RAID arrays + it is often the stripe chunk size. A properly aligned multiple + of minimum_io_size is the preferred request size for workloads + where a high number of I/O operations is desired. What: /sys/block/<disk>/queue/nomerges Date: January 2010 Contact: linux-block@vger.kernel.org Description: - Standard I/O elevator operations include attempts to - merge contiguous I/Os. For known random I/O loads these - attempts will always fail and result in extra cycles - being spent in the kernel. 
This allows one to turn off - this behavior on one of two ways: When set to 1, complex - merge checks are disabled, but the simple one-shot merges - with the previous I/O request are enabled. When set to 2, - all merge tries are disabled. The default value is 0 - - which enables all types of merge tries. + [RW] Standard I/O elevator operations include attempts to merge + contiguous I/Os. For known random I/O loads these attempts will + always fail and result in extra cycles being spent in the + kernel. This allows one to turn off this behavior on one of two + ways: When set to 1, complex merge checks are disabled, but the + simple one-shot merges with the previous I/O request are + enabled. When set to 2, all merge tries are disabled. The + default value is 0 - which enables all types of merge tries. + + +What: /sys/block/<disk>/queue/nr_requests +Date: July 2003 +Contact: linux-block@vger.kernel.org +Description: + [RW] This controls how many requests may be allocated in the + block layer for read or write requests. Note that the total + allocated number may be twice this amount, since it applies only + to reads or writes (not the accumulated sum). + + To avoid priority inversion through request starvation, a + request queue maintains a separate request pool per each cgroup + when CONFIG_BLK_CGROUP is enabled, and this parameter applies to + each such per-block-cgroup request pool. IOW, if there are N + block cgroups, each request queue may have up to N request + pools, each independently regulated by nr_requests. What: /sys/block/<disk>/queue/nr_zones Date: November 2018 Contact: Damien Le Moal <damien.lemoal@wdc.com> Description: - nr_zones indicates the total number of zones of a zoned block - device ("host-aware" or "host-managed" zone model). For regular - block devices, the value is always 0. + [RO] nr_zones indicates the total number of zones of a zoned + block device ("host-aware" or "host-managed" zone model). For + regular block devices, the value is always 0. What: /sys/block/<disk>/queue/optimal_io_size Date: April 2009 Contact: Martin K. Petersen <martin.petersen@oracle.com> Description: - Storage devices may report an optimal I/O size, which is - the device's preferred unit for sustained I/O. This is - rarely reported for disk drives. For RAID arrays it is - usually the stripe width or the internal track size. A - properly aligned multiple of optimal_io_size is the - preferred request size for workloads where sustained - throughput is desired. If no optimal I/O size is - reported this file contains 0. + [RO] Storage devices may report an optimal I/O size, which is + the device's preferred unit for sustained I/O. This is rarely + reported for disk drives. For RAID arrays it is usually the + stripe width or the internal track size. A properly aligned + multiple of optimal_io_size is the preferred request size for + workloads where sustained throughput is desired. If no optimal + I/O size is reported this file contains 0. What: /sys/block/<disk>/queue/physical_block_size Date: May 2009 Contact: Martin K. Petersen <martin.petersen@oracle.com> Description: - This is the smallest unit a physical storage device can - write atomically. It is usually the same as the logical - block size but may be bigger. One example is SATA - drives with 4KB sectors that expose a 512-byte logical - block size to the operating system. For stacked block - devices the physical_block_size variable contains the - maximum physical_block_size of the component devices. 
+ [RO] This is the smallest unit a physical storage device can + write atomically. It is usually the same as the logical block + size but may be bigger. One example is SATA drives with 4KB + sectors that expose a 512-byte logical block size to the + operating system. For stacked block devices the + physical_block_size variable contains the maximum + physical_block_size of the component devices. + + +What: /sys/block/<disk>/queue/read_ahead_kb +Date: May 2004 +Contact: linux-block@vger.kernel.org +Description: + [RW] Maximum number of kilobytes to read-ahead for filesystems + on this block device. + + +What: /sys/block/<disk>/queue/rotational +Date: January 2009 +Contact: linux-block@vger.kernel.org +Description: + [RW] This file is used to stat if the device is of rotational + type or non-rotational type. + + +What: /sys/block/<disk>/queue/rq_affinity +Date: September 2008 +Contact: linux-block@vger.kernel.org +Description: + [RW] If this option is '1', the block layer will migrate request + completions to the cpu "group" that originally submitted the + request. For some workloads this provides a significant + reduction in CPU cycles due to caching effects. + + For storage configurations that need to maximize distribution of + completion processing setting this option to '2' forces the + completion to run on the requesting cpu (bypassing the "group" + aggregation logic). + + +What: /sys/block/<disk>/queue/scheduler +Date: October 2004 +Contact: linux-block@vger.kernel.org +Description: + [RW] When read, this file will display the current and available + IO schedulers for this block device. The currently active IO + scheduler will be enclosed in [] brackets. Writing an IO + scheduler name to this file will switch control of this block + device to that new IO scheduler. Note that writing an IO + scheduler name to this file will attempt to load that IO + scheduler module, if it isn't already present in the system. + + +What: /sys/block/<disk>/queue/throttle_sample_time +Date: March 2017 +Contact: linux-block@vger.kernel.org +Description: + [RW] This is the time window that blk-throttle samples data, in + millisecond. blk-throttle makes decision based on the + samplings. Lower time means cgroups have more smooth throughput, + but higher CPU overhead. This exists only when + CONFIG_BLK_DEV_THROTTLING_LOW is enabled. + + +What: /sys/block/<disk>/queue/wbt_lat_usec +Date: November 2016 +Contact: linux-block@vger.kernel.org +Description: + [RW] If the device is registered for writeback throttling, then + this file shows the target minimum read latency. If this latency + is exceeded in a given window of time (see wb_window_usec), then + the writeback throttling will start scaling back writes. Writing + a value of '0' to this file disables the feature. Writing a + value of '-1' to this file resets the value to the default + setting. + + +What: /sys/block/<disk>/queue/write_cache +Date: April 2016 +Contact: linux-block@vger.kernel.org +Description: + [RW] When read, this file will display whether the device has + write back caching enabled or not. It will return "write back" + for the former case, and "write through" for the latter. Writing + to this file can change the kernels view of the device, but it + doesn't alter the device state. This means that it might not be + safe to toggle the setting from "write back" to "write through", + since that will also eliminate cache flushes issued by the + kernel. What: /sys/block/<disk>/queue/write_same_max_bytes Date: January 2012 Contact: Martin K. 
Petersen <martin.petersen@oracle.com> Description: - Some devices support a write same operation in which a + [RO] Some devices support a write same operation in which a single data block can be written to a range of several - contiguous blocks on storage. This can be used to wipe - areas on disk or to initialize drives in a RAID - configuration. write_same_max_bytes indicates how many - bytes can be written in a single write same command. If - write_same_max_bytes is 0, write same is not supported - by the device. + contiguous blocks on storage. This can be used to wipe areas on + disk or to initialize drives in a RAID configuration. + write_same_max_bytes indicates how many bytes can be written in + a single write same command. If write_same_max_bytes is 0, write + same is not supported by the device. What: /sys/block/<disk>/queue/write_zeroes_max_bytes Date: November 2016 Contact: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> Description: - Devices that support write zeroes operation in which a - single request can be issued to zero out the range of - contiguous blocks on storage without having any payload - in the request. This can be used to optimize writing zeroes - to the devices. write_zeroes_max_bytes indicates how many - bytes can be written in a single write zeroes command. If - write_zeroes_max_bytes is 0, write zeroes is not supported - by the device. + [RO] Devices that support write zeroes operation in which a + single request can be issued to zero out the range of contiguous + blocks on storage without having any payload in the request. + This can be used to optimize writing zeroes to the devices. + write_zeroes_max_bytes indicates how many bytes can be written + in a single write zeroes command. If write_zeroes_max_bytes is + 0, write zeroes is not supported by the device. + + +What: /sys/block/<disk>/queue/zone_append_max_bytes +Date: May 2020 +Contact: linux-block@vger.kernel.org +Description: + [RO] This is the maximum number of bytes that can be written to + a sequential zone of a zoned block device using a zone append + write operation (REQ_OP_ZONE_APPEND). This value is always 0 for + regular block devices. + + +What: /sys/block/<disk>/queue/zone_write_granularity +Date: January 2021 +Contact: linux-block@vger.kernel.org +Description: + [RO] This indicates the alignment constraint, in bytes, for + write operations in sequential zones of zoned block devices + (devices with a zoned attributed that reports "host-managed" or + "host-aware"). This value is always 0 for regular block devices. What: /sys/block/<disk>/queue/zoned Date: September 2016 Contact: Damien Le Moal <damien.lemoal@wdc.com> Description: - zoned indicates if the device is a zoned block device - and the zone model of the device if it is indeed zoned. - The possible values indicated by zoned are "none" for - regular block devices and "host-aware" or "host-managed" - for zoned block devices. The characteristics of - host-aware and host-managed zoned block devices are - described in the ZBC (Zoned Block Commands) and ZAC - (Zoned Device ATA Command Set) standards. These standards - also define the "drive-managed" zone model. However, - since drive-managed zoned block devices do not support - zone commands, they will be treated as regular block - devices and zoned will report "none". + [RO] zoned indicates if the device is a zoned block device and + the zone model of the device if it is indeed zoned. 
The + possible values indicated by zoned are "none" for regular block + devices and "host-aware" or "host-managed" for zoned block + devices. The characteristics of host-aware and host-managed + zoned block devices are described in the ZBC (Zoned Block + Commands) and ZAC (Zoned Device ATA Command Set) standards. + These standards also define the "drive-managed" zone model. + However, since drive-managed zoned block devices do not support + zone commands, they will be treated as regular block devices and + zoned will report "none". What: /sys/block/<disk>/stat From 1163010418a7f0c60c309743498cb6c5cd828ecc Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Wed, 8 Dec 2021 16:38:30 -0800 Subject: [PATCH 129/132] docs: sysfs-block: document stable_writes /sys/block/<disk>/queue/stable_writes is completely undocumented. Document it. Reviewed-by: Bart Van Assche <bvanassche@acm.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20211209003833.6396-6-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Documentation/ABI/stable/sysfs-block | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index de3b86a3dfa5..288626e8cb53 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -516,6 +516,23 @@ Description: scheduler module, if it isn't already present in the system. +What: /sys/block/<disk>/queue/stable_writes +Date: September 2020 +Contact: linux-block@vger.kernel.org +Description: + [RW] This file will contain '1' if memory must not be modified + while it is being used in a write request to this device. When + this is the case and the kernel is performing writeback of a + page, the kernel will wait for writeback to complete before + allowing the page to be modified again, rather than allowing + immediate modification as is normally the case. This + restriction arises when the device accesses the memory multiple + times where the same data must be seen every time -- for + example, once to calculate a checksum and once to actually write + the data. If no such restriction exists, this file will contain + '0'. This file is writable for testing purposes. + + What: /sys/block/<disk>/queue/throttle_sample_time Date: March 2017 Contact: linux-block@vger.kernel.org From 8bc2f7c67061cb39e317a45ad9870f529b1fb190 Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Wed, 8 Dec 2021 16:38:31 -0800 Subject: [PATCH 130/132] docs: sysfs-block: document virt_boundary_mask /sys/block/<disk>/queue/virt_boundary_mask is completely undocumented. Document it. Reviewed-by: Bart Van Assche <bvanassche@acm.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20211209003833.6396-7-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Documentation/ABI/stable/sysfs-block | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 288626e8cb53..8dd3e84a8aad 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -544,6 +544,18 @@ Description: CONFIG_BLK_DEV_THROTTLING_LOW is enabled. +What: /sys/block/<disk>/queue/virt_boundary_mask +Date: April 2021 +Contact: linux-block@vger.kernel.org +Description: + [RO] This file shows the I/O segment memory alignment mask for + the block device. 
I/O requests to this device will be split + between segments wherever either the memory address of the end + of the previous segment or the memory address of the beginning + of the current segment is not aligned to virt_boundary_mask + 1 + bytes. + + What: /sys/block/<disk>/queue/wbt_lat_usec Date: November 2016 Contact: linux-block@vger.kernel.org From 208e4f9c0028e9181220460600b1df0bc677e796 Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Wed, 8 Dec 2021 16:38:32 -0800 Subject: [PATCH 131/132] docs: block: remove queue-sysfs.rst This has been replaced by Documentation/ABI/stable/sysfs-block, which is the correct place for sysfs documentation. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Reviewed-by: Bart Van Assche <bvanassche@acm.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20211209003833.6396-8-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Documentation/block/index.rst | 1 - Documentation/block/queue-sysfs.rst | 321 ---------------------------- 2 files changed, 322 deletions(-) delete mode 100644 Documentation/block/queue-sysfs.rst diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst index 86dcf7159f99..3a41495dd77b 100644 --- a/Documentation/block/index.rst +++ b/Documentation/block/index.rst @@ -20,7 +20,6 @@ Block kyber-iosched null_blk pr - queue-sysfs request stat switching-sched diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst deleted file mode 100644 index 3f569d532485..000000000000 --- a/Documentation/block/queue-sysfs.rst +++ /dev/null @@ -1,321 +0,0 @@ -================= -Queue sysfs files -================= - -This text file will detail the queue files that are located in the sysfs tree -for each block device. Note that stacked devices typically do not export -any settings, since their queue merely functions as a remapping target. -These files are the ones found in the /sys/block/xxx/queue/ directory. - -Files denoted with a RO postfix are readonly and the RW postfix means -read-write. - -add_random (RW) ---------------- -This file allows to turn off the disk entropy contribution. Default -value of this file is '1'(on). - -chunk_sectors (RO) ------------------- -This has different meaning depending on the type of the block device. -For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors -of the RAID volume stripe segment. For a zoned block device, either host-aware -or host-managed, chunk_sectors indicates the size in 512B sectors of the zones -of the device, with the eventual exception of the last zone of the device which -may be smaller. - -dax (RO) --------- -This file indicates whether the device supports Direct Access (DAX), -used by CPU-addressable storage to bypass the pagecache. It shows '1' -if true, '0' if not. - -discard_granularity (RO) ------------------------- -This shows the size of internal allocation of the device in bytes, if -reported by the device. A value of '0' means device does not support -the discard functionality. - -discard_max_hw_bytes (RO) -------------------------- -Devices that support discard functionality may have internal limits on -the number of bytes that can be trimmed or unmapped in a single operation. -The `discard_max_hw_bytes` parameter is set by the device driver to the -maximum number of bytes that can be discarded in a single operation. 
-Discard requests issued to the device must not exceed this limit. -A `discard_max_hw_bytes` value of 0 means that the device does not support -discard functionality. - -discard_max_bytes (RW) ----------------------- -While discard_max_hw_bytes is the hardware limit for the device, this -setting is the software limit. Some devices exhibit large latencies when -large discards are issued, setting this value lower will make Linux issue -smaller discards and potentially help reduce latencies induced by large -discard operations. - -discard_zeroes_data (RO) ------------------------- -Obsolete. Always zero. - -fua (RO) --------- -Whether or not the block driver supports the FUA flag for write requests. -FUA stands for Force Unit Access. If the FUA flag is set that means that -write requests must bypass the volatile cache of the storage device. - -hw_sector_size (RO) -------------------- -This is the hardware sector size of the device, in bytes. - -io_poll (RW) ------------- -When read, this file shows whether polling is enabled (1) or disabled -(0). Writing '0' to this file will disable polling for this device. -Writing any non-zero value will enable this feature. - -io_poll_delay (RW) ------------------- -If polling is enabled, this controls what kind of polling will be -performed. It defaults to -1, which is classic polling. In this mode, -the CPU will repeatedly ask for completions without giving up any time. -If set to 0, a hybrid polling mode is used, where the kernel will attempt -to make an educated guess at when the IO will complete. Based on this -guess, the kernel will put the process issuing IO to sleep for an amount -of time, before entering a classic poll loop. This mode might be a -little slower than pure classic polling, but it will be more efficient. -If set to a value larger than 0, the kernel will put the process issuing -IO to sleep for this amount of microseconds before entering classic -polling. - -io_timeout (RW) ---------------- -io_timeout is the request timeout in milliseconds. If a request does not -complete in this time then the block driver timeout handler is invoked. -That timeout handler can decide to retry the request, to fail it or to start -a device recovery strategy. - -iostats (RW) -------------- -This file is used to control (on/off) the iostats accounting of the -disk. - -logical_block_size (RO) ------------------------ -This is the logical block size of the device, in bytes. - -max_discard_segments (RO) -------------------------- -The maximum number of DMA scatter/gather entries in a discard request. - -max_hw_sectors_kb (RO) ----------------------- -This is the maximum number of kilobytes supported in a single data transfer. - -max_integrity_segments (RO) ---------------------------- -Maximum number of elements in a DMA scatter/gather list with integrity -data that will be submitted by the block layer core to the associated -block driver. - -max_active_zones (RO) ---------------------- -For zoned block devices (zoned attribute indicating "host-managed" or -"host-aware"), the sum of zones belonging to any of the zone states: -EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value. -If this value is 0, there is no limit. - -If the host attempts to exceed this limit, the driver should report this error -with BLK_STS_ZONE_ACTIVE_RESOURCE, which user space may see as the EOVERFLOW -errno. 
- -max_open_zones (RO) -------------------- -For zoned block devices (zoned attribute indicating "host-managed" or -"host-aware"), the sum of zones belonging to any of the zone states: -EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value. -If this value is 0, there is no limit. - -If the host attempts to exceed this limit, the driver should report this error -with BLK_STS_ZONE_OPEN_RESOURCE, which user space may see as the ETOOMANYREFS -errno. - -max_sectors_kb (RW) -------------------- -This is the maximum number of kilobytes that the block layer will allow -for a filesystem request. Must be smaller than or equal to the maximum -size allowed by the hardware. - -max_segments (RO) ------------------ -Maximum number of elements in a DMA scatter/gather list that is submitted -to the associated block driver. - -max_segment_size (RO) ---------------------- -Maximum size in bytes of a single element in a DMA scatter/gather list. - -minimum_io_size (RO) --------------------- -This is the smallest preferred IO size reported by the device. - -nomerges (RW) -------------- -This enables the user to disable the lookup logic involved with IO -merging requests in the block layer. By default (0) all merges are -enabled. When set to 1 only simple one-hit merges will be tried. When -set to 2 no merge algorithms will be tried (including one-hit or more -complex tree/hash lookups). - -nr_requests (RW) ----------------- -This controls how many requests may be allocated in the block layer for -read or write requests. Note that the total allocated number may be twice -this amount, since it applies only to reads or writes (not the accumulated -sum). - -To avoid priority inversion through request starvation, a request -queue maintains a separate request pool per each cgroup when -CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such -per-block-cgroup request pool. IOW, if there are N block cgroups, -each request queue may have up to N request pools, each independently -regulated by nr_requests. - -nr_zones (RO) -------------- -For zoned block devices (zoned attribute indicating "host-managed" or -"host-aware"), this indicates the total number of zones of the device. -This is always 0 for regular block devices. - -optimal_io_size (RO) --------------------- -This is the optimal IO size reported by the device. - -physical_block_size (RO) ------------------------- -This is the physical block size of device, in bytes. - -read_ahead_kb (RW) ------------------- -Maximum number of kilobytes to read-ahead for filesystems on this block -device. - -rotational (RW) ---------------- -This file is used to stat if the device is of rotational type or -non-rotational type. - -rq_affinity (RW) ----------------- -If this option is '1', the block layer will migrate request completions to the -cpu "group" that originally submitted the request. For some workloads this -provides a significant reduction in CPU cycles due to caching effects. - -For storage configurations that need to maximize distribution of completion -processing setting this option to '2' forces the completion to run on the -requesting cpu (bypassing the "group" aggregation logic). - -scheduler (RW) --------------- -When read, this file will display the current and available IO schedulers -for this block device. The currently active IO scheduler will be enclosed -in [] brackets. Writing an IO scheduler name to this file will switch -control of this block device to that new IO scheduler. 
Note that writing -an IO scheduler name to this file will attempt to load that IO scheduler -module, if it isn't already present in the system. - -write_cache (RW) ----------------- -When read, this file will display whether the device has write back -caching enabled or not. It will return "write back" for the former -case, and "write through" for the latter. Writing to this file can -change the kernels view of the device, but it doesn't alter the -device state. This means that it might not be safe to toggle the -setting from "write back" to "write through", since that will also -eliminate cache flushes issued by the kernel. - -write_same_max_bytes (RO) -------------------------- -This is the number of bytes the device can write in a single write-same -command. A value of '0' means write-same is not supported by this -device. - -wbt_lat_usec (RW) ------------------ -If the device is registered for writeback throttling, then this file shows -the target minimum read latency. If this latency is exceeded in a given -window of time (see wb_window_usec), then the writeback throttling will start -scaling back writes. Writing a value of '0' to this file disables the -feature. Writing a value of '-1' to this file resets the value to the -default setting. - -throttle_sample_time (RW) -------------------------- -This is the time window that blk-throttle samples data, in millisecond. -blk-throttle makes decision based on the samplings. Lower time means cgroups -have more smooth throughput, but higher CPU overhead. This exists only when -CONFIG_BLK_DEV_THROTTLING_LOW is enabled. - -write_zeroes_max_bytes (RO) ---------------------------- -For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of -bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES -is not supported. - -zone_append_max_bytes (RO) --------------------------- -This is the maximum number of bytes that can be written to a sequential -zone of a zoned block device using a zone append write operation -(REQ_OP_ZONE_APPEND). This value is always 0 for regular block devices. - -zoned (RO) ----------- -This indicates if the device is a zoned block device and the zone model of the -device if it is indeed zoned. The possible values indicated by zoned are -"none" for regular block devices and "host-aware" or "host-managed" for zoned -block devices. The characteristics of host-aware and host-managed zoned block -devices are described in the ZBC (Zoned Block Commands) and ZAC -(Zoned Device ATA Command Set) standards. These standards also define the -"drive-managed" zone model. However, since drive-managed zoned block devices -do not support zone commands, they will be treated as regular block devices -and zoned will report "none". - -zone_write_granularity (RO) ---------------------------- -This indicates the alignment constraint, in bytes, for write operations in -sequential zones of zoned block devices (devices with a zoned attributed -that reports "host-managed" or "host-aware"). This value is always 0 for -regular block devices. - -independent_access_ranges (RO) ------------------------------- - -The presence of this sub-directory of the /sys/block/xxx/queue/ directory -indicates that the device is capable of executing requests targeting -different sector ranges in parallel. For instance, single LUN multi-actuator -hard-disks will have an independent_access_ranges directory if the device -correctly advertizes the sector ranges of its actuators. 
- -The independent_access_ranges directory contains one directory per access -range, with each range described using the sector (RO) attribute file to -indicate the first sector of the range and the nr_sectors (RO) attribute file -to indicate the total number of sectors in the range starting from the first -sector of the range. For example, a dual-actuator hard-disk will have the -following independent_access_ranges entries.:: - - $ tree /sys/block/<device>/queue/independent_access_ranges/ - /sys/block/<device>/queue/independent_access_ranges/ - |-- 0 - | |-- nr_sectors - | `-- sector - `-- 1 - |-- nr_sectors - `-- sector - -The sector and nr_sectors attributes use 512B sector unit, regardless of -the actual block size of the device. Independent access ranges do not -overlap and include all sectors within the device capacity. The access -ranges are numbered in increasing order of the range start sector, -that is, the sector attribute of range 0 always has the value 0. - -Jens Axboe <jens.axboe@oracle.com>, February 2009 From f029cedb9bb5bab7f1bb3042be348f2dac0ee66e Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Wed, 8 Dec 2021 16:38:33 -0800 Subject: [PATCH 132/132] MAINTAINERS: add entries for block layer documentation Include Documentation/block/ and Documentation/ABI/stable/sysfs-block in the "BLOCK LAYER" maintainers file entry. Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Reviewed-by: Bart Van Assche <bvanassche@acm.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20211209003833.6396-9-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk> --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 360e9aa0205d..19db69dda15a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3380,6 +3380,8 @@ M: Jens Axboe <axboe@kernel.dk> L: linux-block@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git +F: Documentation/ABI/stable/sysfs-block +F: Documentation/block/ F: block/ F: drivers/block/ F: include/linux/blk*
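
For readers who want to check the attributes documented in the patches above on a
running system, the following is a minimal shell sketch; it is not part of the
series. It assumes a kernel recent enough to expose these files, uses "sda" purely
as a placeholder device name, and reports attributes a given device does not
implement as "n/a"::

    #!/bin/sh
    # Sketch only: read back a few of the queue attributes documented above.
    # "sda" is a placeholder; adjust to an actual block device on the system.
    disk=sda

    for attr in stable_writes virt_boundary_mask zoned zone_write_granularity; do
            printf '%-24s ' "$attr"
            cat "/sys/block/$disk/queue/$attr" 2>/dev/null || echo "n/a"
    done

    # If the device advertises independent access ranges (for example a
    # multi-actuator disk), print each range's start sector and length.
    for r in "/sys/block/$disk/queue/independent_access_ranges"/*/; do
            [ -d "$r" ] || continue
            echo "range $r: sector=$(cat "${r}sector") nr_sectors=$(cat "${r}nr_sectors")"
    done

The loop over independent_access_ranges simply skips devices without that
sub-directory, which matches the documented behaviour that the directory is only
present when the device advertises such ranges.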