From 2dd209f00fc5a1caafa493066c7cd692fd2fd57c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:16 -0700 Subject: [PATCH 01/72] blk-mq: Fix a comment in include/linux/blk-mq.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'hctx_list' member of struct blk_mq_hw_ctx is not a list head but instead an entry in q->unused_hctx_list. Fix the comment above this struct member. Fixes: d386732bc142 ("blk-mq: fill header with kernel-doc") Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Cc: André Almeida Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 11cfd6470b1a..31344d5f83e2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -162,7 +162,10 @@ struct blk_mq_hw_ctx { struct dentry *sched_debugfs_dir; #endif - /** @hctx_list: List of all hardware queues. */ + /** + * @hctx_list: if this hctx is not in use, this is an entry in + * q->unused_hctx_list. + */ struct list_head hctx_list; /** From 6e66b49392419f3fe134e1be583323ef75da1e4b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:17 -0700 Subject: [PATCH 02/72] blk-mq: Keep set->nr_hw_queues and set->map[].nr_queues in sync blk_mq_map_queues() and multiple .map_queues() implementations expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the number of hardware queues. Hence set .nr_queues before calling these functions. This patch fixes the following kernel warning: WARNING: CPU: 0 PID: 2501 at include/linux/cpumask.h:137 Call Trace: blk_mq_run_hw_queue+0x19d/0x350 block/blk-mq.c:1508 blk_mq_run_hw_queues+0x112/0x1a0 block/blk-mq.c:1525 blk_mq_requeue_work+0x502/0x780 block/blk-mq.c:775 process_one_work+0x9af/0x1740 kernel/workqueue.c:2269 worker_thread+0x98/0xe40 kernel/workqueue.c:2415 kthread+0x361/0x430 kernel/kthread.c:255 Fixes: ed76e329d74a ("blk-mq: abstract out queue map") # v5.0 Reported-by: syzbot+d44e1b26ce5c3e77458d@syzkaller.appspotmail.com Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Cc: Johannes Thumshirn Cc: Hannes Reinecke Cc: Ming Lei Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index d92088dec6c3..d4bd9b961726 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3023,6 +3023,14 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { + /* + * blk_mq_map_queues() and multiple .map_queues() implementations + * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the + * number of hardware queues. + */ + if (set->nr_maps == 1) + set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; + if (set->ops->map_queues && !is_kdump_kernel()) { int i; From d0930bb8f46b8fb4a7d429c0bf1c91b3ed00a7cf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:18 -0700 Subject: [PATCH 03/72] blk-mq: Fix a recently introduced regression in blk_mq_realloc_hw_ctxs() q->nr_hw_queues must only be updated once it is known that blk_mq_realloc_hw_ctxs() has succeeded. Otherwise it can happen that reallocation fails and that q->nr_hw_queues is larger than the number of allocated hardware queues. This patch fixes the following crash if increasing the number of hardware queues fails: BUG: KASAN: null-ptr-deref in blk_mq_map_swqueue+0x775/0x810 Write of size 8 at addr 0000000000000118 by task check/977 CPU: 3 PID: 977 Comm: check Not tainted 5.6.0-rc1-dbg+ #8 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack+0xa5/0xe6 __kasan_report.cold+0x65/0x99 kasan_report+0x16/0x20 check_memory_region+0x140/0x1b0 memset+0x28/0x40 blk_mq_map_swqueue+0x775/0x810 blk_mq_update_nr_hw_queues+0x468/0x710 nullb_device_submit_queues_store+0xf7/0x1a0 [null_blk] configfs_write_file+0x1c4/0x250 [configfs] __vfs_write+0x4c/0x90 vfs_write+0x145/0x2c0 ksys_write+0xd7/0x180 __x64_sys_write+0x47/0x50 do_syscall_64+0x6f/0x2f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: ac0d6b926e74 ("block: Reduce the amount of memory required per request queue") Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Cc: Keith Busch Cc: Johannes Thumshirn Cc: Hannes Reinecke Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d4bd9b961726..37ff8dfb8ab9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2824,7 +2824,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, memcpy(new_hctxs, hctxs, q->nr_hw_queues * sizeof(*hctxs)); q->queue_hw_ctx = new_hctxs; - q->nr_hw_queues = set->nr_hw_queues; kfree(hctxs); hctxs = new_hctxs; } From b9853b4d6fb403ccb1d4d82e2d39fc17fc07519c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:19 -0700 Subject: [PATCH 04/72] null_blk: Suppress an UBSAN complaint triggered when setting 'memory_backed' Although it is not clear to me why UBSAN complains when 'memory_backed' is set, this patch suppresses the UBSAN complaint that is triggered when setting that configfs attribute. UBSAN: Undefined behaviour in drivers/block/null_blk_main.c:327:1 load of value 16 is not a valid value for type '_Bool' CPU: 2 PID: 8396 Comm: check Not tainted 5.6.0-rc1-dbg+ #14 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack+0xa5/0xe6 ubsan_epilogue+0x9/0x26 __ubsan_handle_load_invalid_value+0x6d/0x76 nullb_device_memory_backed_store.cold+0x2c/0x38 [null_blk] configfs_write_file+0x1c4/0x250 [configfs] __vfs_write+0x4c/0x90 vfs_write+0x145/0x2c0 ksys_write+0xd7/0x180 __x64_sys_write+0x47/0x50 do_syscall_64+0x6f/0x2f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Cc: Johannes Thumshirn Cc: Hannes Reinecke Cc: Ming Lei Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 133060431dbd..b8a08b82b882 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -276,7 +276,7 @@ nullb_device_##NAME##_store(struct config_item *item, const char *page, \ { \ int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ struct nullb_device *dev = to_nullb_device(item); \ - TYPE uninitialized_var(new_value); \ + TYPE new_value = 0; \ int ret; \ \ ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ From 78b10be23d42e66fc42e52b9ae620f0a6ef5b8c7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:20 -0700 Subject: [PATCH 05/72] null_blk: Fix changing the number of hardware queues Instead of initializing null_blk hardware queues explicitly after the request queue has been created, provide .init_hctx() and .exit_hctx() callback functions. The latter functions are not only called during request queue allocation but also when the number of hardware queues changes. Allocate nr_cpu_ids queues during initialization to support increasing the number of hardware queues above the initial hardware queue count. This change fixes increasing the number of hardware queues above the initial number of hardware queues and also keeps nullb->nr_queues in sync with the number of hardware queues. Fixes: 45919fbfe1c4 ("null_blk: Enable modifying 'submit_queues' after an instance has been configured") Signed-off-by: Bart Van Assche Cc: Johannes Thumshirn Cc: Hannes Reinecke Cc: Ming Lei Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 103 ++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 37 deletions(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index b8a08b82b882..83e31b81db3c 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -302,6 +302,12 @@ static int nullb_apply_submit_queues(struct nullb_device *dev, if (!nullb) return 0; + /* + * Make sure that null_init_hctx() does not access nullb->queues[] past + * the end of that array. + */ + if (submit_queues > nr_cpu_ids) + return -EINVAL; set = nullb->tag_set; blk_mq_update_nr_hw_queues(set, submit_queues); return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM; @@ -1408,12 +1414,6 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); } -static const struct blk_mq_ops null_mq_ops = { - .queue_rq = null_queue_rq, - .complete = null_complete_rq, - .timeout = null_timeout_rq, -}; - static void cleanup_queue(struct nullb_queue *nq) { kfree(nq->tag_map); @@ -1430,6 +1430,43 @@ static void cleanup_queues(struct nullb *nullb) kfree(nullb->queues); } +static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nullb_queue *nq = hctx->driver_data; + struct nullb *nullb = nq->dev->nullb; + + nullb->nr_queues--; +} + +static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) +{ + init_waitqueue_head(&nq->wait); + nq->queue_depth = nullb->queue_depth; + nq->dev = nullb->dev; +} + +static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, + unsigned int hctx_idx) +{ + struct nullb *nullb = hctx->queue->queuedata; + struct nullb_queue *nq; + + nq = &nullb->queues[hctx_idx]; + hctx->driver_data = nq; + null_init_queue(nullb, nq); + nullb->nr_queues++; + + return 0; +} + +static const struct blk_mq_ops null_mq_ops = { + .queue_rq = null_queue_rq, + .complete = null_complete_rq, + .timeout = null_timeout_rq, + .init_hctx = null_init_hctx, + .exit_hctx = null_exit_hctx, +}; + static void null_del_dev(struct nullb *nullb) { struct nullb_device *dev = nullb->dev; @@ -1473,33 +1510,6 @@ static const struct block_device_operations null_ops = { .report_zones = null_report_zones, }; -static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) -{ - BUG_ON(!nullb); - BUG_ON(!nq); - - init_waitqueue_head(&nq->wait); - nq->queue_depth = nullb->queue_depth; - nq->dev = nullb->dev; -} - -static void null_init_queues(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - struct blk_mq_hw_ctx *hctx; - struct nullb_queue *nq; - int i; - - queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx->nr_ctx || !hctx->tags) - continue; - nq = &nullb->queues[i]; - hctx->driver_data = nq; - null_init_queue(nullb, nq); - nullb->nr_queues++; - } -} - static int setup_commands(struct nullb_queue *nq) { struct nullb_cmd *cmd; @@ -1526,8 +1536,7 @@ static int setup_commands(struct nullb_queue *nq) static int setup_queues(struct nullb *nullb) { - nullb->queues = kcalloc(nullb->dev->submit_queues, - sizeof(struct nullb_queue), + nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), GFP_KERNEL); if (!nullb->queues) return -ENOMEM; @@ -1673,6 +1682,27 @@ static bool null_setup_fault(void) return true; } +/* + * This function is identical to blk_mq_init_queue() except that it sets + * queuedata before .init_hctx is called. + */ +static struct request_queue *nullb_alloc_queue(struct nullb *nullb) +{ + struct request_queue *uninit_q, *q; + struct blk_mq_tag_set *set = nullb->tag_set; + + uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); + if (!uninit_q) + return ERR_PTR(-ENOMEM); + + uninit_q->queuedata = nullb; + q = blk_mq_init_allocated_queue(set, uninit_q, false); + if (IS_ERR(q)) + blk_cleanup_queue(uninit_q); + + return q; +} + static int null_add_dev(struct nullb_device *dev) { struct nullb *nullb; @@ -1712,12 +1742,11 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_queues; nullb->tag_set->timeout = 5 * HZ; - nullb->q = blk_mq_init_queue(nullb->tag_set); + nullb->q = nullb_alloc_queue(nullb); if (IS_ERR(nullb->q)) { rv = -ENOMEM; goto out_cleanup_tags; } - null_init_queues(nullb); } else if (dev->queue_mode == NULL_Q_BIO) { nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node); if (!nullb->q) { From 2004bfdef945fe55196db6b9cdf321fbc75bb0de Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:21 -0700 Subject: [PATCH 06/72] null_blk: Fix the null_add_dev() error path If null_add_dev() fails, clear dev->nullb. This patch fixes the following KASAN complaint: BUG: KASAN: use-after-free in nullb_device_submit_queues_store+0xcf/0x160 [null_blk] Read of size 8 at addr ffff88803280fc30 by task check/8409 Call Trace: dump_stack+0xa5/0xe6 print_address_description.constprop.0+0x26/0x260 __kasan_report.cold+0x7b/0x99 kasan_report+0x16/0x20 __asan_load8+0x58/0x90 nullb_device_submit_queues_store+0xcf/0x160 [null_blk] configfs_write_file+0x1c4/0x250 [configfs] __vfs_write+0x4c/0x90 vfs_write+0x145/0x2c0 ksys_write+0xd7/0x180 __x64_sys_write+0x47/0x50 do_syscall_64+0x6f/0x2f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7ff370926317 Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007fff2dd2da48 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007ff370926317 RDX: 0000000000000002 RSI: 0000559437ef23f0 RDI: 0000000000000001 RBP: 0000559437ef23f0 R08: 000000000000000a R09: 0000000000000001 R10: 0000559436703471 R11: 0000000000000246 R12: 0000000000000002 R13: 00007ff370a006a0 R14: 00007ff370a014a0 R15: 00007ff370a008a0 Allocated by task 8409: save_stack+0x23/0x90 __kasan_kmalloc.constprop.0+0xcf/0xe0 kasan_kmalloc+0xd/0x10 kmem_cache_alloc_node_trace+0x129/0x4c0 null_add_dev+0x24a/0xe90 [null_blk] nullb_device_power_store+0x1b6/0x270 [null_blk] configfs_write_file+0x1c4/0x250 [configfs] __vfs_write+0x4c/0x90 vfs_write+0x145/0x2c0 ksys_write+0xd7/0x180 __x64_sys_write+0x47/0x50 do_syscall_64+0x6f/0x2f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 8409: save_stack+0x23/0x90 __kasan_slab_free+0x112/0x160 kasan_slab_free+0x12/0x20 kfree+0xdf/0x250 null_add_dev+0xaf3/0xe90 [null_blk] nullb_device_power_store+0x1b6/0x270 [null_blk] configfs_write_file+0x1c4/0x250 [configfs] __vfs_write+0x4c/0x90 vfs_write+0x145/0x2c0 ksys_write+0xd7/0x180 __x64_sys_write+0x47/0x50 do_syscall_64+0x6f/0x2f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 2984c8684f96 ("nullb: factor disk parameters") Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Cc: Johannes Thumshirn Cc: Hannes Reinecke Cc: Ming Lei Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 83e31b81db3c..80d3dcb8722c 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1817,6 +1817,7 @@ out_cleanup_queues: cleanup_queues(nullb); out_free_nullb: kfree(nullb); + dev->nullb = NULL; out: return rv; } From 9b03b713082a31a5b90e0a893c72aa620e255c26 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:22 -0700 Subject: [PATCH 07/72] null_blk: Handle null_add_dev() failures properly If null_add_dev() fails then null_del_dev() is called with a NULL argument. Make null_del_dev() handle this scenario correctly. This patch fixes the following KASAN complaint: null-ptr-deref in null_del_dev+0x28/0x280 [null_blk] Read of size 8 at addr 0000000000000000 by task find/1062 Call Trace: dump_stack+0xa5/0xe6 __kasan_report.cold+0x65/0x99 kasan_report+0x16/0x20 __asan_load8+0x58/0x90 null_del_dev+0x28/0x280 [null_blk] nullb_group_drop_item+0x7e/0xa0 [null_blk] client_drop_item+0x53/0x80 [configfs] configfs_rmdir+0x395/0x4e0 [configfs] vfs_rmdir+0xb6/0x220 do_rmdir+0x238/0x2c0 __x64_sys_unlinkat+0x75/0x90 do_syscall_64+0x6f/0x2f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Cc: Johannes Thumshirn Cc: Hannes Reinecke Cc: Ming Lei Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 80d3dcb8722c..d21c82c8863f 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1469,7 +1469,12 @@ static const struct blk_mq_ops null_mq_ops = { static void null_del_dev(struct nullb *nullb) { - struct nullb_device *dev = nullb->dev; + struct nullb_device *dev; + + if (!nullb) + return; + + dev = nullb->dev; ida_simple_remove(&nullb_indexes, nullb->index); From 596444e7570587867924c3ab025183b1a8726897 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:23 -0700 Subject: [PATCH 08/72] null_blk: Add support for init_hctx() fault injection This makes it possible to test the error path in blk_mq_realloc_hw_ctxs() and also several error paths in null_blk. Signed-off-by: Bart Van Assche Cc: Johannes Thumshirn Cc: Hannes Reinecke Cc: Ming Lei Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index d21c82c8863f..89bb16a99007 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -23,6 +23,7 @@ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static DECLARE_FAULT_ATTR(null_timeout_attr); static DECLARE_FAULT_ATTR(null_requeue_attr); +static DECLARE_FAULT_ATTR(null_init_hctx_attr); #endif static inline u64 mb_per_tick(int mbps) @@ -101,6 +102,9 @@ module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); static char g_requeue_str[80]; module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); + +static char g_init_hctx_str[80]; +module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); #endif static int g_queue_mode = NULL_Q_MQ; @@ -1451,6 +1455,11 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, struct nullb *nullb = hctx->queue->queuedata; struct nullb_queue *nq; +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) + return -EFAULT; +#endif + nq = &nullb->queues[hctx_idx]; hctx->driver_data = nq; null_init_queue(nullb, nq); @@ -1683,6 +1692,8 @@ static bool null_setup_fault(void) return false; if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) return false; + if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) + return false; #endif return true; } From 30a2da7b7e225ef6c87a660419ea04d3cef3f6a7 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Wed, 11 Mar 2020 16:07:50 +0530 Subject: [PATCH 09/72] block: Fix use-after-free issue accessing struct io_cq There is a potential race between ioc_release_fn() and ioc_clear_queue() as shown below, due to which below kernel crash is observed. It also can result into use-after-free issue. context#1: context#2: ioc_release_fn() __ioc_clear_queue() gets the same icq ->spin_lock(&ioc->lock); ->spin_lock(&ioc->lock); ->ioc_destroy_icq(icq); ->list_del_init(&icq->q_node); ->call_rcu(&icq->__rcu_head, icq_free_icq_rcu); ->spin_unlock(&ioc->lock); ->ioc_destroy_icq(icq); ->hlist_del_init(&icq->ioc_node); This results into below crash as this memory is now used by icq->__rcu_head in context#1. There is a chance that icq could be free'd as well. 22150.386550: <6> Unable to handle kernel write to read-only memory at virtual address ffffffaa8d31ca50 ... Call trace: 22150.607350: <2> ioc_destroy_icq+0x44/0x110 22150.611202: <2> ioc_clear_queue+0xac/0x148 22150.615056: <2> blk_cleanup_queue+0x11c/0x1a0 22150.619174: <2> __scsi_remove_device+0xdc/0x128 22150.623465: <2> scsi_forget_host+0x2c/0x78 22150.627315: <2> scsi_remove_host+0x7c/0x2a0 22150.631257: <2> usb_stor_disconnect+0x74/0xc8 22150.635371: <2> usb_unbind_interface+0xc8/0x278 22150.639665: <2> device_release_driver_internal+0x198/0x250 22150.644897: <2> device_release_driver+0x24/0x30 22150.649176: <2> bus_remove_device+0xec/0x140 22150.653204: <2> device_del+0x270/0x460 22150.656712: <2> usb_disable_device+0x120/0x390 22150.660918: <2> usb_disconnect+0xf4/0x2e0 22150.664684: <2> hub_event+0xd70/0x17e8 22150.668197: <2> process_one_work+0x210/0x480 22150.672222: <2> worker_thread+0x32c/0x4c8 Fix this by adding a new ICQ_DESTROYED flag in ioc_destroy_icq() to indicate this icq is once marked as destroyed. Also, ensure __ioc_clear_queue() is accessing icq within rcu_read_lock/unlock so that icq doesn't get free'd up while it is still using it. Signed-off-by: Sahitya Tummala Co-developed-by: Pradeep P V K Signed-off-by: Pradeep P V K Signed-off-by: Jens Axboe --- block/blk-ioc.c | 7 +++++++ include/linux/iocontext.h | 1 + 2 files changed, 8 insertions(+) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 5ed59ac6ae58..9df50fb507ca 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -84,6 +84,7 @@ static void ioc_destroy_icq(struct io_cq *icq) * making it impossible to determine icq_cache. Record it in @icq. */ icq->__rcu_icq_cache = et->icq_cache; + icq->flags |= ICQ_DESTROYED; call_rcu(&icq->__rcu_head, icq_free_icq_rcu); } @@ -212,15 +213,21 @@ static void __ioc_clear_queue(struct list_head *icq_list) { unsigned long flags; + rcu_read_lock(); while (!list_empty(icq_list)) { struct io_cq *icq = list_entry(icq_list->next, struct io_cq, q_node); struct io_context *ioc = icq->ioc; spin_lock_irqsave(&ioc->lock, flags); + if (icq->flags & ICQ_DESTROYED) { + spin_unlock_irqrestore(&ioc->lock, flags); + continue; + } ioc_destroy_icq(icq); spin_unlock_irqrestore(&ioc->lock, flags); } + rcu_read_unlock(); } /** diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index dba15ca8e60b..1dcd9198beb7 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -8,6 +8,7 @@ enum { ICQ_EXITED = 1 << 2, + ICQ_DESTROYED = 1 << 3, }; /* From 0d72031820a710c82fde4ed61352ff2a4217e6dd Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:33 +0100 Subject: [PATCH 10/72] block: fix comment for blk_cloned_rq_check_limits Since the later description mentioned "checked against the new queue limits", so make the change to avoid confusion. Signed-off-by: Guoqing Jiang Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 60dc9552ef8d..e26341aa2e3a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1203,7 +1203,7 @@ EXPORT_SYMBOL(submit_bio); /** * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for new the queue limits + * for the new queue limits * @q: the queue * @rq: the request being checked * From 35ed78b32cbbb6499b82e7a3a6769fa14e4b3c92 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:34 +0100 Subject: [PATCH 11/72] block: use bio_{wouldblock,io}_error in direct_make_request Use the two functions to simplify code. Signed-off-by: Guoqing Jiang Reviewed-by: Nikolay Borisov Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index e26341aa2e3a..9a78f62285f9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1121,10 +1121,9 @@ blk_qc_t direct_make_request(struct bio *bio) if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) - bio->bi_status = BLK_STS_AGAIN; + bio_wouldblock_error(bio); else - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); return BLK_QC_T_NONE; } From fc4cc772102511de90e708e72754800686aa0043 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:35 +0100 Subject: [PATCH 12/72] block: remove redundant setting of QUEUE_FLAG_DYING Previously, blk_cleanup_queue has called blk_set_queue_dying to set the flag, no need to do it again. Signed-off-by: Guoqing Jiang Reviewed-by: Nikolay Borisov Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9a78f62285f9..74edcadd6747 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -346,7 +346,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - blk_queue_flag_set(QUEUE_FLAG_DYING, q); /* * Drain all requests queued before DYING marking. Set DEAD flag to From 361301a222193c85bc53dbe64770271a4d4ff7f4 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:36 +0100 Subject: [PATCH 13/72] block: cleanup for _blk/blk_rq_prep_clone Both cmd and sense had been moved to scsi_request, so remove the related comments to avoid confusion. And as Bart suggested, move _blk_rq_prep_clone into the only caller (blk_rq_prep_clone). Signed-off-by: Guoqing Jiang Signed-off-by: Jens Axboe --- block/blk-core.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 74edcadd6747..abfdcf81a228 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1581,23 +1581,6 @@ void blk_rq_unprep_clone(struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); -/* - * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->sense) are not copied. - */ -static void __blk_rq_prep_clone(struct request *dst, struct request *src) -{ - dst->__sector = blk_rq_pos(src); - dst->__data_len = blk_rq_bytes(src); - if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { - dst->rq_flags |= RQF_SPECIAL_PAYLOAD; - dst->special_vec = src->special_vec; - } - dst->nr_phys_segments = src->nr_phys_segments; - dst->ioprio = src->ioprio; - dst->extra_len = src->extra_len; -} - /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup @@ -1610,8 +1593,6 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->sense) - * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means @@ -1642,7 +1623,16 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->bio = rq->biotail = bio; } - __blk_rq_prep_clone(rq, rq_src); + /* Copy attributes of the original request to the clone request. */ + rq->__sector = blk_rq_pos(rq_src); + rq->__data_len = blk_rq_bytes(rq_src); + if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->special_vec = rq_src->special_vec; + } + rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->ioprio = rq_src->ioprio; + rq->extra_len = rq_src->extra_len; return 0; From 754a15726f8d82afa87076505ce00a6a5806a48f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:37 +0100 Subject: [PATCH 14/72] block: remove unneeded argument from blk_alloc_flush_queue Remove 'q' from arguments since it is not used anymore after commit 7e992f847a08e ("block: remove non mq parts from the flush code"). Signed-off-by: Guoqing Jiang Reviewed-by: Nikolay Borisov Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 ++-- block/blk-mq.c | 3 +-- block/blk.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 5cc775bdb06a..7f7f98305115 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -485,8 +485,8 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, - int node, int cmd_size, gfp_t flags) +struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, + gfp_t flags) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); diff --git a/block/blk-mq.c b/block/blk-mq.c index 37ff8dfb8ab9..5b2e6550e0b6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2409,8 +2409,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); - hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size, - gfp); + hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) goto free_bitmap; diff --git a/block/blk.h b/block/blk.h index 0b8884353f6b..670337b7cfa0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,8 +55,8 @@ is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) return hctx->fq->flush_rq == req; } -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, - int node, int cmd_size, gfp_t flags); +struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, + gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); From ce24f736f2e047d1489dc51f0aa66d5a6c5dfb12 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Mar 2020 22:41:38 +0100 Subject: [PATCH 15/72] block: cleanup comment for blk_flush_complete_seq Remove the comment about return value, since it is not valid after commit 404b8f5a03d84 ("block: cleanup kick/queued handling"). Signed-off-by: Guoqing Jiang Reviewed-by: Nikolay Borisov Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-flush.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 7f7f98305115..843d25683691 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -160,9 +160,6 @@ static void blk_account_io_flush(struct request *rq) * * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) - * - * RETURNS: - * %true if requests were added to the dispatch queue, %false otherwise. */ static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, From 9243c6f3e012a92dd900d97ef45efaf8a8edc448 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Sat, 7 Mar 2020 15:56:59 +0100 Subject: [PATCH 16/72] block: Document genhd capability flags The kernel documentation includes a brief section about genhd capabilities, but it turns out that the only documented capability (GENHD_FL_MEDIA_CHANGE_NOTIFY) isn't used any more. This patch removes that flag, and documents the rest, based on my understanding of the current uses of these flags in the kernel. The documentation is kept in the header file, alongside the declarations, in the hope that it will be kept up-to-date in future; the kernel documentation is changed to include the documentation generated from the header file. Because the ultimate goal is to provide some end-user documentation (or end-administrator documentation), the comments are perhaps more user-oriented than might be expected. Since the values are shown to users in hexadecimal, the documentation lists them in hexadecimal, and the constant declarations are adjusted to match. Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Stephen Kitt Signed-off-by: Jens Axboe --- Documentation/block/capability.rst | 16 ++----- include/linux/genhd.h | 69 +++++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 23 deletions(-) diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst index 2cf258d64bbe..160a5148b915 100644 --- a/Documentation/block/capability.rst +++ b/Documentation/block/capability.rst @@ -2,17 +2,9 @@ Generic Block Device Capability =============================== -This file documents the sysfs file block//capability +This file documents the sysfs file ``block//capability``. -capability is a hex word indicating which capabilities a specific disk -supports. For more information on bits not listed here, see -include/linux/genhd.h +``capability`` is a bitfield, printed in hexadecimal, indicating which +capabilities a specific block device supports: -GENHD_FL_MEDIA_CHANGE_NOTIFY ----------------------------- - -Value: 4 - -When this bit is set, the disk supports Asynchronous Notification -of media change events. These events will be broadcast to user -space via kernel uevent. +.. kernel-doc:: include/linux/genhd.h diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6fbe58538ad6..e7244fb6b6ad 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -133,17 +133,64 @@ struct hd_struct { struct rcu_work rcu_work; }; -#define GENHD_FL_REMOVABLE 1 -/* 2 is unused */ -#define GENHD_FL_MEDIA_CHANGE_NOTIFY 4 -#define GENHD_FL_CD 8 -#define GENHD_FL_UP 16 -#define GENHD_FL_SUPPRESS_PARTITION_INFO 32 -#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ -#define GENHD_FL_NATIVE_CAPACITY 128 -#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 -#define GENHD_FL_NO_PART_SCAN 512 -#define GENHD_FL_HIDDEN 1024 +/** + * DOC: genhd capability flags + * + * ``GENHD_FL_REMOVABLE`` (0x0001): indicates that the block device + * gives access to removable media. + * When set, the device remains present even when media is not + * inserted. + * Must not be set for devices which are removed entirely when the + * media is removed. + * + * ``GENHD_FL_CD`` (0x0008): the block device is a CD-ROM-style + * device. + * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl. + * + * ``GENHD_FL_UP`` (0x0010): indicates that the block device is "up", + * with a similar meaning to network interfaces. + * + * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include + * partition information in ``/proc/partitions`` or in the output of + * printk_all_partitions(). + * Used for the null block device and some MMC devices. + * + * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended + * dynamic ``dev_t``, i.e. it wants extended device numbers + * (``BLOCK_EXT_MAJOR``). + * This affects the maximum number of partitions. + * + * ``GENHD_FL_NATIVE_CAPACITY`` (0x0080): based on information in the + * partition table, the device's capacity has been extended to its + * native capacity; i.e. the device has hidden capacity used by one + * of the partitions (this is a flag used so that native capacity is + * only ever unlocked once). + * + * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is + * blocked whenever a writer holds an exclusive lock. + * + * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled. + * Used for loop devices in their default settings and some MMC + * devices. + * + * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it + * doesn't produce events, doesn't appear in sysfs, and doesn't have + * an associated ``bdev``. + * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and + * ``GENHD_FL_NO_PART_SCAN``. + * Used for multipath devices. + */ +#define GENHD_FL_REMOVABLE 0x0001 +/* 2 is unused (used to be GENHD_FL_DRIVERFS) */ +/* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ +#define GENHD_FL_CD 0x0008 +#define GENHD_FL_UP 0x0010 +#define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 +#define GENHD_FL_EXT_DEVT 0x0040 +#define GENHD_FL_NATIVE_CAPACITY 0x0080 +#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 0x0100 +#define GENHD_FL_NO_PART_SCAN 0x0200 +#define GENHD_FL_HIDDEN 0x0400 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ From 88d6041d070028ef31c52845966216004ebba3bb Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Tue, 3 Mar 2020 12:17:00 -0700 Subject: [PATCH 17/72] block: sed-opal: Change the check condition for regular session validity This patch changes the check condition for the validity/authentication of the session. 1. The Host Session Number(HSN) in the response should match the HSN for the session. 2. The TPER Session Number(TSN) can never be less than 4096 for a regular session. Reference: Section 3.2.2.1 of https://trustedcomputinggroup.org/wp-content/uploads/TCG_Storage_Opal_SSC_Application_Note_1-00_1-00-Final.pdf Section 3.3.7.1.1 of https://trustedcomputinggroup.org/wp-content/uploads/TCG_Storage_Architecture_Core_Spec_v2.01_r1.00.pdf Co-developed-by: Andrzej Jakowski Signed-off-by: Andrzej Jakowski Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/block/opal_proto.h b/block/opal_proto.h index 325cbba2465f..b486b3ec7dc4 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -36,6 +36,7 @@ enum opal_response_token { #define DTAERROR_NO_METHOD_STATUS 0x89 #define GENERIC_HOST_SESSION_NUM 0x41 +#define FIRST_TPER_SESSION_NUM 4096 #define TPER_SYNC_SUPPORTED 0x01 #define MBR_ENABLED_MASK 0x10 diff --git a/block/sed-opal.c b/block/sed-opal.c index 880cc57a5f6b..daafadbb88ca 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1056,7 +1056,7 @@ static int start_opal_session_cont(struct opal_dev *dev) hsn = response_get_u64(&dev->parsed, 4); tsn = response_get_u64(&dev->parsed, 5); - if (hsn == 0 && tsn == 0) { + if (hsn != GENERIC_HOST_SESSION_NUM || tsn < FIRST_TPER_SESSION_NUM) { pr_debug("Couldn't authenticate session\n"); return -EPERM; } From fa800d73c8d0d36b1f5929198371f421b69e610e Mon Sep 17 00:00:00 2001 From: Weiping Zhang Date: Thu, 27 Feb 2020 09:38:46 +0800 Subject: [PATCH 18/72] blk-iocost: remove duplicated lines in comments Acked-by: Tejun Heo Signed-off-by: Weiping Zhang Signed-off-by: Jens Axboe --- block/blk-iocost.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 27ca68621137..6a7788f31c22 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -46,9 +46,6 @@ * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate * device-specific coefficients. * - * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate - * device-specific coefficients. - * * 2. Control Strategy * * The device virtual time (vtime) is used as the primary control metric. From 11bde986002c0af67eb92d73321d06baefae7128 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 12 Feb 2020 20:40:27 +0300 Subject: [PATCH 19/72] block, zoned: fix integer overflow with BLKRESETZONE et al Check for overflow in addition before checking for end-of-block-device. Steps to reproduce: #define _GNU_SOURCE 1 #include #include #include #include typedef unsigned long long __u64; struct blk_zone_range { __u64 sector; __u64 nr_sectors; }; #define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) int main(void) { int fd = open("/dev/nullb0", O_RDWR|O_DIRECT); struct blk_zone_range zr = {4096, 0xfffffffffffff000ULL}; ioctl(fd, BLKRESETZONE, &zr); return 0; } BUG: KASAN: null-ptr-deref in submit_bio_wait+0x74/0xe0 Write of size 8 at addr 0000000000000040 by task a.out/1590 CPU: 8 PID: 1590 Comm: a.out Not tainted 5.6.0-rc1-00019-g359c92c02bfa #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014 Call Trace: dump_stack+0x76/0xa0 __kasan_report.cold+0x5/0x3e kasan_report+0xe/0x20 submit_bio_wait+0x74/0xe0 blkdev_zone_mgmt+0x26f/0x2a0 blkdev_zone_mgmt_ioctl+0x14b/0x1b0 blkdev_ioctl+0xb28/0xe60 block_ioctl+0x69/0x80 ksys_ioctl+0x3af/0xa50 Reviewed-by: Christoph Hellwig Signed-off-by: Alexey Dobriyan (SK hynix) Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 05741c6f618b..6b442ae96499 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -173,7 +173,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, if (!op_is_zone_mgmt(op)) return -EOPNOTSUPP; - if (!nr_sectors || end_sector > capacity) + if (end_sector <= sector || end_sector > capacity) /* Out of range */ return -EINVAL; From d981cb5b9fa0da9da6a6c8b9ae7cbe789c3b3214 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Mar 2020 09:12:06 +0100 Subject: [PATCH 20/72] block: fix a device invalidation regression Historically we only set the capacity to zero for devices that support partitions (independ of actually having partitions created). Doing that is rather inconsistent, but changing it broke legacy udisks polling for legacy ide-cdrom devices. Use the crude a crude check for devices that either are non-removable or partitionable to get the sane behavior for most device while not breaking userspace for this particular setup. Fixes: a1548b674403 ("block: move rescan_partitions to fs/block_dev.c") Reported-by: He Zhe Signed-off-by: Christoph Hellwig Tested-by: He Zhe Signed-off-by: Jens Axboe --- fs/block_dev.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 69bf2fb6f7cd..9501880dff5e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1520,10 +1520,22 @@ rescan: if (ret) return ret; - if (invalidate) - set_capacity(disk, 0); - else if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); + /* + * Historically we only set the capacity to zero for devices that + * support partitions (independ of actually having partitions created). + * Doing that is rather inconsistent, but changing it broke legacy + * udisks polling for legacy ide-cdrom devices. Use the crude check + * below to get the sane behavior for most device while not breaking + * userspace for this particular setup. + */ + if (invalidate) { + if (disk_part_scan_enabled(disk) || + !(disk->flags & GENHD_FL_REMOVABLE)) + set_capacity(disk, 0); + } else { + if (disk->fops->revalidate_disk) + disk->fops->revalidate_disk(disk); + } check_disk_size_change(disk, bdev, !invalidate); From de6a78b601c529398ad1448e3bffcade1fcf5a70 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Mar 2020 11:43:36 +0800 Subject: [PATCH 21/72] block: Prevent hung_check firing during long sync IO submit_bio_wait() can be called from ioctl(BLKSECDISCARD), which may take long time to complete, as Salman mentioned, 4K BLKSECDISCARD takes up to 100 second on some devices. Also any block I/O operation that occurs after the BLKSECDISCARD is submitted will also potentially be affected by the hung task timeouts. Another report is that task hang can be observed when running mkfs over raid10 which takes a small max discard sectors limit because of chunk size. So prevent hung_check from firing by taking same approach used in blk_execute_rq(), and the wake-up interval is set as half the hung_check timer period, which keeps overhead low enough. Cc: Salman Qazi Cc: Jesse Barnes Cc: Bart Van Assche Link: https://lkml.org/lkml/2020/2/12/1193 Reported-by: Salman Qazi Reviewed-by: Jesse Barnes Reviewed-by: Salman Qazi Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 94d697217887..0985f3422556 100644 --- a/block/bio.c +++ b/block/bio.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "blk.h" @@ -1019,12 +1020,21 @@ static void submit_bio_wait_endio(struct bio *bio) int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + unsigned long hang_check; bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); - wait_for_completion_io(&done); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_io_timeout(&done, + hang_check * (HZ/2))) + ; + else + wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); } From e598a72faeb543599bdf0d930df3a71906404e6f Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 13 Mar 2020 05:30:05 +0000 Subject: [PATCH 22/72] block/genhd: Notify udev about capacity change Allow block/genhd to notify user space (via udev) about disk size changes using a new helper set_capacity_revalidate_and_notify(), which is a wrapper on top of set_capacity(). set_capacity_revalidate_and_notify() will only notify via udev if the current capacity or the target capacity is not zero and iff the capacity changes. Suggested-by: Christoph Hellwig Signed-off-by: Someswarudu Sangaraju Signed-off-by: Balbir Singh Reviewed-by: Bob Liu Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 24 ++++++++++++++++++++++++ include/linux/genhd.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index ff6268970ddc..6a60131baffa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -46,6 +46,30 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +/* + * Set disk capacity and notify if the size is not currently + * zero and will not be set to zero + */ +void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, + bool revalidate) +{ + sector_t capacity = get_capacity(disk); + + set_capacity(disk, size); + + if (revalidate) + revalidate_disk(disk); + + if (capacity != size && capacity != 0 && size != 0) { + char *envp[] = { "RESIZE=1", NULL }; + + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + } +} + +EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); + + void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { if (queue_is_mq(q)) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e7244fb6b6ad..3c1a816785c8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -508,6 +508,8 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); +extern void set_capacity_revalidate_and_notify(struct gendisk *disk, + sector_t size, bool revalidate); extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); /* drivers/char/random.c */ From 662155e2898dd1c3915e420378bb6c0826548e70 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 13 Mar 2020 05:30:06 +0000 Subject: [PATCH 23/72] virtio_blk.c: Convert to use set_capacity_revalidate_and_notify block/genhd provides set_capacity_revalidate_and_notify() for sending RESIZE notifications via uevents. Signed-off-by: Balbir Singh Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 54158766334b..c913ebb25a52 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -381,18 +381,15 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) cap_str_10, cap_str_2); - set_capacity(vblk->disk, capacity); + set_capacity_revalidate_and_notify(vblk->disk, capacity, true); } static void virtblk_config_changed_work(struct work_struct *work) { struct virtio_blk *vblk = container_of(work, struct virtio_blk, config_work); - char *envp[] = { "RESIZE=1", NULL }; virtblk_update_capacity(vblk, true); - revalidate_disk(vblk->disk); - kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp); } static void virtblk_config_changed(struct virtio_device *vdev) From 3cbc28bb902b8e3a8a216e21eee1a67dd474eb43 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 13 Mar 2020 05:30:07 +0000 Subject: [PATCH 24/72] xen-blkfront.c: Convert to use set_capacity_revalidate_and_notify block/genhd provides set_capacity_revalidate_and_notify() for sending RESIZE notifications via uevents. Signed-off-by: Balbir Singh Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9df516a56bb2..915cf5b6388c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2338,7 +2338,6 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int physical_sector_size; unsigned int binfo; - char *envp[] = { "RESIZE=1", NULL }; int err, i; struct blkfront_ring_info *rinfo; @@ -2354,10 +2353,7 @@ static void blkfront_connect(struct blkfront_info *info) return; printk(KERN_INFO "Setting capacity to %Lu\n", sectors); - set_capacity(info->gd, sectors); - revalidate_disk(info->gd); - kobject_uevent_env(&disk_to_dev(info->gd)->kobj, - KOBJ_CHANGE, envp); + set_capacity_revalidate_and_notify(info->gd, sectors, true); return; case BLKIF_STATE_SUSPENDED: From cb224c3af4df5996246eb779a23d968a8aabad28 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 13 Mar 2020 05:30:08 +0000 Subject: [PATCH 25/72] nvme: Convert to use set_capacity_revalidate_and_notify block/genhd provides set_capacity_revalidate_and_notify() for sending RESIZE notifications via uevents. This notification is newly added to NVME devices Signed-off-by: Balbir Singh Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a4d8c90ee7cc..41ad07f6a564 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1810,7 +1810,7 @@ static void nvme_update_disk_info(struct gendisk *disk, ns->lba_shift > PAGE_SHIFT) capacity = 0; - set_capacity(disk, capacity); + set_capacity_revalidate_and_notify(disk, capacity, false); nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); From 78317c5d58e625e60d1eaded37283d14dfcd1489 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 13 Mar 2020 05:30:09 +0000 Subject: [PATCH 26/72] scsi: Convert to use set_capacity_revalidate_and_notify block/genhd provides set_capacity_revalidate_and_notify() for sending RESIZE notifications via uevents. This notification is newly added to scsi sd. Signed-off-by: Balbir Singh Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 8ca9299ffd36..707f47c0ec98 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3187,7 +3187,8 @@ static int sd_revalidate_disk(struct gendisk *disk) sdkp->first_scan = 0; - set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity)); + set_capacity_revalidate_and_notify(disk, + logical_to_sectors(sdp, sdkp->capacity), false); sd_config_write_same(sdkp); kfree(buffer); From 2f95fa5c955d0a9987ffdc3a095e2f4e62c5f2a9 Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Thu, 19 Mar 2020 19:18:13 +0800 Subject: [PATCH 27/72] block, bfq: fix use-after-free in bfq_idle_slice_timer_body In bfq_idle_slice_timer func, bfqq = bfqd->in_service_queue is not in bfqd-lock critical section. The bfqq, which is not equal to NULL in bfq_idle_slice_timer, may be freed after passing to bfq_idle_slice_timer_body. So we will access the freed memory. In addition, considering the bfqq may be in race, we should firstly check whether bfqq is in service before doing something on it in bfq_idle_slice_timer_body func. If the bfqq in race is not in service, it means the bfqq has been expired through __bfq_bfqq_expire func, and wait_request flags has been cleared in __bfq_bfqd_reset_in_service func. So we do not need to re-clear the wait_request of bfqq which is not in service. KASAN log is given as follows: [13058.354613] ================================================================== [13058.354640] BUG: KASAN: use-after-free in bfq_idle_slice_timer+0xac/0x290 [13058.354644] Read of size 8 at addr ffffa02cf3e63f78 by task fork13/19767 [13058.354646] [13058.354655] CPU: 96 PID: 19767 Comm: fork13 [13058.354661] Call trace: [13058.354667] dump_backtrace+0x0/0x310 [13058.354672] show_stack+0x28/0x38 [13058.354681] dump_stack+0xd8/0x108 [13058.354687] print_address_description+0x68/0x2d0 [13058.354690] kasan_report+0x124/0x2e0 [13058.354697] __asan_load8+0x88/0xb0 [13058.354702] bfq_idle_slice_timer+0xac/0x290 [13058.354707] __hrtimer_run_queues+0x298/0x8b8 [13058.354710] hrtimer_interrupt+0x1b8/0x678 [13058.354716] arch_timer_handler_phys+0x4c/0x78 [13058.354722] handle_percpu_devid_irq+0xf0/0x558 [13058.354731] generic_handle_irq+0x50/0x70 [13058.354735] __handle_domain_irq+0x94/0x110 [13058.354739] gic_handle_irq+0x8c/0x1b0 [13058.354742] el1_irq+0xb8/0x140 [13058.354748] do_wp_page+0x260/0xe28 [13058.354752] __handle_mm_fault+0x8ec/0x9b0 [13058.354756] handle_mm_fault+0x280/0x460 [13058.354762] do_page_fault+0x3ec/0x890 [13058.354765] do_mem_abort+0xc0/0x1b0 [13058.354768] el0_da+0x24/0x28 [13058.354770] [13058.354773] Allocated by task 19731: [13058.354780] kasan_kmalloc+0xe0/0x190 [13058.354784] kasan_slab_alloc+0x14/0x20 [13058.354788] kmem_cache_alloc_node+0x130/0x440 [13058.354793] bfq_get_queue+0x138/0x858 [13058.354797] bfq_get_bfqq_handle_split+0xd4/0x328 [13058.354801] bfq_init_rq+0x1f4/0x1180 [13058.354806] bfq_insert_requests+0x264/0x1c98 [13058.354811] blk_mq_sched_insert_requests+0x1c4/0x488 [13058.354818] blk_mq_flush_plug_list+0x2d4/0x6e0 [13058.354826] blk_flush_plug_list+0x230/0x548 [13058.354830] blk_finish_plug+0x60/0x80 [13058.354838] read_pages+0xec/0x2c0 [13058.354842] __do_page_cache_readahead+0x374/0x438 [13058.354846] ondemand_readahead+0x24c/0x6b0 [13058.354851] page_cache_sync_readahead+0x17c/0x2f8 [13058.354858] generic_file_buffered_read+0x588/0xc58 [13058.354862] generic_file_read_iter+0x1b4/0x278 [13058.354965] ext4_file_read_iter+0xa8/0x1d8 [ext4] [13058.354972] __vfs_read+0x238/0x320 [13058.354976] vfs_read+0xbc/0x1c0 [13058.354980] ksys_read+0xdc/0x1b8 [13058.354984] __arm64_sys_read+0x50/0x60 [13058.354990] el0_svc_common+0xb4/0x1d8 [13058.354994] el0_svc_handler+0x50/0xa8 [13058.354998] el0_svc+0x8/0xc [13058.354999] [13058.355001] Freed by task 19731: [13058.355007] __kasan_slab_free+0x120/0x228 [13058.355010] kasan_slab_free+0x10/0x18 [13058.355014] kmem_cache_free+0x288/0x3f0 [13058.355018] bfq_put_queue+0x134/0x208 [13058.355022] bfq_exit_icq_bfqq+0x164/0x348 [13058.355026] bfq_exit_icq+0x28/0x40 [13058.355030] ioc_exit_icq+0xa0/0x150 [13058.355035] put_io_context_active+0x250/0x438 [13058.355038] exit_io_context+0xd0/0x138 [13058.355045] do_exit+0x734/0xc58 [13058.355050] do_group_exit+0x78/0x220 [13058.355054] __wake_up_parent+0x0/0x50 [13058.355058] el0_svc_common+0xb4/0x1d8 [13058.355062] el0_svc_handler+0x50/0xa8 [13058.355066] el0_svc+0x8/0xc [13058.355067] [13058.355071] The buggy address belongs to the object at ffffa02cf3e63e70#012 which belongs to the cache bfq_queue of size 464 [13058.355075] The buggy address is located 264 bytes inside of#012 464-byte region [ffffa02cf3e63e70, ffffa02cf3e64040) [13058.355077] The buggy address belongs to the page: [13058.355083] page:ffff7e80b3cf9800 count:1 mapcount:0 mapping:ffff802db5c90780 index:0xffffa02cf3e606f0 compound_mapcount: 0 [13058.366175] flags: 0x2ffffe0000008100(slab|head) [13058.370781] raw: 2ffffe0000008100 ffff7e80b53b1408 ffffa02d730c1c90 ffff802db5c90780 [13058.370787] raw: ffffa02cf3e606f0 0000000000370023 00000001ffffffff 0000000000000000 [13058.370789] page dumped because: kasan: bad access detected [13058.370791] [13058.370792] Memory state around the buggy address: [13058.370797] ffffa02cf3e63e00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fb fb [13058.370801] ffffa02cf3e63e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [13058.370805] >ffffa02cf3e63f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [13058.370808] ^ [13058.370811] ffffa02cf3e63f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [13058.370815] ffffa02cf3e64000: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [13058.370817] ================================================================== [13058.370820] Disabling lock debugging due to kernel taint Here, we directly pass the bfqd to bfq_idle_slice_timer_body func. -- V2->V3: rewrite the comment as suggested by Paolo Valente V1->V2: add one comment, and add Fixes and Reported-by tag. Fixes: aee69d78d ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Acked-by: Paolo Valente Reported-by: Wang Wang Signed-off-by: Zhiqiang Liu Signed-off-by: Feilong Lin Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 8c436abfaf14..4a44c7f19435 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6215,20 +6215,28 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) return bfqq; } -static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) +static void +bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; enum bfqq_expiration reason; unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); - bfq_clear_bfqq_wait_request(bfqq); + /* + * Considering that bfqq may be in race, we should firstly check + * whether bfqq is in service before doing something on it. If + * the bfqq in race is not in service, it has already been expired + * through __bfq_bfqq_expire func and its wait_request flags has + * been cleared in __bfq_bfqd_reset_in_service func. + */ if (bfqq != bfqd->in_service_queue) { spin_unlock_irqrestore(&bfqd->lock, flags); return; } + bfq_clear_bfqq_wait_request(bfqq); + if (bfq_bfqq_budget_timeout(bfqq)) /* * Also here the queue can be safely expired @@ -6273,7 +6281,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) * early. */ if (bfqq) - bfq_idle_slice_timer_body(bfqq); + bfq_idle_slice_timer_body(bfqd, bfqq); return HRTIMER_NORESTART; } From fd1bb3ae54a9a2e0c42709de861c69aa146b8955 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 21 Mar 2020 10:45:18 +0100 Subject: [PATCH 28/72] block, bfq: move forward the getting of an extra ref in bfq_bfqq_move Commit ecedd3d7e199 ("block, bfq: get extra ref to prevent a queue from being freed during a group move") gets an extra reference to a bfq_queue before possibly deactivating it (temporarily), in bfq_bfqq_move(). This prevents the bfq_queue from disappearing before being reactivated in its new group. Yet, the bfq_queue may also be expired (i.e., its service may be stopped) before the bfq_queue is deactivated. And also an expiration may lead to a premature freeing. This commit fixes this issue by simply moving forward the getting of the extra reference already introduced by commit ecedd3d7e199 ("block, bfq: get extra ref to prevent a queue from being freed during a group move"). Reported-by: cki-project@redhat.com Tested-by: cki-project@redhat.com Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index f0ff6654af28..9d963ed518d1 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -642,6 +642,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_entity *entity = &bfqq->entity; + /* + * Get extra reference to prevent bfqq from being freed in + * next possible expire or deactivate. + */ + bfqq->ref++; + /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. Otherwise we @@ -652,12 +658,6 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); - /* - * get extra reference to prevent bfqq from being freed in - * next possible deactivate - */ - bfqq->ref++; - if (bfq_bfqq_busy(bfqq)) bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) @@ -677,7 +677,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); - /* release extra ref taken above */ + /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } From c8997736650060594845e42c5d01d3118aec8d25 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 21 Mar 2020 10:45:19 +0100 Subject: [PATCH 29/72] block, bfq: turn put_queue into release_process_ref in __bfq_bic_change_cgroup A bfq_put_queue() may be invoked in __bfq_bic_change_cgroup(). The goal of this put is to release a process reference to a bfq_queue. But process-reference releases may trigger also some extra operation, and, to this goal, are handled through bfq_release_process_ref(). So, turn the invocation of bfq_put_queue() into an invocation of bfq_release_process_ref(). Tested-by: cki-project@redhat.com Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 5 +---- block/bfq-iosched.c | 2 -- block/bfq-iosched.h | 1 + 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9d963ed518d1..72c6151ace96 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -714,10 +714,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, if (entity->sched_data != &bfqg->sched_data) { bic_set_bfqq(bic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, - "bic_change_group: %p %d", - async_bfqq, async_bfqq->ref); - bfq_put_queue(async_bfqq); + bfq_release_process_ref(bfqd, async_bfqq); } } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4a44c7f19435..78ba57efd16b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2716,8 +2716,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } } - -static void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index d1233af9c684..cd224aaf9f52 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -955,6 +955,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_schedule_dispatch(struct bfq_data *bfqd); void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); From 576682fa52cbd95deb3773449566274f206acc58 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 21 Mar 2020 10:45:20 +0100 Subject: [PATCH 30/72] block, bfq: make reparent_leaf_entity actually work only on leaf entities bfq_reparent_leaf_entity() reparents the input leaf entity (a leaf entity represents just a bfq_queue in an entity tree). Yet, the input entity is guaranteed to always be a leaf entity only in two-level entity trees. In this respect, because of the error fixed by commit 14afc5936197 ("block, bfq: fix overwrite of bfq_group pointer in bfq_find_set_group()"), all (wrongly collapsed) entity trees happened to actually have only two levels. After the latter commit, this does not hold any longer. This commit fixes this problem by modifying bfq_reparent_leaf_entity(), so that it searches an active leaf entity down the path that stems from the input entity. Such a leaf entity is guaranteed to exist when bfq_reparent_leaf_entity() is invoked. Tested-by: cki-project@redhat.com Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 48 ++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 72c6151ace96..efb89db7ba24 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -815,39 +815,53 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) /** * bfq_reparent_leaf_entity - move leaf entity to the root_group. * @bfqd: the device data structure with the root group. - * @entity: the entity to move. + * @entity: the entity to move, if entity is a leaf; or the parent entity + * of an active leaf entity to move, if entity is not a leaf. */ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) + struct bfq_entity *entity, + int ioprio_class) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_queue *bfqq; + struct bfq_entity *child_entity = entity; + while (child_entity->my_sched_data) { /* leaf not reached yet */ + struct bfq_sched_data *child_sd = child_entity->my_sched_data; + struct bfq_service_tree *child_st = child_sd->service_tree + + ioprio_class; + struct rb_root *child_active = &child_st->active; + + child_entity = bfq_entity_of(rb_first(child_active)); + + if (!child_entity) + child_entity = child_sd->in_service_entity; + } + + bfqq = bfq_entity_to_bfqq(child_entity); bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** - * bfq_reparent_active_entities - move to the root group all active - * entities. + * bfq_reparent_active_queues - move to the root group all active queues. * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. - * @st: the service tree with the entities. + * @st: the service tree to start the search from. */ -static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) +static void bfq_reparent_active_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st, + int ioprio_class) { struct rb_root *active = &st->active; - struct bfq_entity *entity = NULL; + struct bfq_entity *entity; - if (!RB_EMPTY_ROOT(&st->active)) - entity = bfq_entity_of(rb_first(active)); - - for (; entity ; entity = bfq_entity_of(rb_first(active))) - bfq_reparent_leaf_entity(bfqd, entity); + while ((entity = bfq_entity_of(rb_first(active)))) + bfq_reparent_leaf_entity(bfqd, entity, ioprio_class); if (bfqg->sched_data.in_service_entity) bfq_reparent_leaf_entity(bfqd, - bfqg->sched_data.in_service_entity); + bfqg->sched_data.in_service_entity, + ioprio_class); } /** @@ -898,7 +912,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) * There is no need to put the sync queues, as the * scheduler has taken no reference. */ - bfq_reparent_active_entities(bfqd, bfqg, st); + bfq_reparent_active_queues(bfqd, bfqg, st, i); } __bfq_deactivate_entity(entity, false); From 4d38a87fbb77fb9ff2ff4e914162a8ae6453eff5 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 21 Mar 2020 10:45:21 +0100 Subject: [PATCH 31/72] block, bfq: invoke flush_idle_tree after reparent_active_queues in pd_offline In bfq_pd_offline(), the function bfq_flush_idle_tree() is invoked to flush the rb tree that contains all idle entities belonging to the pd (cgroup) being destroyed. In particular, bfq_flush_idle_tree() is invoked before bfq_reparent_active_queues(). Yet the latter may happen to add some entities to the idle tree. It happens if, in some of the calls to bfq_bfqq_move() performed by bfq_reparent_active_queues(), the queue to move is empty and gets expired. This commit simply reverses the invocation order between bfq_flush_idle_tree() and bfq_reparent_active_queues(). Tested-by: cki-project@redhat.com Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index efb89db7ba24..68882b9b8f11 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -893,13 +893,6 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { st = bfqg->sched_data.service_tree + i; - /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different - * cgroup from the one being destroyed now. - */ - bfq_flush_idle_tree(st); - /* * It may happen that some queues are still active * (busy) upon group destruction (if the corresponding @@ -913,6 +906,19 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) * scheduler has taken no reference. */ bfq_reparent_active_queues(bfqd, bfqg, st, i); + + /* + * The idle tree may still contain bfq_queues + * belonging to exited task because they never + * migrated to a different cgroup from the one being + * destroyed now. In addition, even + * bfq_reparent_active_queues() may happen to add some + * entities to the idle tree. It happens if, in some + * of the calls to bfq_bfqq_move() performed by + * bfq_reparent_active_queues(), the queue to move is + * empty and gets expired. + */ + bfq_flush_idle_tree(st); } __bfq_deactivate_entity(entity, false); From d2332c5c040bc49c6e23426106c468cfa500d873 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:10 +0100 Subject: [PATCH 32/72] block: remove the blk_lookup_devt export This function is only used by init/do_mounts.c, which can't be modular. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 6a60131baffa..c5d20a48b4de 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1487,7 +1487,6 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_exit(&iter); return devt; } -EXPORT_SYMBOL(blk_lookup_devt); struct gendisk *__alloc_disk_node(int minors, int node_id) { From ea3edd4dc23027083fbb4a73b65114d08fe73a76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:11 +0100 Subject: [PATCH 33/72] block: remove __bdevname There is no good reason for __bdevname to exist. Just open code printing the string in the callers. For three of them the format string can be trivially merged into existing printk statements, and in init/do_mounts.c we can at least do the scnprintf once at the start of the function, and unconditional of CONFIG_BLOCK to make the output for tiny configfs a little more helpful. Acked-by: Theodore Ts'o # for ext4 Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 14 -------------- drivers/md/md.c | 4 ++-- fs/ext4/super.c | 6 +++--- fs/reiserfs/journal.c | 5 ++--- include/linux/fs.h | 1 - init/do_mounts.c | 12 ++---------- 6 files changed, 9 insertions(+), 33 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 564fae77711d..98256e6beabb 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -57,20 +57,6 @@ const char *bio_devname(struct bio *bio, char *buf) } EXPORT_SYMBOL(bio_devname); -/* - * There's very little reason to use this, you should really - * have a struct block_device just about everywhere and use - * bdevname() instead. - */ -const char *__bdevname(dev_t dev, char *buffer) -{ - scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", - MAJOR(dev), MINOR(dev)); - return buffer; -} - -EXPORT_SYMBOL(__bdevname); - static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 469f551863be..8238fb64f87f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2491,12 +2491,12 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, shared ? (struct md_rdev *)lock_rdev : rdev); if (IS_ERR(bdev)) { - pr_warn("md: could not open %s.\n", __bdevname(dev, b)); + pr_warn("md: could not open device unknown-block(%u,%u).\n", + MAJOR(dev), MINOR(dev)); return PTR_ERR(bdev); } rdev->bdev = bdev; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c7c4adb664e..4cbd1cc34dfa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -927,7 +927,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) @@ -935,8 +934,9 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) return bdev; fail: - ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", - __bdevname(dev, b), PTR_ERR(bdev)); + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); return NULL; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 072156c4f895..5c766330e493 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2599,7 +2599,6 @@ static int journal_init_dev(struct super_block *super, int result; dev_t jdev; fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; - char b[BDEVNAME_SIZE]; result = 0; @@ -2621,8 +2620,8 @@ static int journal_init_dev(struct super_block *super, result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; reiserfs_warning(super, "sh-458", - "cannot init journal device '%s': %i", - __bdevname(jdev, b), result); + "cannot init journal device unknown-block(%u,%u): %i", + MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) set_blocksize(journal->j_dev_bd, super->s_blocksize); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3cd4fe6b845e..561b35e3b95b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2699,7 +2699,6 @@ static inline void unregister_chrdev(unsigned int major, const char *name) #ifdef CONFIG_BLOCK #define BLKDEV_MAJOR_MAX 512 -extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); extern void blkdev_show(struct seq_file *,off_t); diff --git a/init/do_mounts.c b/init/do_mounts.c index 0ae9cc22f2ae..29d326b6c29d 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -429,12 +429,10 @@ void __init mount_block_root(char *name, int flags) struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); char *p; -#ifdef CONFIG_BLOCK char b[BDEVNAME_SIZE]; -#else - const char *b = name; -#endif + scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)", + MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); get_fs_names(fs_names); retry: for (p = fs_names; *p; p += strlen(p)+1) { @@ -451,9 +449,6 @@ retry: * and bad superblock on root device. * and give them a list of the available devices */ -#ifdef CONFIG_BLOCK - __bdevname(ROOT_DEV, b); -#endif printk("VFS: Cannot open root device \"%s\" or %s: error %d\n", root_device_name, b, err); printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); @@ -476,9 +471,6 @@ retry: for (p = fs_names; *p; p += strlen(p)+1) printk(" %s", p); printk("\n"); -#ifdef CONFIG_BLOCK - __bdevname(ROOT_DEV, b); -#endif panic("VFS: Unable to mount root fs on %s", b); out: put_page(page); From 5cbd28e3cef14b43b2a8271d36b75fc61c13bb8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:12 +0100 Subject: [PATCH 34/72] block: move disk_name and related helpers out of partition-generic.c Thes functions aren't really related to partition support, so move them to a more suitable place. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 6 ++++++ block/genhd.c | 21 +++++++++++++++++++++ block/partition-generic.c | 32 -------------------------------- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/block/bio.c b/block/bio.c index 0985f3422556..209715765a7a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -680,6 +680,12 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) } EXPORT_SYMBOL(bio_clone_fast); +const char *bio_devname(struct bio *bio, char *buf) +{ + return disk_name(bio->bi_disk, bio->bi_partno, buf); +} +EXPORT_SYMBOL(bio_devname); + static inline bool page_is_mergeable(const struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off, bool *same_page) diff --git a/block/genhd.c b/block/genhd.c index c5d20a48b4de..2484348d1850 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -69,6 +69,27 @@ void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); +/* + * Format the device name of the indicated disk into the supplied buffer and + * return a pointer to that same buffer for convenience. + */ +char *disk_name(struct gendisk *hd, int partno, char *buf) +{ + if (!partno) + snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); + else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); + else + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + + return buf; +} + +const char *bdevname(struct block_device *bdev, char *buf) +{ + return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); +} +EXPORT_SYMBOL(bdevname); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { diff --git a/block/partition-generic.c b/block/partition-generic.c index 98256e6beabb..6bf5aec2a0dc 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -25,38 +25,6 @@ extern void md_autodetect_dev(dev_t dev); #endif -/* - * disk_name() is used by partition check code and the genhd driver. - * It formats the devicename of the indicated disk into - * the supplied buffer (of size at least 32), and returns - * a pointer to that same buffer (for convenience). - */ - -char *disk_name(struct gendisk *hd, int partno, char *buf) -{ - if (!partno) - snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); - else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); - else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); - - return buf; -} - -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); -} - -EXPORT_SYMBOL(bdevname); - -const char *bio_devname(struct bio *bio, char *buf) -{ - return disk_name(bio->bi_disk, bio->bi_partno, buf); -} -EXPORT_SYMBOL(bio_devname); - static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { From 3ad5cee5cd000dc05e6c2410b06fc1d818e7b1e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:13 +0100 Subject: [PATCH 35/72] block: move sysfs methods shared by disks and partitions to genhd.c Move the sysfs _show methods that are used both on the full disk and partition nodes to genhd.c instead of hiding them in the partitioning code. Also move the declaration for these methods to block/blk.h so that we don't expose them to drivers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 11 ++++++ block/genhd.c | 80 ++++++++++++++++++++++++++++++++++++++- block/partition-generic.c | 76 +------------------------------------ include/linux/genhd.h | 14 ------- 4 files changed, 91 insertions(+), 90 deletions(-) diff --git a/block/blk.h b/block/blk.h index 670337b7cfa0..43df9dcb3d4e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -214,6 +214,17 @@ static inline void elevator_exit(struct request_queue *q, struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); +ssize_t part_size_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); + #ifdef CONFIG_FAIL_IO_TIMEOUT int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); diff --git a/block/genhd.c b/block/genhd.c index 2484348d1850..f7d60b620b97 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -1199,6 +1200,60 @@ static ssize_t disk_ro_show(struct device *dev, return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n", + (unsigned long long)part_nr_sects_read(p)); +} + +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight; + + inflight = part_in_flight(q, p); + return sprintf(buf, + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8u %8u %8u " + "%8lu %8lu %8llu %8u " + "%8lu %8u" + "\n", + part_stat_read(p, ios[STAT_READ]), + part_stat_read(p, merges[STAT_READ]), + (unsigned long long)part_stat_read(p, sectors[STAT_READ]), + (unsigned int)part_stat_read_msecs(p, STAT_READ), + part_stat_read(p, ios[STAT_WRITE]), + part_stat_read(p, merges[STAT_WRITE]), + (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), + (unsigned int)part_stat_read_msecs(p, STAT_WRITE), + inflight, + jiffies_to_msecs(part_stat_read(p, io_ticks)), + jiffies_to_msecs(part_stat_read(p, time_in_queue)), + part_stat_read(p, ios[STAT_DISCARD]), + part_stat_read(p, merges[STAT_DISCARD]), + (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), + part_stat_read(p, ios[STAT_FLUSH]), + (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); +} + +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight[2]; + + part_in_flight_rw(q, p, inflight); + return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); +} + static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1237,10 +1292,33 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); + #ifdef CONFIG_FAIL_MAKE_REQUEST +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->make_it_fail); +} + +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hd_struct *p = dev_to_part(dev); + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 0 : 1; + + return count; +} + static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); -#endif +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); diff --git a/block/partition-generic.c b/block/partition-generic.c index 6bf5aec2a0dc..e6fd2226a639 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -18,6 +18,7 @@ #include #include #include +#include "blk.h" #include "partitions/check.h" @@ -41,13 +42,6 @@ static ssize_t part_start_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); } -ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p)); -} - static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -69,74 +63,6 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sprintf(buf, "%u\n", p->discard_alignment); } -ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight; - - inflight = part_in_flight(q, p); - return sprintf(buf, - "%8lu %8lu %8llu %8u " - "%8lu %8lu %8llu %8u " - "%8u %8u %8u " - "%8lu %8lu %8llu %8u " - "%8lu %8u" - "\n", - part_stat_read(p, ios[STAT_READ]), - part_stat_read(p, merges[STAT_READ]), - (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(p, STAT_READ), - part_stat_read(p, ios[STAT_WRITE]), - part_stat_read(p, merges[STAT_WRITE]), - (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(p, STAT_WRITE), - inflight, - jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue)), - part_stat_read(p, ios[STAT_DISCARD]), - part_stat_read(p, merges[STAT_DISCARD]), - (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), - part_stat_read(p, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); -} - -ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight[2]; - - part_in_flight_rw(q, p, inflight); - return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); -} - -#ifdef CONFIG_FAIL_MAKE_REQUEST -ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->make_it_fail); -} - -ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct hd_struct *p = dev_to_part(dev); - int i; - - if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; - - return count; -} -#endif - static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3c1a816785c8..612d38fce55c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -706,20 +706,6 @@ extern void blk_register_region(dev_t devt, unsigned long range, void *data); extern void blk_unregister_region(dev_t devt, unsigned long range); -extern ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf); -extern ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf); -extern ssize_t part_inflight_show(struct device *dev, - struct device_attribute *attr, char *buf); -#ifdef CONFIG_FAIL_MAKE_REQUEST -extern ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf); -extern ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count); -#endif /* CONFIG_FAIL_MAKE_REQUEST */ - #define alloc_disk_node(minors, node_id) \ ({ \ static struct lock_class_key __key; \ From f17c21c1ecb80e957bafa07d6454836854be7cf2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:14 +0100 Subject: [PATCH 36/72] block: remove alloc_part_info and free_part_info There isn't any good reason not to simply open code the allocation and freeing of the partition_meta_info structure. Especially as one of the branches in alloc_part_info is entirely dead code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 6 ++++-- include/linux/genhd.h | 15 +-------------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index e6fd2226a639..f2004f3bd6f7 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -249,7 +249,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->policy = get_disk_ro(disk); if (info) { - struct partition_meta_info *pinfo = alloc_part_info(disk); + struct partition_meta_info *pinfo; + + pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); if (!pinfo) { err = -ENOMEM; goto out_free_stats; @@ -308,7 +310,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, return p; out_free_info: - free_part_info(p); + kfree(p->info); out_free_stats: free_part_stats(p); out_free: diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 612d38fce55c..77f8bb8edfcd 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -465,19 +465,6 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw); -static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) -{ - if (disk) - return kzalloc_node(sizeof(struct partition_meta_info), - GFP_KERNEL, disk->node_id); - return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL); -} - -static inline void free_part_info(struct hd_struct *part) -{ - kfree(part->info); -} - void update_io_ticks(struct hd_struct *part, unsigned long now); /* block/genhd.c */ @@ -755,7 +742,7 @@ static inline void hd_struct_kill(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { free_part_stats(part); - free_part_info(part); + kfree(part->info); percpu_ref_exit(&part->ref); } From e63105df8fa1c87bdec4fb82ca7b121f139da221 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:15 +0100 Subject: [PATCH 37/72] scsi: simplify scsi_bios_ptable Use read_mapping_page and kmemdup instead of the odd read_dev_sector and put_dev_sector helpers from the partitioning code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/scsicam.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index e969138051c7..91a9530a4dcb 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -35,19 +35,17 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds */ unsigned char *scsi_bios_ptable(struct block_device *dev) { - unsigned char *res = kmalloc(66, GFP_KERNEL); - if (res) { - struct block_device *bdev = dev->bd_contains; - Sector sect; - void *data = read_dev_sector(bdev, 0, §); - if (data) { - memcpy(res, data + 0x1be, 66); - put_dev_sector(sect); - } else { - kfree(res); - res = NULL; - } - } + struct address_space *mapping = dev->bd_contains->bd_inode->i_mapping; + unsigned char *res = NULL; + struct page *page; + + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + return NULL; + + if (!PageError(page)) + res = kmemdup(page_address(page) + 0x1be, 66, GFP_KERNEL); + put_page(page); return res; } EXPORT_SYMBOL(scsi_bios_ptable); From 26ae3533a0e666052afdf4cb6010a0d594c0e3f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:16 +0100 Subject: [PATCH 38/72] scsi: move scsicam_bios_param to the end of scsicam.c This avoids the need for a forward declaration and generally keeps the file in the lower level first, high level last order. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/scsicam.c | 120 ++++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 61 deletions(-) diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 91a9530a4dcb..248ab23d2e69 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -21,10 +21,6 @@ #include - -static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds, - unsigned int *secs); - /** * scsi_bios_ptable - Read PC partition table out of first sector of device. * @dev: from this device @@ -50,63 +46,6 @@ unsigned char *scsi_bios_ptable(struct block_device *dev) } EXPORT_SYMBOL(scsi_bios_ptable); -/** - * scsicam_bios_param - Determine geometry of a disk in cylinders/heads/sectors. - * @bdev: which device - * @capacity: size of the disk in sectors - * @ip: return value: ip[0]=heads, ip[1]=sectors, ip[2]=cylinders - * - * Description : determine the BIOS mapping/geometry used for a drive in a - * SCSI-CAM system, storing the results in ip as required - * by the HDIO_GETGEO ioctl(). - * - * Returns : -1 on failure, 0 on success. - */ - -int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip) -{ - unsigned char *p; - u64 capacity64 = capacity; /* Suppress gcc warning */ - int ret; - - p = scsi_bios_ptable(bdev); - if (!p) - return -1; - - /* try to infer mapping from partition table */ - ret = scsi_partsize(p, (unsigned long)capacity, (unsigned int *)ip + 2, - (unsigned int *)ip + 0, (unsigned int *)ip + 1); - kfree(p); - - if (ret == -1 && capacity64 < (1ULL << 32)) { - /* pick some standard mapping with at most 1024 cylinders, - and at most 62 sectors per track - this works up to - 7905 MB */ - ret = setsize((unsigned long)capacity, (unsigned int *)ip + 2, - (unsigned int *)ip + 0, (unsigned int *)ip + 1); - } - - /* if something went wrong, then apparently we have to return - a geometry with more than 1024 cylinders */ - if (ret || ip[0] > 255 || ip[1] > 63) { - if ((capacity >> 11) > 65534) { - ip[0] = 255; - ip[1] = 63; - } else { - ip[0] = 64; - ip[1] = 32; - } - - if (capacity > 65535*63*255) - ip[2] = 65535; - else - ip[2] = (unsigned long)capacity / (ip[0] * ip[1]); - } - - return 0; -} -EXPORT_SYMBOL(scsicam_bios_param); - /** * scsi_partsize - Parse cylinders/heads/sectors from PC partition table * @buf: partition table, see scsi_bios_ptable() @@ -256,3 +195,62 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds *hds = (unsigned int) heads; return (rv); } + +/** + * scsicam_bios_param - Determine geometry of a disk in cylinders/heads/sectors. + * @bdev: which device + * @capacity: size of the disk in sectors + * @ip: return value: ip[0]=heads, ip[1]=sectors, ip[2]=cylinders + * + * Description : determine the BIOS mapping/geometry used for a drive in a + * SCSI-CAM system, storing the results in ip as required + * by the HDIO_GETGEO ioctl(). + * + * Returns : -1 on failure, 0 on success. + */ +int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip) +{ + unsigned char *p; + u64 capacity64 = capacity; /* Suppress gcc warning */ + int ret; + + p = scsi_bios_ptable(bdev); + if (!p) + return -1; + + /* try to infer mapping from partition table */ + ret = scsi_partsize(p, (unsigned long)capacity, (unsigned int *)ip + 2, + (unsigned int *)ip + 0, (unsigned int *)ip + 1); + kfree(p); + + if (ret == -1 && capacity64 < (1ULL << 32)) { + /* + * Pick some standard mapping with at most 1024 cylinders, and + * at most 62 sectors per track - this works up to 7905 MB. + */ + ret = setsize((unsigned long)capacity, (unsigned int *)ip + 2, + (unsigned int *)ip + 0, (unsigned int *)ip + 1); + } + + /* + * If something went wrong, then apparently we have to return a geometry + * with more than 1024 cylinders. + */ + if (ret || ip[0] > 255 || ip[1] > 63) { + if ((capacity >> 11) > 65534) { + ip[0] = 255; + ip[1] = 63; + } else { + ip[0] = 64; + ip[1] = 32; + } + + if (capacity > 65535*63*255) + ip[2] = 65535; + else + ip[2] = (unsigned long)capacity / (ip[0] * ip[1]); + } + + return 0; +} +EXPORT_SYMBOL(scsicam_bios_param); From a10183d744fb4e3f8aa38086c2b5e6fdf0171a1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:17 +0100 Subject: [PATCH 39/72] scsi: simplify scsi_partsize Call scsi_bios_ptable from scsi_partsize instead of requiring boilerplate code in the callers. Also switch the calling convention to match that of the ->bios_param instances calling this function, and use true/false for the return value instead of the weird -1 convention. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/scsi/scsi_mid_low_api.txt | 21 --------- drivers/scsi/aic7xxx/aic79xx_osm.c | 13 ++---- drivers/scsi/aic7xxx/aic7xxx_osm.c | 13 ++---- drivers/scsi/arcmsr/arcmsr_hba.c | 13 ++---- drivers/scsi/megaraid.c | 13 +----- drivers/scsi/scsicam.c | 61 +++++++++++++------------ include/scsi/scsicam.h | 7 ++- 7 files changed, 46 insertions(+), 95 deletions(-) diff --git a/Documentation/scsi/scsi_mid_low_api.txt b/Documentation/scsi/scsi_mid_low_api.txt index 2a4be1c3e6db..537f04728487 100644 --- a/Documentation/scsi/scsi_mid_low_api.txt +++ b/Documentation/scsi/scsi_mid_low_api.txt @@ -299,7 +299,6 @@ Summary: scsi_host_alloc - return a new scsi_host instance whose refcount==1 scsi_host_get - increments Scsi_Host instance's refcount scsi_host_put - decrements Scsi_Host instance's refcount (free if 0) - scsi_partsize - parse partition table into cylinders, heads + sectors scsi_register - create and register a scsi host adapter instance. scsi_remove_device - detach and remove a SCSI device scsi_remove_host - detach and remove all SCSI devices owned by host @@ -472,26 +471,6 @@ void scsi_host_get(struct Scsi_Host *shost) void scsi_host_put(struct Scsi_Host *shost) -/** - * scsi_partsize - parse partition table into cylinders, heads + sectors - * @buf: pointer to partition table - * @capacity: size of (total) disk in 512 byte sectors - * @cyls: outputs number of cylinders calculated via this pointer - * @hds: outputs number of heads calculated via this pointer - * @secs: outputs number of sectors calculated via this pointer - * - * Returns 0 on success, -1 on failure - * - * Might block: no - * - * Notes: Caller owns memory returned (free with kfree() ) - * - * Defined in: drivers/scsi/scsicam.c - **/ -int scsi_partsize(unsigned char *buf, unsigned long capacity, - unsigned int *cyls, unsigned int *hds, unsigned int *secs) - - /** * scsi_register - create and register a scsi host adapter instance. * @sht: pointer to scsi host template diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 57992519384e..dc4fe334efd0 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -723,24 +723,17 @@ static int ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) { - uint8_t *bh; int heads; int sectors; int cylinders; - int ret; int extended; struct ahd_softc *ahd; ahd = *((struct ahd_softc **)sdev->host->hostdata); - bh = scsi_bios_ptable(bdev); - if (bh) { - ret = scsi_partsize(bh, capacity, - &geom[2], &geom[0], &geom[1]); - kfree(bh); - if (ret != -1) - return (ret); - } + if (scsi_partsize(bdev, capacity, geom)) + return 0; + heads = 64; sectors = 32; cylinders = aic_sector_div(capacity, heads, sectors); diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index d5c4a0d23706..2edfa0594f18 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -695,11 +695,9 @@ static int ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) { - uint8_t *bh; int heads; int sectors; int cylinders; - int ret; int extended; struct ahc_softc *ahc; u_int channel; @@ -707,14 +705,9 @@ ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahc = *((struct ahc_softc **)sdev->host->hostdata); channel = sdev_channel(sdev); - bh = scsi_bios_ptable(bdev); - if (bh) { - ret = scsi_partsize(bh, capacity, - &geom[2], &geom[0], &geom[1]); - kfree(bh); - if (ret != -1) - return (ret); - } + if (scsi_partsize(bdev, capacity, geom)) + return 0; + heads = 64; sectors = 32; cylinders = aic_sector_div(capacity, heads, sectors); diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 40dc8eac0e3a..c2c79a37a9ef 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -353,16 +353,11 @@ static irqreturn_t arcmsr_do_interrupt(int irq, void *dev_id) static int arcmsr_bios_param(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int *geom) { - int ret, heads, sectors, cylinders, total_capacity; - unsigned char *buffer;/* return copy of block device's partition table */ + int heads, sectors, cylinders, total_capacity; + + if (scsi_partsize(bdev, capacity, geom)) + return 0; - buffer = scsi_bios_ptable(bdev); - if (buffer) { - ret = scsi_partsize(buffer, capacity, &geom[2], &geom[0], &geom[1]); - kfree(buffer); - if (ret != -1) - return ret; - } total_capacity = capacity; heads = 64; sectors = 32; diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index ff6d4aa92421..f27ffd088c8a 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -2795,11 +2795,9 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) { adapter_t *adapter; - unsigned char *bh; int heads; int sectors; int cylinders; - int rval; /* Get pointer to host config structure */ adapter = (adapter_t *)sdev->host->hostdata; @@ -2826,15 +2824,8 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, geom[2] = cylinders; } else { - bh = scsi_bios_ptable(bdev); - - if( bh ) { - rval = scsi_partsize(bh, capacity, - &geom[2], &geom[0], &geom[1]); - kfree(bh); - if( rval != -1 ) - return rval; - } + if (scsi_partsize(bdev, capacity, geom)) + return 0; dev_info(&adapter->dev->dev, "invalid partition on this disk on channel %d\n", diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 248ab23d2e69..665d7db2f94c 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -48,29 +48,31 @@ EXPORT_SYMBOL(scsi_bios_ptable); /** * scsi_partsize - Parse cylinders/heads/sectors from PC partition table - * @buf: partition table, see scsi_bios_ptable() + * @bdev: block device to parse * @capacity: size of the disk in sectors - * @cyls: put cylinders here - * @hds: put heads here - * @secs: put sectors here + * @geom: output in form of [hds, cylinders, sectors] * * Determine the BIOS mapping/geometry used to create the partition - * table, storing the results in @cyls, @hds, and @secs + * table, storing the results in @geom. * - * Returns: -1 on failure, 0 on success. + * Returns: %false on failure, %true on success. */ - -int scsi_partsize(unsigned char *buf, unsigned long capacity, - unsigned int *cyls, unsigned int *hds, unsigned int *secs) +bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]) { - struct partition *p = (struct partition *)buf, *largest = NULL; - int i, largest_cyl; int cyl, ext_cyl, end_head, end_cyl, end_sector; unsigned int logical_end, physical_end, ext_physical_end; + struct partition *p, *largest = NULL; + void *buf; + int ret = false; + buf = scsi_bios_ptable(bdev); + if (!buf) + return false; if (*(unsigned short *) (buf + 64) == 0xAA55) { - for (largest_cyl = -1, i = 0; i < 4; ++i, ++p) { + int largest_cyl = -1, i; + + for (i = 0, p = buf; i < 4; i++, p++) { if (!p->sys_ind) continue; #ifdef DEBUG @@ -90,7 +92,7 @@ int scsi_partsize(unsigned char *buf, unsigned long capacity, end_sector = largest->end_sector & 0x3f; if (end_head + 1 == 0 || end_sector == 0) - return -1; + goto out_free_buf; #ifdef DEBUG printk("scsicam_bios_param : end at h = %d, c = %d, s = %d\n", @@ -115,19 +117,24 @@ int scsi_partsize(unsigned char *buf, unsigned long capacity, ,logical_end, physical_end, ext_physical_end, ext_cyl); #endif - if ((logical_end == physical_end) || - (end_cyl == 1023 && ext_physical_end == logical_end)) { - *secs = end_sector; - *hds = end_head + 1; - *cyls = capacity / ((end_head + 1) * end_sector); - return 0; + if (logical_end == physical_end || + (end_cyl == 1023 && ext_physical_end == logical_end)) { + geom[0] = end_head + 1; + geom[1] = end_sector; + geom[2] = (unsigned long)capacity / + ((end_head + 1) * end_sector); + ret = true; + goto out_free_buf; } #ifdef DEBUG printk("scsicam_bios_param : logical (%u) != physical (%u)\n", logical_end, physical_end); #endif } - return -1; + +out_free_buf: + kfree(buf); + return ret; } EXPORT_SYMBOL(scsi_partsize); @@ -210,20 +217,14 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds */ int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip) { - unsigned char *p; u64 capacity64 = capacity; /* Suppress gcc warning */ - int ret; - - p = scsi_bios_ptable(bdev); - if (!p) - return -1; + int ret = 0; /* try to infer mapping from partition table */ - ret = scsi_partsize(p, (unsigned long)capacity, (unsigned int *)ip + 2, - (unsigned int *)ip + 0, (unsigned int *)ip + 1); - kfree(p); + if (scsi_partsize(bdev, capacity, ip)) + return 0; - if (ret == -1 && capacity64 < (1ULL << 32)) { + if (capacity64 < (1ULL << 32)) { /* * Pick some standard mapping with at most 1024 cylinders, and * at most 62 sectors per track - this works up to 7905 MB. diff --git a/include/scsi/scsicam.h b/include/scsi/scsicam.h index 57c729254569..08edd603e521 100644 --- a/include/scsi/scsicam.h +++ b/include/scsi/scsicam.h @@ -13,8 +13,7 @@ #ifndef SCSICAM_H #define SCSICAM_H -extern int scsicam_bios_param (struct block_device *bdev, sector_t capacity, int *ip); -extern int scsi_partsize(unsigned char *buf, unsigned long capacity, - unsigned int *cyls, unsigned int *hds, unsigned int *secs); -extern unsigned char *scsi_bios_ptable(struct block_device *bdev); +int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip); +bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]); +unsigned char *scsi_bios_ptable(struct block_device *bdev); #endif /* def SCSICAM_H */ From 1a9fba3a77a5b39d1c9e1611758303f2649474e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:18 +0100 Subject: [PATCH 40/72] block: unexport read_dev_sector and put_dev_sector read_dev_sector and put_dev_sector are now only used by the partition parsing code. Remove the export for read_dev_sector and merge it into the only caller. Clean the mess up a bit by using goto labels and the SECTOR_SHIFT constant. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 31 +++++++++++++++++++------------ block/partitions/check.h | 14 +++++++------- include/linux/blkdev.h | 9 --------- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index f2004f3bd6f7..fef6bacb2bbb 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -484,22 +484,29 @@ out_free_state: return ret; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = state->bdev->bd_inode->i_mapping; struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), NULL); - if (!IS_ERR(page)) { - if (PageError(page)) - goto fail; - p->v = page; - return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << 9); -fail: - put_page(page); + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; } + + page = read_mapping_page(mapping, + (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL); + if (IS_ERR(page)) + goto out; + if (PageError(page)) + goto out_put_page; + + p->v = page; + return (unsigned char *)page_address(page) + + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT); +out_put_page: + put_page(page); +out: p->v = NULL; return NULL; } - -EXPORT_SYMBOL(read_dev_sector); diff --git a/block/partitions/check.h b/block/partitions/check.h index 6042f769471a..0fcf80117887 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -28,14 +28,14 @@ void free_partitions(struct parsed_partitions *state); struct parsed_partitions * check_partition(struct gendisk *, struct block_device *); -static inline void *read_part_sector(struct parsed_partitions *state, - sector_t n, Sector *p) +typedef struct { + struct page *v; +} Sector; + +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p); +static inline void put_dev_sector(Sector p) { - if (n >= get_capacity(state->bdev->bd_disk)) { - state->access_beyond_eod = true; - return NULL; - } - return read_dev_sector(state->bdev, n, p); + put_page(p.v); } static inline void diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f629d40c645c..53a1325efbc3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1484,15 +1484,6 @@ static inline unsigned int block_size(struct block_device *bdev) return bdev->bd_block_size; } -typedef struct {struct page *v;} Sector; - -unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); - -static inline void put_dev_sector(Sector p) -{ - put_page(p.v); -} - int kblockd_schedule_work(struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); From 74cc979c3c7f8328b24651daf15280f07533e735 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:19 +0100 Subject: [PATCH 41/72] block: cleanup how md_autodetect_dev is called Add a new include/linux/raid/detect.h header to declare the md_autodetect_dev prototype which can be shared between md and the partition code. Then use IS_BUILTIN to call it instead of the ifdef magic. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 11 ++++------- drivers/md/md.c | 1 + include/linux/raid/detect.h | 3 +++ 3 files changed, 8 insertions(+), 7 deletions(-) create mode 100644 include/linux/raid/detect.h diff --git a/block/partition-generic.c b/block/partition-generic.c index fef6bacb2bbb..4d771ae835ed 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -18,14 +18,11 @@ #include #include #include +#include #include "blk.h" #include "partitions/check.h" -#ifdef CONFIG_BLK_DEV_MD -extern void md_autodetect_dev(dev_t dev); -#endif - static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -407,10 +404,10 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, return true; } -#ifdef CONFIG_BLK_DEV_MD - if (state->parts[p].flags & ADDPART_FLAG_RAID) + if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && + (state->parts[p].flags & ADDPART_FLAG_RAID)) md_autodetect_dev(part_to_dev(part)->devt); -#endif + return true; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 8238fb64f87f..1c7193715f07 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/raid/detect.h b/include/linux/raid/detect.h new file mode 100644 index 000000000000..37dd3f40cd31 --- /dev/null +++ b/include/linux/raid/detect.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +void md_autodetect_dev(dev_t dev); From ffa9ed647aa4cda79fa8cb9583a566e9d777dd4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:20 +0100 Subject: [PATCH 42/72] block: remove warn_no_part The warn_no_part is initialized to 1 and never changed. Remove it and execute the code keyed off from it unconditionally. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/amiga.c | 10 ++++------ block/partitions/check.c | 7 ++----- block/partitions/check.h | 3 --- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 560936617d9c..7fecc760b78f 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -42,9 +42,8 @@ int amiga_partition(struct parsed_partitions *state) goto rdb_done; data = read_part_sector(state, blk, §); if (!data) { - if (warn_no_part) - pr_err("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); + pr_err("Dev %s: unable to read RDB block %d\n", + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } @@ -85,9 +84,8 @@ int amiga_partition(struct parsed_partitions *state) blk *= blksize; /* Read in terms partition table understands */ data = read_part_sector(state, blk, §); if (!data) { - if (warn_no_part) - pr_err("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); + pr_err("Dev %s: unable to read partition block %d\n", + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } diff --git a/block/partitions/check.c b/block/partitions/check.c index ffe408fead0c..8fe46881ef63 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -37,8 +37,6 @@ #include "sysv68.h" #include "cmdline.h" -int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ - static int (*check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 @@ -186,9 +184,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev) /* The partition is unrecognized. So report I/O errors if there were any */ res = err; if (res) { - if (warn_no_part) - strlcat(state->pp_buf, - " unable to read partition table\n", PAGE_SIZE); + strlcat(state->pp_buf, + " unable to read partition table\n", PAGE_SIZE); printk(KERN_INFO "%s", state->pp_buf); } diff --git a/block/partitions/check.h b/block/partitions/check.h index 0fcf80117887..19852b494e93 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -50,6 +50,3 @@ put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) strlcat(p->pp_buf, tmp, PAGE_SIZE); } } - -extern int warn_no_part; - From 3f1b95ef81b7dd5e5481347d7b7a7b427b29307a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:21 +0100 Subject: [PATCH 43/72] block: declare all partition detection routines in check.h There is no good reason to include one header per partition type in core.c. Instead move the prototypes for the detection routins to check.h, and remove all now empty headers in block/partitions/. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/acorn.c | 1 - block/partitions/acorn.h | 15 --------------- block/partitions/aix.c | 1 - block/partitions/aix.h | 2 -- block/partitions/amiga.c | 1 - block/partitions/amiga.h | 7 ------- block/partitions/atari.h | 1 - block/partitions/check.c | 16 ---------------- block/partitions/check.h | 22 ++++++++++++++++++++++ block/partitions/cmdline.c | 1 - block/partitions/cmdline.h | 3 --- block/partitions/efi.h | 3 --- block/partitions/ibm.c | 1 - block/partitions/ibm.h | 2 -- block/partitions/karma.h | 3 --- block/partitions/ldm.h | 2 -- block/partitions/mac.h | 1 - block/partitions/msdos.c | 1 - block/partitions/msdos.h | 1 - block/partitions/osf.h | 1 - block/partitions/sgi.h | 1 - block/partitions/sun.h | 1 - block/partitions/sysv68.c | 1 - block/partitions/sysv68.h | 2 -- block/partitions/ultrix.c | 1 - block/partitions/ultrix.h | 6 ------ 26 files changed, 22 insertions(+), 75 deletions(-) delete mode 100644 block/partitions/acorn.h delete mode 100644 block/partitions/aix.h delete mode 100644 block/partitions/amiga.h delete mode 100644 block/partitions/cmdline.h delete mode 100644 block/partitions/ibm.h delete mode 100644 block/partitions/sysv68.h delete mode 100644 block/partitions/ultrix.h diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index 7587700fad4a..c64c57b958bf 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -11,7 +11,6 @@ #include #include "check.h" -#include "acorn.h" /* * Partition types. (Oh for reusability) diff --git a/block/partitions/acorn.h b/block/partitions/acorn.h deleted file mode 100644 index 67b06601ca4c..000000000000 --- a/block/partitions/acorn.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/fs/partitions/acorn.h - * - * Copyright (C) 1996-2001 Russell King. - * - * I _hate_ this partitioning mess - why can't we have one defined - * format, and everyone stick to it? - */ - -int adfspart_check_CUMANA(struct parsed_partitions *state); -int adfspart_check_ADFS(struct parsed_partitions *state); -int adfspart_check_ICS(struct parsed_partitions *state); -int adfspart_check_POWERTEC(struct parsed_partitions *state); -int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/block/partitions/aix.c b/block/partitions/aix.c index 903f3ed175d0..c7b4fd1a4a97 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -6,7 +6,6 @@ */ #include "check.h" -#include "aix.h" struct lvm_rec { char lvm_id[4]; /* "_LVM" */ diff --git a/block/partitions/aix.h b/block/partitions/aix.h deleted file mode 100644 index b4449f0b9f2b..000000000000 --- a/block/partitions/aix.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -extern int aix_partition(struct parsed_partitions *state); diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 7fecc760b78f..9526491d9aed 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -14,7 +14,6 @@ #include #include "check.h" -#include "amiga.h" static __inline__ u32 checksum_block(__be32 *m, int size) diff --git a/block/partitions/amiga.h b/block/partitions/amiga.h deleted file mode 100644 index 7e63f4d9d969..000000000000 --- a/block/partitions/amiga.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/amiga.h - */ - -int amiga_partition(struct parsed_partitions *state); - diff --git a/block/partitions/atari.h b/block/partitions/atari.h index 01c2b9457394..678202442fd3 100644 --- a/block/partitions/atari.h +++ b/block/partitions/atari.h @@ -34,4 +34,3 @@ struct rootsector u16 checksum; /* checksum for bootable disks */ } __packed; -int atari_partition(struct parsed_partitions *state); diff --git a/block/partitions/check.c b/block/partitions/check.c index 8fe46881ef63..944c478b6f0b 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -21,22 +21,6 @@ #include "check.h" -#include "acorn.h" -#include "amiga.h" -#include "atari.h" -#include "ldm.h" -#include "mac.h" -#include "msdos.h" -#include "osf.h" -#include "sgi.h" -#include "sun.h" -#include "ibm.h" -#include "ultrix.h" -#include "efi.h" -#include "karma.h" -#include "sysv68.h" -#include "cmdline.h" - static int (*check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 diff --git a/block/partitions/check.h b/block/partitions/check.h index 19852b494e93..23e7adb79617 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -50,3 +50,25 @@ put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) strlcat(p->pp_buf, tmp, PAGE_SIZE); } } + +/* detection routines go here in alphabetical order: */ +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int aix_partition(struct parsed_partitions *state); +int amiga_partition(struct parsed_partitions *state); +int atari_partition(struct parsed_partitions *state); +int cmdline_partition(struct parsed_partitions *state); +int efi_partition(struct parsed_partitions *state); +int ibm_partition(struct parsed_partitions *); +int karma_partition(struct parsed_partitions *state); +int ldm_partition(struct parsed_partitions *state); +int mac_partition(struct parsed_partitions *state); +int msdos_partition(struct parsed_partitions *state); +int osf_partition(struct parsed_partitions *state); +int sgi_partition(struct parsed_partitions *state); +int sun_partition(struct parsed_partitions *state); +int sysv68_partition(struct parsed_partitions *state); +int ultrix_partition(struct parsed_partitions *state); diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index f1edd5452249..8f545c36cde4 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -18,7 +18,6 @@ #include #include "check.h" -#include "cmdline.h" static char *cmdline; static struct cmdline_parts *bdev_parts; diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h deleted file mode 100644 index e64a31636a1f..000000000000 --- a/block/partitions/cmdline.h +++ /dev/null @@ -1,3 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -int cmdline_partition(struct parsed_partitions *state); diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 3e8576157575..907bac5ce8f7 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -113,7 +113,4 @@ typedef struct _legacy_mbr { __le16 signature; } __packed legacy_mbr; -/* Functions */ -extern int efi_partition(struct parsed_partitions *state); - #endif diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index a5d480f807f3..073faa6a69b8 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -15,7 +15,6 @@ #include #include "check.h" -#include "ibm.h" union label_t { diff --git a/block/partitions/ibm.h b/block/partitions/ibm.h deleted file mode 100644 index 8bf13febb2b6..000000000000 --- a/block/partitions/ibm.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -int ibm_partition(struct parsed_partitions *); diff --git a/block/partitions/karma.h b/block/partitions/karma.h index 48e074d417fb..1b5eec57ee0c 100644 --- a/block/partitions/karma.h +++ b/block/partitions/karma.h @@ -4,6 +4,3 @@ */ #define KARMA_LABEL_MAGIC 0xAB56 - -int karma_partition(struct parsed_partitions *state); - diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index 1ca63e97bccc..841580af7f9b 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -193,7 +193,5 @@ struct ldmdb { /* Cache of the database */ struct list_head v_part; }; -int ldm_partition(struct parsed_partitions *state); - #endif /* _FS_PT_LDM_H_ */ diff --git a/block/partitions/mac.h b/block/partitions/mac.h index 453ed2964804..0e41c9da7532 100644 --- a/block/partitions/mac.h +++ b/block/partitions/mac.h @@ -42,4 +42,3 @@ struct mac_driver_desc { /* ... more stuff */ }; -int mac_partition(struct parsed_partitions *state); diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 82c44f7df911..c572022f3781 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -24,7 +24,6 @@ #include "check.h" #include "msdos.h" #include "efi.h" -#include "aix.h" /* * Many architectures don't like unaligned accesses, while diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h index fcacfc486092..123e666bb932 100644 --- a/block/partitions/msdos.h +++ b/block/partitions/msdos.h @@ -5,5 +5,4 @@ #define MSDOS_LABEL_MAGIC 0xAA55 -int msdos_partition(struct parsed_partitions *state); diff --git a/block/partitions/osf.h b/block/partitions/osf.h index 4d8088e7ea8c..80a58c382b3f 100644 --- a/block/partitions/osf.h +++ b/block/partitions/osf.h @@ -5,4 +5,3 @@ #define DISKLABELMAGIC (0x82564557UL) -int osf_partition(struct parsed_partitions *state); diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h index a5b77c3987cf..372cdad19fea 100644 --- a/block/partitions/sgi.h +++ b/block/partitions/sgi.h @@ -3,7 +3,6 @@ * fs/partitions/sgi.h */ -extern int sgi_partition(struct parsed_partitions *state); #define SGI_LABEL_MAGIC 0x0be5a941 diff --git a/block/partitions/sun.h b/block/partitions/sun.h index ae1b9eed3fd7..4c8877a5b52d 100644 --- a/block/partitions/sun.h +++ b/block/partitions/sun.h @@ -6,4 +6,3 @@ #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE -int sun_partition(struct parsed_partitions *state); diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c index 92e810826b01..6f6257fd4eb4 100644 --- a/block/partitions/sysv68.c +++ b/block/partitions/sysv68.c @@ -6,7 +6,6 @@ */ #include "check.h" -#include "sysv68.h" /* * Volume ID structure: on first 256-bytes sector of disk diff --git a/block/partitions/sysv68.h b/block/partitions/sysv68.h deleted file mode 100644 index 4fb6b8ec78ae..000000000000 --- a/block/partitions/sysv68.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -extern int sysv68_partition(struct parsed_partitions *state); diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c index ecd0d7346c3d..4aaa81043ca0 100644 --- a/block/partitions/ultrix.c +++ b/block/partitions/ultrix.c @@ -8,7 +8,6 @@ */ #include "check.h" -#include "ultrix.h" int ultrix_partition(struct parsed_partitions *state) { diff --git a/block/partitions/ultrix.h b/block/partitions/ultrix.h deleted file mode 100644 index 9f676cead222..000000000000 --- a/block/partitions/ultrix.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/ultrix.h - */ - -int ultrix_partition(struct parsed_partitions *state); From f6d17358dc7eb2259115c0017a1ff9958a59eb2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:22 +0100 Subject: [PATCH 44/72] block: remove block/partitions/karma.h Just move the single define to block/partitions/karma.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/karma.c | 3 ++- block/partitions/karma.h | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) delete mode 100644 block/partitions/karma.h diff --git a/block/partitions/karma.c b/block/partitions/karma.c index 59812d705c3d..4d93512f4bd4 100644 --- a/block/partitions/karma.c +++ b/block/partitions/karma.c @@ -8,9 +8,10 @@ */ #include "check.h" -#include "karma.h" #include +#define KARMA_LABEL_MAGIC 0xAB56 + int karma_partition(struct parsed_partitions *state) { int i; diff --git a/block/partitions/karma.h b/block/partitions/karma.h deleted file mode 100644 index 1b5eec57ee0c..000000000000 --- a/block/partitions/karma.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/karma.h - */ - -#define KARMA_LABEL_MAGIC 0xAB56 From 3466f63a7cfe55217534e71af866037be2e1909e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:23 +0100 Subject: [PATCH 45/72] block: remove block/partitions/osf.h Just move the single define to block/partitions/osf.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/osf.c | 2 +- block/partitions/osf.h | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 block/partitions/osf.h diff --git a/block/partitions/osf.c b/block/partitions/osf.c index 4b873973d6c0..84560d0765ed 100644 --- a/block/partitions/osf.c +++ b/block/partitions/osf.c @@ -9,9 +9,9 @@ */ #include "check.h" -#include "osf.h" #define MAX_OSF_PARTITIONS 18 +#define DISKLABELMAGIC (0x82564557UL) int osf_partition(struct parsed_partitions *state) { diff --git a/block/partitions/osf.h b/block/partitions/osf.h deleted file mode 100644 index 80a58c382b3f..000000000000 --- a/block/partitions/osf.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/osf.h - */ - -#define DISKLABELMAGIC (0x82564557UL) - From 95f77ef35a990e2947cee100f72c0e3ddd1ccd75 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:24 +0100 Subject: [PATCH 46/72] block: remove block/partitions/sgi.h Just move the single define to block/partitions/sgi.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/sgi.c | 3 ++- block/partitions/sgi.h | 8 -------- 2 files changed, 2 insertions(+), 9 deletions(-) delete mode 100644 block/partitions/sgi.h diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index d7b421c6e530..927cf501603e 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -6,7 +6,8 @@ */ #include "check.h" -#include "sgi.h" + +#define SGI_LABEL_MAGIC 0x0be5a941 struct sgi_disklabel { __be32 magic_mushroom; /* Big fat spliff... */ diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h deleted file mode 100644 index 372cdad19fea..000000000000 --- a/block/partitions/sgi.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/sgi.h - */ - - -#define SGI_LABEL_MAGIC 0x0be5a941 - From cbb5cb3b29f9eb158bd2db39cdc07db6d8087461 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:25 +0100 Subject: [PATCH 47/72] block: remove block/partitions/sun.h Just move the two defines to block/partitions/sun.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/sun.c | 4 +++- block/partitions/sun.h | 8 -------- 2 files changed, 3 insertions(+), 9 deletions(-) delete mode 100644 block/partitions/sun.h diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 90f36724e796..28b44100f2b1 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -9,7 +9,9 @@ */ #include "check.h" -#include "sun.h" + +#define SUN_LABEL_MAGIC 0xDABE +#define SUN_VTOC_SANITY 0x600DDEEE int sun_partition(struct parsed_partitions *state) { diff --git a/block/partitions/sun.h b/block/partitions/sun.h deleted file mode 100644 index 4c8877a5b52d..000000000000 --- a/block/partitions/sun.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/sun.h - */ - -#define SUN_LABEL_MAGIC 0xDABE -#define SUN_VTOC_SANITY 0x600DDEEE - From 1442f76d4317b420580e11238d20789708c742a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:26 +0100 Subject: [PATCH 48/72] block: move struct partition out of genhd.h struct partition is the on-disk format of a MSDOS partition table entry. Move it out of genhd.h into a new msdos_partition.h header and give it a msdos_ prefix to avoid confusion. Also move the magic number from block/partitions/msdos.h to the new header so that it can be used by the SCSI drivers looking at the DOS partition tables. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/ldm.c | 6 +++--- block/partitions/msdos.c | 28 ++++++++++++++-------------- block/partitions/msdos.h | 8 -------- drivers/scsi/BusLogic.c | 8 +++++--- drivers/scsi/aacraid/linit.c | 7 ++++--- drivers/scsi/scsi_debug.c | 5 +++-- drivers/scsi/scsicam.c | 3 ++- include/linux/genhd.h | 13 ------------- include/linux/msdos_partition.h | 20 ++++++++++++++++++++ 9 files changed, 51 insertions(+), 47 deletions(-) delete mode 100644 block/partitions/msdos.h create mode 100644 include/linux/msdos_partition.h diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index a2d97ee1908c..6fdfcb40c537 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -14,10 +14,10 @@ #include #include #include +#include #include "ldm.h" #include "check.h" -#include "msdos.h" /* * ldm_debug/info/error/crit - Output an error message @@ -493,7 +493,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; - struct partition *p; + struct msdos_partition *p; int i; bool result = false; @@ -508,7 +508,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) goto out; - p = (struct partition*)(data + 0x01BE); + p = (struct msdos_partition *)(data + 0x01BE); for (i = 0; i < 4; i++, p++) if (SYS_IND (p) == LDM_PARTITION) { result = true; diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index c572022f3781..88ee5ee7f442 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -20,9 +20,9 @@ * Re-organised Feb 1998 Russell King */ #include +#include #include "check.h" -#include "msdos.h" #include "efi.h" /* @@ -34,17 +34,17 @@ #define SYS_IND(p) get_unaligned(&p->sys_ind) -static inline sector_t nr_sects(struct partition *p) +static inline sector_t nr_sects(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->nr_sects); } -static inline sector_t start_sect(struct partition *p) +static inline sector_t start_sect(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->start_sect); } -static inline int is_extended_partition(struct partition *p) +static inline int is_extended_partition(struct msdos_partition *p) { return (SYS_IND(p) == DOS_EXTENDED_PARTITION || SYS_IND(p) == WIN98_EXTENDED_PARTITION || @@ -67,7 +67,7 @@ msdos_magic_present(unsigned char *p) #define AIX_LABEL_MAGIC4 0xC1 static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { - struct partition *pt = (struct partition *) (p + 0x1be); + struct msdos_partition *pt = (struct msdos_partition *) (p + 0x1be); Sector sect; unsigned char *d; int slot, ret = 0; @@ -121,7 +121,7 @@ static void parse_extended(struct parsed_partitions *state, sector_t first_sector, sector_t first_size, u32 disksig) { - struct partition *p; + struct msdos_partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; @@ -145,7 +145,7 @@ static void parse_extended(struct parsed_partitions *state, if (!msdos_magic_present(data + 510)) goto done; - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); /* * Usually, the first entry is the real data partition, @@ -402,14 +402,14 @@ static void parse_minix(struct parsed_partitions *state, #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; unsigned char *data; - struct partition *p; + struct msdos_partition *p; int i; data = read_part_sector(state, offset, §); if (!data) return; - p = (struct partition *)(data + 0x1be); + p = (struct msdos_partition *)(data + 0x1be); /* The first sector of a Minix partition can have either * a secondary MBR describing its subpartitions, or @@ -453,7 +453,7 @@ int msdos_partition(struct parsed_partitions *state) sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; Sector sect; unsigned char *data; - struct partition *p; + struct msdos_partition *p; struct fat_boot_sector *fb; int slot; u32 disksig; @@ -487,7 +487,7 @@ int msdos_partition(struct parsed_partitions *state) * partition table. Reject this in case the boot indicator * is not 0 or 0x80. */ - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* @@ -509,7 +509,7 @@ int msdos_partition(struct parsed_partitions *state) } #ifdef CONFIG_EFI_PARTITION - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); for (slot = 1 ; slot <= 4 ; slot++, p++) { /* If this is an EFI GPT disk, msdos should ignore it. */ if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { @@ -518,7 +518,7 @@ int msdos_partition(struct parsed_partitions *state) } } #endif - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); @@ -565,7 +565,7 @@ int msdos_partition(struct parsed_partitions *state) strlcat(state->pp_buf, "\n", PAGE_SIZE); /* second pass - output for each on a separate line */ - p = (struct partition *) (0x1be + data); + p = (struct msdos_partition *) (0x1be + data); for (slot = 1 ; slot <= 4 ; slot++, p++) { unsigned char id = SYS_IND(p); int n; diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h deleted file mode 100644 index 123e666bb932..000000000000 --- a/block/partitions/msdos.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/msdos.h - */ - -#define MSDOS_LABEL_MAGIC 0xAA55 - - diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 3170b295a5da..186259417449 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -3410,9 +3411,10 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, a partition table entry whose end_head matches one of the standard BusLogic geometry translations (64/32, 128/32, or 255/63). */ - if (*(unsigned short *) (buf + 64) == 0xAA55) { - struct partition *part1_entry = (struct partition *) buf; - struct partition *part_entry = part1_entry; + if (*(unsigned short *) (buf + 64) == MSDOS_LABEL_MAGIC) { + struct msdos_partition *part1_entry = + (struct msdos_partition *)buf; + struct msdos_partition *part_entry = part1_entry; int saved_cyl = diskparam->cylinders, part_no; unsigned char part_end_head = 0, part_end_sector = 0; diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index ee6bc2f9b80a..0443b74390cf 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -328,9 +329,9 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, buf = scsi_bios_ptable(bdev); if (!buf) return 0; - if(*(__le16 *)(buf + 0x40) == cpu_to_le16(0xaa55)) { - struct partition *first = (struct partition * )buf; - struct partition *entry = first; + if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) { + struct msdos_partition *first = (struct msdos_partition *)buf; + struct msdos_partition *entry = first; int saved_cylinders = param->cylinders; int num; unsigned char end_head, end_sec; diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 44cb054d5e66..4c6c448dc2df 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -4146,7 +4147,7 @@ static int scsi_debug_host_reset(struct scsi_cmnd *SCpnt) static void __init sdebug_build_parts(unsigned char *ramp, unsigned long store_size) { - struct partition *pp; + struct msdos_partition *pp; int starts[SDEBUG_MAX_PARTS + 2]; int sectors_per_part, num_sectors, k; int heads_by_sects, start_sec, end_sec; @@ -4171,7 +4172,7 @@ static void __init sdebug_build_parts(unsigned char *ramp, ramp[510] = 0x55; /* magic partition markings */ ramp[511] = 0xAA; - pp = (struct partition *)(ramp + 0x1be); + pp = (struct msdos_partition *)(ramp + 0x1be); for (k = 0; starts[k + 1]; ++k, ++pp) { start_sec = starts[k]; end_sec = starts[k + 1] - 1; diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 665d7db2f94c..682cf08ab041 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -61,7 +62,7 @@ bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]) { int cyl, ext_cyl, end_head, end_cyl, end_sector; unsigned int logical_end, physical_end, ext_physical_end; - struct partition *p, *largest = NULL; + struct msdos_partition *p, *largest = NULL; void *buf; int ret = false; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 77f8bb8edfcd..3d84da9ec0fa 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -70,19 +70,6 @@ enum { #include #include -struct partition { - unsigned char boot_ind; /* 0x80 - active */ - unsigned char head; /* starting head */ - unsigned char sector; /* starting sector */ - unsigned char cyl; /* starting cylinder */ - unsigned char sys_ind; /* What partition type */ - unsigned char end_head; /* end head */ - unsigned char end_sector; /* end sector */ - unsigned char end_cyl; /* end cylinder */ - __le32 start_sect; /* starting sector counting from 0 */ - __le32 nr_sects; /* nr of sectors in partition */ -} __attribute__((packed)); - struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; diff --git a/include/linux/msdos_partition.h b/include/linux/msdos_partition.h new file mode 100644 index 000000000000..a8e2c1b4bc66 --- /dev/null +++ b/include/linux/msdos_partition.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MSDOS_PARTITION_H +#define _LINUX_MSDOS_PARTITION_H + +#define MSDOS_LABEL_MAGIC 0xAA55 + +struct msdos_partition { + u8 boot_ind; /* 0x80 - active */ + u8 head; /* starting head */ + u8 sector; /* starting sector */ + u8 cyl; /* starting cylinder */ + u8 sys_ind; /* What partition type */ + u8 end_head; /* end head */ + u8 end_sector; /* end sector */ + u8 end_cyl; /* end cylinder */ + __le32 start_sect; /* starting sector counting from 0 */ + __le32 nr_sects; /* nr of sectors in partition */ +} __packed; + +#endif /* LINUX_MSDOS_PARTITION_H */ From 0226e9ead44b2eb8f2471d24e0b0c5ff60bc322c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:27 +0100 Subject: [PATCH 49/72] block: move the *_PARTITION enum out of genhd.h The enum containing the *_PARTITION symbolic names is only relevant for the partition parser. More specifically most values are MSDOS partition table system indicators and thus should go straight into msdos.c. One value is only used by the sun partition parser, and the sun and sgi partition parsers use the same value as the x86 Linux RAID indicator to also indicate RAID autodetection. Duplicate them in sun.c and sgi.c given that the different partition types use entirely different values otherwise. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/sgi.c | 4 ++++ block/partitions/sun.c | 5 +++++ include/linux/genhd.h | 30 ------------------------------ include/linux/msdos_partition.h | 31 +++++++++++++++++++++++++++++++ 4 files changed, 40 insertions(+), 30 deletions(-) diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 927cf501603e..4273f1bb0515 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -9,6 +9,10 @@ #define SGI_LABEL_MAGIC 0x0be5a941 +enum { + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; + struct sgi_disklabel { __be32 magic_mushroom; /* Big fat spliff... */ __be16 root_part_num; /* Root partition number */ diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 28b44100f2b1..47dc53eccf77 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -13,6 +13,11 @@ #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE +enum { + SUN_WHOLE_DISK = 5, + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; + int sun_partition(struct parsed_partitions *state) { int i; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3d84da9ec0fa..3050b0ee9cc7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -30,36 +30,6 @@ extern struct device_type part_type; extern struct kobject *block_depr; extern struct class block_class; -enum { -/* These three have identical behaviour; use the second one if DOS FDISK gets - confused about extended/logical partitions starting past cylinder 1023. */ - DOS_EXTENDED_PARTITION = 5, - LINUX_EXTENDED_PARTITION = 0x85, - WIN98_EXTENDED_PARTITION = 0x0f, - - SUN_WHOLE_DISK = DOS_EXTENDED_PARTITION, - - LINUX_SWAP_PARTITION = 0x82, - LINUX_DATA_PARTITION = 0x83, - LINUX_LVM_PARTITION = 0x8e, - LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ - - SOLARIS_X86_PARTITION = LINUX_SWAP_PARTITION, - NEW_SOLARIS_X86_PARTITION = 0xbf, - - DM6_AUX1PARTITION = 0x51, /* no DDO: use xlated geom */ - DM6_AUX3PARTITION = 0x53, /* no DDO: use xlated geom */ - DM6_PARTITION = 0x54, /* has DDO: use xlated geom & offset */ - EZD_PARTITION = 0x55, /* EZ-DRIVE */ - - FREEBSD_PARTITION = 0xa5, /* FreeBSD Partition ID */ - OPENBSD_PARTITION = 0xa6, /* OpenBSD Partition ID */ - NETBSD_PARTITION = 0xa9, /* NetBSD Partition ID */ - BSDI_PARTITION = 0xb7, /* BSDI Partition ID */ - MINIX_PARTITION = 0x81, /* Minix Partition ID */ - UNIXWARE_PARTITION = 0x63, /* Same as GNU_HURD and SCO Unix */ -}; - #define DISK_MAX_PARTS 256 #define DISK_NAME_LEN 32 diff --git a/include/linux/msdos_partition.h b/include/linux/msdos_partition.h index a8e2c1b4bc66..e151af072cd1 100644 --- a/include/linux/msdos_partition.h +++ b/include/linux/msdos_partition.h @@ -17,4 +17,35 @@ struct msdos_partition { __le32 nr_sects; /* nr of sectors in partition */ } __packed; +enum msdos_sys_ind { + /* + * These three have identical behaviour; use the second one if DOS FDISK + * gets confused about extended/logical partitions starting past + * cylinder 1023. + */ + DOS_EXTENDED_PARTITION = 5, + LINUX_EXTENDED_PARTITION = 0x85, + WIN98_EXTENDED_PARTITION = 0x0f, + + LINUX_SWAP_PARTITION = 0x82, + LINUX_DATA_PARTITION = 0x83, + LINUX_LVM_PARTITION = 0x8e, + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ + + SOLARIS_X86_PARTITION = LINUX_SWAP_PARTITION, + NEW_SOLARIS_X86_PARTITION = 0xbf, + + DM6_AUX1PARTITION = 0x51, /* no DDO: use xlated geom */ + DM6_AUX3PARTITION = 0x53, /* no DDO: use xlated geom */ + DM6_PARTITION = 0x54, /* has DDO: use xlated geom & offset */ + EZD_PARTITION = 0x55, /* EZ-DRIVE */ + + FREEBSD_PARTITION = 0xa5, /* FreeBSD Partition ID */ + OPENBSD_PARTITION = 0xa6, /* OpenBSD Partition ID */ + NETBSD_PARTITION = 0xa9, /* NetBSD Partition ID */ + BSDI_PARTITION = 0xb7, /* BSDI Partition ID */ + MINIX_PARTITION = 0x81, /* Minix Partition ID */ + UNIXWARE_PARTITION = 0x63, /* Same as GNU_HURD and SCO Unix */ +}; + #endif /* LINUX_MSDOS_PARTITION_H */ From cb0ab52652123c47cb72e665ce9fdd3029dcb175 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:28 +0100 Subject: [PATCH 50/72] partitions/msdos: remove LINUX_SWAP_PARTITION Just always use NEW_SOLARIS_X86_PARTITION and explain the situation, as that is less confusing than two names for a single value. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 18 ++++++++++++------ include/linux/msdos_partition.h | 3 +-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 88ee5ee7f442..e44e2f0a02cc 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -77,13 +77,19 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) p[2] == AIX_LABEL_MAGIC3 && p[3] == AIX_LABEL_MAGIC4)) return 0; - /* Assume the partition table is valid if Linux partitions exists */ + + /* + * Assume the partition table is valid if Linux partitions exists. + * Note that old Solaris/x86 partitions use the same indicator as + * Linux swap partitions, so we consider that a Linux partition as + * well. + */ for (slot = 1; slot <= 4; slot++, pt++) { - if (pt->sys_ind == LINUX_SWAP_PARTITION || - pt->sys_ind == LINUX_RAID_PARTITION || - pt->sys_ind == LINUX_DATA_PARTITION || - pt->sys_ind == LINUX_LVM_PARTITION || - is_extended_partition(pt)) + if (pt->sys_ind == SOLARIS_X86_PARTITION || + pt->sys_ind == LINUX_RAID_PARTITION || + pt->sys_ind == LINUX_DATA_PARTITION || + pt->sys_ind == LINUX_LVM_PARTITION || + is_extended_partition(pt)) return 0; } d = read_part_sector(state, 7, §); diff --git a/include/linux/msdos_partition.h b/include/linux/msdos_partition.h index e151af072cd1..2cb82db2a43c 100644 --- a/include/linux/msdos_partition.h +++ b/include/linux/msdos_partition.h @@ -27,12 +27,11 @@ enum msdos_sys_ind { LINUX_EXTENDED_PARTITION = 0x85, WIN98_EXTENDED_PARTITION = 0x0f, - LINUX_SWAP_PARTITION = 0x82, LINUX_DATA_PARTITION = 0x83, LINUX_LVM_PARTITION = 0x8e, LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ - SOLARIS_X86_PARTITION = LINUX_SWAP_PARTITION, + SOLARIS_X86_PARTITION = 0x82, /* also Linux swap partitions */ NEW_SOLARIS_X86_PARTITION = 0xbf, DM6_AUX1PARTITION = 0x51, /* no DDO: use xlated geom */ From 3f4fc59c1321b74df27dcd5d77b37989ed93265b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:29 +0100 Subject: [PATCH 51/72] block: move the various x86 Unix label formats out of genhd.h All these are just used in block/partitions/msdos.c, so move them out of the genhd.h driver included by every driver. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 125 ++++++++++++++++++++++++++++++++++ include/linux/genhd.h | 143 --------------------------------------- 2 files changed, 125 insertions(+), 143 deletions(-) diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index e44e2f0a02cc..8f2fcc080264 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -18,6 +18,12 @@ * Check partition table on IDE disks for common CHS translations * * Re-organised Feb 1998 Russell King + * + * BSD disklabel support by Yossi Gottlieb + * updated by Marc Espie + * + * Unixware slices support by Andrzej Krzysztofowicz + * and Krzysztof G. Baranowski */ #include #include @@ -215,6 +221,30 @@ done: put_dev_sector(sect); } +#define SOLARIS_X86_NUMSLICE 16 +#define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) + +struct solaris_x86_slice { + __le16 s_tag; /* ID tag of partition */ + __le16 s_flag; /* permission flags */ + __le32 s_start; /* start sector no of partition */ + __le32 s_size; /* # of blocks in partition */ +}; + +struct solaris_x86_vtoc { + unsigned int v_bootinfo[3]; /* info needed by mboot */ + __le32 v_sanity; /* to verify vtoc sanity */ + __le32 v_version; /* layout version */ + char v_volume[8]; /* volume name */ + __le16 v_sectorsz; /* sector size in bytes */ + __le16 v_nparts; /* number of partitions */ + unsigned int v_reserved[10]; /* free space */ + struct solaris_x86_slice + v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ + unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp */ + char v_asciilabel[128]; /* for compatibility */ +}; + /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. */ @@ -270,6 +300,54 @@ static void parse_solaris_x86(struct parsed_partitions *state, #endif } +/* check against BSD src/sys/sys/disklabel.h for consistency */ +#define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ +#define BSD_MAXPARTITIONS 16 +#define OPENBSD_MAXPARTITIONS 16 +#define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ +struct bsd_disklabel { + __le32 d_magic; /* the magic number */ + __s16 d_type; /* drive type */ + __s16 d_subtype; /* controller/d_type specific */ + char d_typename[16]; /* type name, e.g. "eagle" */ + char d_packname[16]; /* pack identifier */ + __u32 d_secsize; /* # of bytes per sector */ + __u32 d_nsectors; /* # of data sectors per track */ + __u32 d_ntracks; /* # of tracks per cylinder */ + __u32 d_ncylinders; /* # of data cylinders per unit */ + __u32 d_secpercyl; /* # of data sectors per cylinder */ + __u32 d_secperunit; /* # of data sectors per unit */ + __u16 d_sparespertrack; /* # of spare sectors per track */ + __u16 d_sparespercyl; /* # of spare sectors per cylinder */ + __u32 d_acylinders; /* # of alt. cylinders per unit */ + __u16 d_rpm; /* rotational speed */ + __u16 d_interleave; /* hardware sector interleave */ + __u16 d_trackskew; /* sector 0 skew, per track */ + __u16 d_cylskew; /* sector 0 skew, per cylinder */ + __u32 d_headswitch; /* head switch time, usec */ + __u32 d_trkseek; /* track-to-track seek, usec */ + __u32 d_flags; /* generic flags */ +#define NDDATA 5 + __u32 d_drivedata[NDDATA]; /* drive-type specific information */ +#define NSPARE 5 + __u32 d_spare[NSPARE]; /* reserved for future use */ + __le32 d_magic2; /* the magic number (again) */ + __le16 d_checksum; /* xor of data incl. partitions */ + + /* filesystem and partition information: */ + __le16 d_npartitions; /* number of partitions in following */ + __le32 d_bbsize; /* size of boot area at sn0, bytes */ + __le32 d_sbsize; /* max size of fs superblock, bytes */ + struct bsd_partition { /* the partition table */ + __le32 p_size; /* number of sectors in partition */ + __le32 p_offset; /* starting sector */ + __le32 p_fsize; /* filesystem basic fragment size */ + __u8 p_fstype; /* filesystem type, see below */ + __u8 p_frag; /* filesystem fragments per block */ + __le16 p_cpg; /* filesystem cylinders per group */ + } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ +}; + #if defined(CONFIG_BSD_DISKLABEL) /* * Create devices for BSD partitions listed in a disklabel, under a @@ -354,6 +432,51 @@ static void parse_openbsd(struct parsed_partitions *state, #endif } +#define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ +#define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ +#define UNIXWARE_NUMSLICE 16 +#define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ + +struct unixware_slice { + __le16 s_label; /* label */ + __le16 s_flags; /* permission flags */ + __le32 start_sect; /* starting sector */ + __le32 nr_sects; /* number of sectors in slice */ +}; + +struct unixware_disklabel { + __le32 d_type; /* drive type */ + __le32 d_magic; /* the magic number */ + __le32 d_version; /* version number */ + char d_serial[12]; /* serial number of the device */ + __le32 d_ncylinders; /* # of data cylinders per device */ + __le32 d_ntracks; /* # of tracks per cylinder */ + __le32 d_nsectors; /* # of data sectors per track */ + __le32 d_secsize; /* # of bytes per sector */ + __le32 d_part_start; /* # of first sector of this partition*/ + __le32 d_unknown1[12]; /* ? */ + __le32 d_alt_tbl; /* byte offset of alternate table */ + __le32 d_alt_len; /* byte length of alternate table */ + __le32 d_phys_cyl; /* # of physical cylinders per device */ + __le32 d_phys_trk; /* # of physical tracks per cylinder */ + __le32 d_phys_sec; /* # of physical sectors per track */ + __le32 d_phys_bytes; /* # of physical bytes per sector */ + __le32 d_unknown2; /* ? */ + __le32 d_unknown3; /* ? */ + __le32 d_pad[8]; /* pad */ + + struct unixware_vtoc { + __le32 v_magic; /* the magic number */ + __le32 v_version; /* version number */ + char v_name[8]; /* volume name */ + __le16 v_nslices; /* # of slices */ + __le16 v_unknown1; /* ? */ + __le32 v_reserved[10]; /* reserved */ + struct unixware_slice + v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ + } vtoc; +}; /* 408 */ + /* * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. @@ -397,6 +520,8 @@ static void parse_unixware(struct parsed_partitions *state, #endif } +#define MINIX_NR_SUBPARTITIONS 4 + /* * Minix 2.0.0/2.0.2 subpartition support. * Anand Krishnamurthy diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3050b0ee9cc7..da62b44b15be 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -473,149 +473,6 @@ static inline void set_capacity(struct gendisk *disk, sector_t size) disk->part0.nr_sects = size; } -#ifdef CONFIG_SOLARIS_X86_PARTITION - -#define SOLARIS_X86_NUMSLICE 16 -#define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) - -struct solaris_x86_slice { - __le16 s_tag; /* ID tag of partition */ - __le16 s_flag; /* permission flags */ - __le32 s_start; /* start sector no of partition */ - __le32 s_size; /* # of blocks in partition */ -}; - -struct solaris_x86_vtoc { - unsigned int v_bootinfo[3]; /* info needed by mboot (unsupported) */ - __le32 v_sanity; /* to verify vtoc sanity */ - __le32 v_version; /* layout version */ - char v_volume[8]; /* volume name */ - __le16 v_sectorsz; /* sector size in bytes */ - __le16 v_nparts; /* number of partitions */ - unsigned int v_reserved[10]; /* free space */ - struct solaris_x86_slice - v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ - unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp (unsupported) */ - char v_asciilabel[128]; /* for compatibility */ -}; - -#endif /* CONFIG_SOLARIS_X86_PARTITION */ - -#ifdef CONFIG_BSD_DISKLABEL -/* - * BSD disklabel support by Yossi Gottlieb - * updated by Marc Espie - */ - -/* check against BSD src/sys/sys/disklabel.h for consistency */ - -#define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ -#define BSD_MAXPARTITIONS 16 -#define OPENBSD_MAXPARTITIONS 16 -#define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ -struct bsd_disklabel { - __le32 d_magic; /* the magic number */ - __s16 d_type; /* drive type */ - __s16 d_subtype; /* controller/d_type specific */ - char d_typename[16]; /* type name, e.g. "eagle" */ - char d_packname[16]; /* pack identifier */ - __u32 d_secsize; /* # of bytes per sector */ - __u32 d_nsectors; /* # of data sectors per track */ - __u32 d_ntracks; /* # of tracks per cylinder */ - __u32 d_ncylinders; /* # of data cylinders per unit */ - __u32 d_secpercyl; /* # of data sectors per cylinder */ - __u32 d_secperunit; /* # of data sectors per unit */ - __u16 d_sparespertrack; /* # of spare sectors per track */ - __u16 d_sparespercyl; /* # of spare sectors per cylinder */ - __u32 d_acylinders; /* # of alt. cylinders per unit */ - __u16 d_rpm; /* rotational speed */ - __u16 d_interleave; /* hardware sector interleave */ - __u16 d_trackskew; /* sector 0 skew, per track */ - __u16 d_cylskew; /* sector 0 skew, per cylinder */ - __u32 d_headswitch; /* head switch time, usec */ - __u32 d_trkseek; /* track-to-track seek, usec */ - __u32 d_flags; /* generic flags */ -#define NDDATA 5 - __u32 d_drivedata[NDDATA]; /* drive-type specific information */ -#define NSPARE 5 - __u32 d_spare[NSPARE]; /* reserved for future use */ - __le32 d_magic2; /* the magic number (again) */ - __le16 d_checksum; /* xor of data incl. partitions */ - - /* filesystem and partition information: */ - __le16 d_npartitions; /* number of partitions in following */ - __le32 d_bbsize; /* size of boot area at sn0, bytes */ - __le32 d_sbsize; /* max size of fs superblock, bytes */ - struct bsd_partition { /* the partition table */ - __le32 p_size; /* number of sectors in partition */ - __le32 p_offset; /* starting sector */ - __le32 p_fsize; /* filesystem basic fragment size */ - __u8 p_fstype; /* filesystem type, see below */ - __u8 p_frag; /* filesystem fragments per block */ - __le16 p_cpg; /* filesystem cylinders per group */ - } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ -}; - -#endif /* CONFIG_BSD_DISKLABEL */ - -#ifdef CONFIG_UNIXWARE_DISKLABEL -/* - * Unixware slices support by Andrzej Krzysztofowicz - * and Krzysztof G. Baranowski - */ - -#define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ -#define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ -#define UNIXWARE_NUMSLICE 16 -#define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ - -struct unixware_slice { - __le16 s_label; /* label */ - __le16 s_flags; /* permission flags */ - __le32 start_sect; /* starting sector */ - __le32 nr_sects; /* number of sectors in slice */ -}; - -struct unixware_disklabel { - __le32 d_type; /* drive type */ - __le32 d_magic; /* the magic number */ - __le32 d_version; /* version number */ - char d_serial[12]; /* serial number of the device */ - __le32 d_ncylinders; /* # of data cylinders per device */ - __le32 d_ntracks; /* # of tracks per cylinder */ - __le32 d_nsectors; /* # of data sectors per track */ - __le32 d_secsize; /* # of bytes per sector */ - __le32 d_part_start; /* # of first sector of this partition */ - __le32 d_unknown1[12]; /* ? */ - __le32 d_alt_tbl; /* byte offset of alternate table */ - __le32 d_alt_len; /* byte length of alternate table */ - __le32 d_phys_cyl; /* # of physical cylinders per device */ - __le32 d_phys_trk; /* # of physical tracks per cylinder */ - __le32 d_phys_sec; /* # of physical sectors per track */ - __le32 d_phys_bytes; /* # of physical bytes per sector */ - __le32 d_unknown2; /* ? */ - __le32 d_unknown3; /* ? */ - __le32 d_pad[8]; /* pad */ - - struct unixware_vtoc { - __le32 v_magic; /* the magic number */ - __le32 v_version; /* version number */ - char v_name[8]; /* volume name */ - __le16 v_nslices; /* # of slices */ - __le16 v_unknown1; /* ? */ - __le32 v_reserved[10]; /* reserved */ - struct unixware_slice - v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ - } vtoc; - -}; /* 408 */ - -#endif /* CONFIG_UNIXWARE_DISKLABEL */ - -#ifdef CONFIG_MINIX_SUBPARTITION -# define MINIX_NR_SUBPARTITIONS 4 -#endif /* CONFIG_MINIX_SUBPARTITION */ - #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 From 387048bf67eeff8bdf7c2a41b03b48230a88b3d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:30 +0100 Subject: [PATCH 52/72] block: merge partition-generic.c and check.c Merge block/partition-generic.c and block/partitions/check.c into a single block/partitions/core.c as the content is closely related and both files are tiny. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Makefile | 3 +- block/partitions/Makefile | 3 +- block/partitions/check.c | 179 ------------------ block/partitions/check.h | 5 - .../core.c} | 178 +++++++++++++++-- 5 files changed, 166 insertions(+), 202 deletions(-) delete mode 100644 block/partitions/check.c rename block/{partition-generic.c => partitions/core.c} (77%) diff --git a/block/Makefile b/block/Makefile index 1a43750f4b01..206b96e9387f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,8 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ - genhd.o partition-generic.o ioprio.o \ - badblocks.o partitions/ blk-rq-qos.o + genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 2f276b677c81..a7f05cdb02a8 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -3,8 +3,7 @@ # Makefile for the linux kernel. # -obj-$(CONFIG_BLOCK) := check.o - +obj-$(CONFIG_BLOCK) += core.o obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o diff --git a/block/partitions/check.c b/block/partitions/check.c deleted file mode 100644 index 944c478b6f0b..000000000000 --- a/block/partitions/check.c +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * fs/partitions/check.c - * - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - * - * We now have independent partition support from the - * block drivers, which allows all the partition code to - * be grouped in one location, and it to be mostly self - * contained. - * - * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl} - */ - -#include -#include -#include -#include - -#include "check.h" - -static int (*check_part[])(struct parsed_partitions *) = { - /* - * Probe partition formats with tables at disk address 0 - * that also have an ADFS boot block at 0xdc0. - */ -#ifdef CONFIG_ACORN_PARTITION_ICS - adfspart_check_ICS, -#endif -#ifdef CONFIG_ACORN_PARTITION_POWERTEC - adfspart_check_POWERTEC, -#endif -#ifdef CONFIG_ACORN_PARTITION_EESOX - adfspart_check_EESOX, -#endif - - /* - * Now move on to formats that only have partition info at - * disk address 0xdc0. Since these may also have stale - * PC/BIOS partition tables, they need to come before - * the msdos entry. - */ -#ifdef CONFIG_ACORN_PARTITION_CUMANA - adfspart_check_CUMANA, -#endif -#ifdef CONFIG_ACORN_PARTITION_ADFS - adfspart_check_ADFS, -#endif - -#ifdef CONFIG_CMDLINE_PARTITION - cmdline_partition, -#endif -#ifdef CONFIG_EFI_PARTITION - efi_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_SGI_PARTITION - sgi_partition, -#endif -#ifdef CONFIG_LDM_PARTITION - ldm_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_MSDOS_PARTITION - msdos_partition, -#endif -#ifdef CONFIG_OSF_PARTITION - osf_partition, -#endif -#ifdef CONFIG_SUN_PARTITION - sun_partition, -#endif -#ifdef CONFIG_AMIGA_PARTITION - amiga_partition, -#endif -#ifdef CONFIG_ATARI_PARTITION - atari_partition, -#endif -#ifdef CONFIG_MAC_PARTITION - mac_partition, -#endif -#ifdef CONFIG_ULTRIX_PARTITION - ultrix_partition, -#endif -#ifdef CONFIG_IBM_PARTITION - ibm_partition, -#endif -#ifdef CONFIG_KARMA_PARTITION - karma_partition, -#endif -#ifdef CONFIG_SYSV68_PARTITION - sysv68_partition, -#endif - NULL -}; - -static struct parsed_partitions *allocate_partitions(struct gendisk *hd) -{ - struct parsed_partitions *state; - int nr; - - state = kzalloc(sizeof(*state), GFP_KERNEL); - if (!state) - return NULL; - - nr = disk_max_parts(hd); - state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); - if (!state->parts) { - kfree(state); - return NULL; - } - - state->limit = nr; - - return state; -} - -void free_partitions(struct parsed_partitions *state) -{ - vfree(state->parts); - kfree(state); -} - -struct parsed_partitions * -check_partition(struct gendisk *hd, struct block_device *bdev) -{ - struct parsed_partitions *state; - int i, res, err; - - state = allocate_partitions(hd); - if (!state) - return NULL; - state->pp_buf = (char *)__get_free_page(GFP_KERNEL); - if (!state->pp_buf) { - free_partitions(state); - return NULL; - } - state->pp_buf[0] = '\0'; - - state->bdev = bdev; - disk_name(hd, 0, state->name); - snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); - if (isdigit(state->name[strlen(state->name)-1])) - sprintf(state->name, "p"); - - i = res = err = 0; - while (!res && check_part[i]) { - memset(state->parts, 0, state->limit * sizeof(state->parts[0])); - res = check_part[i++](state); - if (res < 0) { - /* We have hit an I/O error which we don't report now. - * But record it, and let the others do their job. - */ - err = res; - res = 0; - } - - } - if (res > 0) { - printk(KERN_INFO "%s", state->pp_buf); - - free_page((unsigned long)state->pp_buf); - return state; - } - if (state->access_beyond_eod) - err = -ENOSPC; - if (err) - /* The partition is unrecognized. So report I/O errors if there were any */ - res = err; - if (res) { - strlcat(state->pp_buf, - " unable to read partition table\n", PAGE_SIZE); - printk(KERN_INFO "%s", state->pp_buf); - } - - free_page((unsigned long)state->pp_buf); - free_partitions(state); - return ERR_PTR(res); -} diff --git a/block/partitions/check.h b/block/partitions/check.h index 23e7adb79617..f845355489ec 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -23,11 +23,6 @@ struct parsed_partitions { char *pp_buf; }; -void free_partitions(struct parsed_partitions *state); - -struct parsed_partitions * -check_partition(struct gendisk *, struct block_device *); - typedef struct { struct page *v; } Sector; diff --git a/block/partition-generic.c b/block/partitions/core.c similarity index 77% rename from block/partition-generic.c rename to block/partitions/core.c index 4d771ae835ed..b442bc209b86 100644 --- a/block/partition-generic.c +++ b/block/partitions/core.c @@ -1,27 +1,177 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - * - * We now have independent partition support from the - * block drivers, which allows all the partition code to - * be grouped in one location, and it to be mostly self - * contained. + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King */ - -#include -#include #include #include -#include #include #include +#include #include #include -#include "blk.h" +#include "../blk.h" +#include "check.h" -#include "partitions/check.h" +static int (*check_part[])(struct parsed_partitions *) = { + /* + * Probe partition formats with tables at disk address 0 + * that also have an ADFS boot block at 0xdc0. + */ +#ifdef CONFIG_ACORN_PARTITION_ICS + adfspart_check_ICS, +#endif +#ifdef CONFIG_ACORN_PARTITION_POWERTEC + adfspart_check_POWERTEC, +#endif +#ifdef CONFIG_ACORN_PARTITION_EESOX + adfspart_check_EESOX, +#endif + + /* + * Now move on to formats that only have partition info at + * disk address 0xdc0. Since these may also have stale + * PC/BIOS partition tables, they need to come before + * the msdos entry. + */ +#ifdef CONFIG_ACORN_PARTITION_CUMANA + adfspart_check_CUMANA, +#endif +#ifdef CONFIG_ACORN_PARTITION_ADFS + adfspart_check_ADFS, +#endif + +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif +#ifdef CONFIG_EFI_PARTITION + efi_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_SGI_PARTITION + sgi_partition, +#endif +#ifdef CONFIG_LDM_PARTITION + ldm_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_MSDOS_PARTITION + msdos_partition, +#endif +#ifdef CONFIG_OSF_PARTITION + osf_partition, +#endif +#ifdef CONFIG_SUN_PARTITION + sun_partition, +#endif +#ifdef CONFIG_AMIGA_PARTITION + amiga_partition, +#endif +#ifdef CONFIG_ATARI_PARTITION + atari_partition, +#endif +#ifdef CONFIG_MAC_PARTITION + mac_partition, +#endif +#ifdef CONFIG_ULTRIX_PARTITION + ultrix_partition, +#endif +#ifdef CONFIG_IBM_PARTITION + ibm_partition, +#endif +#ifdef CONFIG_KARMA_PARTITION + karma_partition, +#endif +#ifdef CONFIG_SYSV68_PARTITION + sysv68_partition, +#endif + NULL +}; + +static struct parsed_partitions *allocate_partitions(struct gendisk *hd) +{ + struct parsed_partitions *state; + int nr; + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return NULL; + + nr = disk_max_parts(hd); + state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); + if (!state->parts) { + kfree(state); + return NULL; + } + + state->limit = nr; + + return state; +} + +static void free_partitions(struct parsed_partitions *state) +{ + vfree(state->parts); + kfree(state); +} + +static struct parsed_partitions *check_partition(struct gendisk *hd, + struct block_device *bdev) +{ + struct parsed_partitions *state; + int i, res, err; + + state = allocate_partitions(hd); + if (!state) + return NULL; + state->pp_buf = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf) { + free_partitions(state); + return NULL; + } + state->pp_buf[0] = '\0'; + + state->bdev = bdev; + disk_name(hd, 0, state->name); + snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); + if (isdigit(state->name[strlen(state->name)-1])) + sprintf(state->name, "p"); + + i = res = err = 0; + while (!res && check_part[i]) { + memset(state->parts, 0, state->limit * sizeof(state->parts[0])); + res = check_part[i++](state); + if (res < 0) { + /* + * We have hit an I/O error which we don't report now. + * But record it, and let the others do their job. + */ + err = res; + res = 0; + } + + } + if (res > 0) { + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + return state; + } + if (state->access_beyond_eod) + err = -ENOSPC; + /* + * The partition is unrecognized. So report I/O errors if there were any + */ + if (err) + res = err; + if (res) { + strlcat(state->pp_buf, + " unable to read partition table\n", PAGE_SIZE); + printk(KERN_INFO "%s", state->pp_buf); + } + + free_page((unsigned long)state->pp_buf); + free_partitions(state); + return ERR_PTR(res); +} static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) From 2b8bd423614c595540eaadcfbc702afe8e155e50 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:04 +0300 Subject: [PATCH 53/72] block/diskstats: more accurate approximation of io_ticks for slow disks Currently io_ticks is approximated by adding one at each start and end of requests if jiffies counter has changed. This works perfectly for requests shorter than a jiffy or if one of requests starts/ends at each jiffy. If disk executes just one request at a time and they are longer than two jiffies then only first and last jiffies will be accounted. Fix is simple: at the end of request add up into io_ticks jiffies passed since last update rather than just one jiffy. Example: common HDD executes random read 4k requests around 12ms. fio --name=test --filename=/dev/sdb --rw=randread --direct=1 --runtime=30 & iostat -x 10 sdb Note changes of iostat's "%util" 8,43% -> 99,99% before/after patch: Before: Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0,00 0,00 82,60 0,00 330,40 0,00 8,00 0,96 12,09 12,09 0,00 1,02 8,43 After: Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0,00 0,00 82,50 0,00 330,00 0,00 8,00 1,00 12,10 12,10 0,00 12,12 99,99 Now io_ticks does not loose time between start and end of requests, but for queue-depth > 1 some I/O time between adjacent starts might be lost. For load estimation "%util" is not as useful as average queue length, but it clearly shows how often disk queue is completely empty. Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting") Signed-off-by: Konstantin Khlebnikov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- Documentation/admin-guide/iostats.rst | 5 ++++- block/bio.c | 8 ++++---- block/blk-core.c | 4 ++-- include/linux/genhd.h | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst index df5b8345c41d..9b14b0c2c9c4 100644 --- a/Documentation/admin-guide/iostats.rst +++ b/Documentation/admin-guide/iostats.rst @@ -100,7 +100,7 @@ Field 10 -- # of milliseconds spent doing I/Os (unsigned int) Since 5.0 this field counts jiffies when at least one request was started or completed. If request runs more than 2 jiffies then some - I/O time will not be accounted unless there are other requests. + I/O time might be not accounted in case of concurrent requests. Field 11 -- weighted # of milliseconds spent doing I/Os (unsigned int) This field is incremented at each I/O start, I/O completion, I/O @@ -143,6 +143,9 @@ are summed (possibly overflowing the unsigned long variable they are summed to) and the result given to the user. There is no convenient user interface for accessing the per-CPU counters themselves. +Since 4.19 request times are measured with nanoseconds precision and +truncated to milliseconds before showing in this interface. + Disks vs Partitions ------------------- diff --git a/block/bio.c b/block/bio.c index 209715765a7a..68f65ef2ceba 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1768,14 +1768,14 @@ defer: schedule_work(&bio_dirty_work); } -void update_io_ticks(struct hd_struct *part, unsigned long now) +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->stamp); if (unlikely(stamp != now)) { if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) { - __part_stat_add(part, io_ticks, 1); + __part_stat_add(part, io_ticks, end ? now - stamp : 1); } } if (part->partno) { @@ -1791,7 +1791,7 @@ void generic_start_io_acct(struct request_queue *q, int op, part_stat_lock(); - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, false); part_stat_inc(part, ios[sgrp]); part_stat_add(part, sectors[sgrp], sectors); part_inc_in_flight(q, part, op_is_write(op)); @@ -1809,7 +1809,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op, part_stat_lock(); - update_io_ticks(part, now); + update_io_ticks(part, now, true); part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_add(part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); diff --git a/block/blk-core.c b/block/blk-core.c index abfdcf81a228..4401b30a1751 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1337,7 +1337,7 @@ void blk_account_io_done(struct request *req, u64 now) part_stat_lock(); part = req->part; - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); @@ -1379,7 +1379,7 @@ void blk_account_io_start(struct request *rq, bool new_io) rq->part = part; } - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, false); part_stat_unlock(); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index da62b44b15be..13bb51f37b3f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -422,7 +422,7 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw); -void update_io_ticks(struct hd_struct *part, unsigned long now); +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, From ea18e0f0a63af9064db3d4065d90fa743ae0991b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:06 +0300 Subject: [PATCH 54/72] block/diskstats: accumulate all per-cpu counters in one pass Reading /proc/diskstats iterates over all cpus for summing each field. It's faster to sum all fields in one pass. Hammering /proc/diskstats with fio shows 2x performance improvement: fio --name=test --numjobs=$JOBS --filename=/proc/diskstats \ --size=1k --bs=1k --fallocate=none --create_on_open=1 \ --time_based=1 --runtime=10 --invalidate=0 --group_report JOBS=1 JOBS=10 Before: 7k iops 64k iops After: 18k iops 120k iops Also this way code is more compact: add/remove: 1/0 grow/shrink: 0/2 up/down: 194/-1540 (-1346) Function old new delta part_stat_read_all - 194 +194 diskstats_show 1344 631 -713 part_stat_show 1219 392 -827 Total: Before=14966947, After=14965601, chg -0.01% Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/genhd.c | 102 +++++++++++++++++++++++++++++------------- include/linux/genhd.h | 3 -- 2 files changed, 70 insertions(+), 35 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index f7d60b620b97..9eb981f7e5a4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -92,6 +92,34 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); +#ifdef CONFIG_SMP +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + int cpu; + + memset(stat, 0, sizeof(struct disk_stats)); + for_each_possible_cpu(cpu) { + struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + int group; + + for (group = 0; group < NR_STAT_GROUPS; group++) { + stat->nsecs[group] += ptr->nsecs[group]; + stat->sectors[group] += ptr->sectors[group]; + stat->ios[group] += ptr->ios[group]; + stat->merges[group] += ptr->merges[group]; + } + + stat->io_ticks += ptr->io_ticks; + stat->time_in_queue += ptr->time_in_queue; + } +} +#else /* CONFIG_SMP */ +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + memcpy(stat, &part->dkstats, sizeof(struct disk_stats)); +} +#endif /* CONFIG_SMP */ + void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { if (queue_is_mq(q)) @@ -1214,9 +1242,12 @@ ssize_t part_stat_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); struct request_queue *q = part_to_disk(p)->queue; + struct disk_stats stat; unsigned int inflight; + part_stat_read_all(p, &stat); inflight = part_in_flight(q, p); + return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -1224,23 +1255,23 @@ ssize_t part_stat_show(struct device *dev, "%8lu %8lu %8llu %8u " "%8lu %8u" "\n", - part_stat_read(p, ios[STAT_READ]), - part_stat_read(p, merges[STAT_READ]), - (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(p, STAT_READ), - part_stat_read(p, ios[STAT_WRITE]), - part_stat_read(p, merges[STAT_WRITE]), - (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(p, STAT_WRITE), + stat.ios[STAT_READ], + stat.merges[STAT_READ], + (unsigned long long)stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + (unsigned long long)stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, - jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue)), - part_stat_read(p, ios[STAT_DISCARD]), - part_stat_read(p, merges[STAT_DISCARD]), - (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), - part_stat_read(p, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); + jiffies_to_msecs(stat.io_ticks), + jiffies_to_msecs(stat.time_in_queue), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + (unsigned long long)stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, @@ -1492,6 +1523,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct hd_struct *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight; + struct disk_stats stat; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1503,7 +1535,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { + part_stat_read_all(hd, &stat); inflight = part_in_flight(gp->queue, hd); + seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " "%lu %lu %lu %u " @@ -1513,23 +1547,27 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[STAT_READ]), - part_stat_read(hd, merges[STAT_READ]), - part_stat_read(hd, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(hd, STAT_READ), - part_stat_read(hd, ios[STAT_WRITE]), - part_stat_read(hd, merges[STAT_WRITE]), - part_stat_read(hd, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), + stat.ios[STAT_READ], + stat.merges[STAT_READ], + stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], + NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], + NSEC_PER_MSEC), inflight, - jiffies_to_msecs(part_stat_read(hd, io_ticks)), - jiffies_to_msecs(part_stat_read(hd, time_in_queue)), - part_stat_read(hd, ios[STAT_DISCARD]), - part_stat_read(hd, merges[STAT_DISCARD]), - part_stat_read(hd, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD), - part_stat_read(hd, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(hd, STAT_FLUSH) + jiffies_to_msecs(stat.io_ticks), + jiffies_to_msecs(stat.time_in_queue), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], + NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC) ); } disk_part_iter_exit(&piter); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 13bb51f37b3f..b0c588d1aa29 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -380,9 +380,6 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ -#define part_stat_read_msecs(part, which) \ - div_u64(part_stat_read(part, nsecs[which]), NSEC_PER_MSEC) - #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ From 8cd5b8fc00716fb71f6b32d594b38a8f286d6c20 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:08 +0300 Subject: [PATCH 55/72] block/diskstats: replace time_in_queue with sum of request times Column "time_in_queue" in diskstats is supposed to show total waiting time of all requests. I.e. value should be equal to the sum of times from other columns. But this is not true, because column "time_in_queue" is counted separately in jiffies rather than in nanoseconds as other times. This patch removes redundant counter for "time_in_queue" and shows total time of read, write, discard and flush requests. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/bio.c | 1 - block/blk-core.c | 1 - block/genhd.c | 13 ++++++++++--- include/linux/genhd.h | 1 - 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/block/bio.c b/block/bio.c index 68f65ef2ceba..bc9152977bf0 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1811,7 +1811,6 @@ void generic_end_io_acct(struct request_queue *q, int req_op, update_io_ticks(part, now, true); part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_add(part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 4401b30a1751..eaf6cb3887e6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1340,7 +1340,6 @@ void blk_account_io_done(struct request *req, u64 now) update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); - part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); diff --git a/block/genhd.c b/block/genhd.c index 9eb981f7e5a4..792356e922a1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -110,7 +110,6 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } stat->io_ticks += ptr->io_ticks; - stat->time_in_queue += ptr->time_in_queue; } } #else /* CONFIG_SMP */ @@ -1265,7 +1264,11 @@ ssize_t part_stat_show(struct device *dev, (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), - jiffies_to_msecs(stat.time_in_queue), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], (unsigned long long)stat.sectors[STAT_DISCARD], @@ -1559,7 +1562,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), - jiffies_to_msecs(stat.time_in_queue), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], stat.sectors[STAT_DISCARD], diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b0c588d1aa29..790fdc3e0b3d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -46,7 +46,6 @@ struct disk_stats { unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; unsigned long io_ticks; - unsigned long time_in_queue; local_t in_flight[2]; }; From c92a41031a6d57395889b5c87cea359220a24d2a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 25 Mar 2020 00:24:44 +0900 Subject: [PATCH 56/72] block: factor out requeue handling from dispatch code Factor out the requeue handling from the dispatch code, this will make subsequent addition of different requeueing schemes easier. Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5b2e6550e0b6..745ec592a513 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1178,6 +1178,23 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ +static void blk_mq_handle_dev_resource(struct request *rq, + struct list_head *list) +{ + struct request *next = + list_first_entry_or_null(list, struct request, queuelist); + + /* + * If an I/O scheduler has been configured and we got a driver tag for + * the next request already, free it. + */ + if (next) + blk_mq_put_driver_tag(next); + + list_add(&rq->queuelist, list); + __blk_mq_requeue_request(rq); +} + /* * Returns true if we did some work AND can potentially do more. */ @@ -1245,17 +1262,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - /* - * If an I/O scheduler has been configured and we got a - * driver tag for the next request already, free it - * again. - */ - if (!list_empty(list)) { - nxt = list_first_entry(list, struct request, queuelist); - blk_mq_put_driver_tag(nxt); - } - list_add(&rq->queuelist, list); - __blk_mq_requeue_request(rq); + blk_mq_handle_dev_resource(rq, list); break; } From 31eb6186797c149665fd44f3847bdf8d539efa59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:35 +0100 Subject: [PATCH 57/72] block: mark block_depr static Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- include/linux/genhd.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 792356e922a1..7dafd7504493 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,7 +27,7 @@ #include "blk.h" static DEFINE_MUTEX(block_class_lock); -struct kobject *block_depr; +static struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 790fdc3e0b3d..b322e805a916 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -27,7 +27,6 @@ #define part_to_dev(part) (&((part)->__dev)) extern struct device_type part_type; -extern struct kobject *block_depr; extern struct class block_class; #define DISK_MAX_PARTS 256 From 6005771c17db56b6a9acc12bd084134191560e18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:36 +0100 Subject: [PATCH 58/72] block: mark part_in_flight and part_in_flight_rw static Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 7 ++++--- include/linux/genhd.h | 3 --- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 7dafd7504493..5f9df331822a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -139,7 +139,8 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]); } -unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) +static unsigned int part_in_flight(struct request_queue *q, + struct hd_struct *part) { int cpu; unsigned int inflight; @@ -159,8 +160,8 @@ unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) return inflight; } -void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) { int cpu; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b322e805a916..c0c5bb51fa56 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -409,9 +409,6 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_local_read_cpu(gendiskp, field, cpu) \ local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) -unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part); -void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, From 572e7fc85b7ac3bfbf4d4ec7d2c6e7ad3507f57a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:37 +0100 Subject: [PATCH 59/72] block: unexport disk_get_part disk_get_part is not used by any modular code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 5f9df331822a..0ee74b7e01f4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -217,7 +217,6 @@ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) return part; } -EXPORT_SYMBOL_GPL(disk_get_part); /** * disk_part_iter_init - initialize partition iterator From a7818aedda7101f2270d5495aa4c4114b13510bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:38 +0100 Subject: [PATCH 60/72] block: unexport disk_map_sector_rcu disk_map_sector_rcu is not used by any modular code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 0ee74b7e01f4..1e4855c8265a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -372,7 +372,6 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) } return &disk->part0; } -EXPORT_SYMBOL_GPL(disk_map_sector_rcu); /* * Can be deleted altogether. Later. From 1b4d4dbdaeb7087122a39d3fb9ae32487e001b6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:39 +0100 Subject: [PATCH 61/72] block: unexport get_gendisk get_gendisk is not used by any modular code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 1e4855c8265a..6323cc789efa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -980,7 +980,6 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } return disk; } -EXPORT_SYMBOL(get_gendisk); /** * bdget_disk - do bdget() by gendisk and partition number From 29125ed624eeb3ac2eb7bca313a8de29c1c84dcd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:40 +0100 Subject: [PATCH 62/72] block: move guard_bio_eod to bio.c This is bio layer functionality and not related to buffer heads. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/buffer.c | 43 ------------------------------------------- fs/internal.h | 1 - include/linux/bio.h | 1 + 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/block/bio.c b/block/bio.c index bc9152977bf0..11e6aac35092 100644 --- a/block/bio.c +++ b/block/bio.c @@ -588,6 +588,49 @@ void bio_truncate(struct bio *bio, unsigned new_size) bio->bi_iter.bi_size = new_size; } +/** + * guard_bio_eod - truncate a BIO to fit the block device + * @bio: bio to truncate + * + * This allows us to do IO even on the odd last sectors of a device, even if the + * block size is some multiple of the physical sector size. + * + * We'll just truncate the bio to the size of the device, and clear the end of + * the buffer head manually. Truly out-of-range accesses will turn into actual + * I/O errors, this only handles the "we need to be able to do I/O at the final + * sector" case. + */ +void guard_bio_eod(struct bio *bio) +{ + sector_t maxsector; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); + + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_iter.bi_sector; + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) + return; + + bio_truncate(bio, maxsector << 9); +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to diff --git a/fs/buffer.c b/fs/buffer.c index b8d28370cfd7..3f5758e01e40 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3019,49 +3019,6 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } -/* - * This allows us to do IO even on the odd last sectors - * of a device, even if the block size is some multiple - * of the physical sector size. - * - * We'll just truncate the bio to the size of the device, - * and clear the end of the buffer head manually. - * - * Truly out-of-range accesses will turn into actual IO - * errors, this only handles the "we need to be able to - * do IO at the final sector" case. - */ -void guard_bio_eod(struct bio *bio) -{ - sector_t maxsector; - struct hd_struct *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = part_nr_sects_read(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); - - if (!maxsector) - return; - - /* - * If the *whole* IO is past the end of the device, - * let it through, and the IO layer will turn it into - * an EIO. - */ - if (unlikely(bio->bi_iter.bi_sector >= maxsector)) - return; - - maxsector -= bio->bi_iter.bi_sector; - if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) - return; - - bio_truncate(bio, maxsector << 9); -} - static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, enum rw_hint write_hint, struct writeback_control *wbc) { diff --git a/fs/internal.h b/fs/internal.h index f3f280b952a3..4d37912a5587 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -38,7 +38,6 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) /* * buffer.c */ -extern void guard_bio_eod(struct bio *bio); extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, get_block_t *get_block, struct iomap *iomap); diff --git a/include/linux/bio.h b/include/linux/bio.h index 853d92ceee64..a430e9c1c2d2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -471,6 +471,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); void bio_truncate(struct bio *bio, unsigned new_size); +void guard_bio_eod(struct bio *bio); static inline void zero_fill_bio(struct bio *bio) { From 581e26004a09c50e5017caadc850ea17e374a5ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:41 +0100 Subject: [PATCH 63/72] block: move block layer internals out of include/linux/genhd.h None of this needs to be exposed to drivers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 116 +++++++++++++++++++++++++++++++++++++ block/ioctl.c | 1 + block/partitions/check.h | 1 + block/partitions/core.c | 1 - include/linux/genhd.h | 120 --------------------------------------- 5 files changed, 118 insertions(+), 121 deletions(-) diff --git a/block/blk.h b/block/blk.h index 43df9dcb3d4e..ac20f972842e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -149,6 +149,9 @@ static inline bool integrity_req_gap_front_merge(struct request *req, return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } + +void blk_integrity_add(struct gendisk *); +void blk_integrity_del(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) @@ -171,6 +174,12 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } +static inline void blk_integrity_add(struct gendisk *disk) +{ +} +static inline void blk_integrity_del(struct gendisk *disk) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); @@ -365,4 +374,111 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q); static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end); +struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); + +int blk_alloc_devt(struct hd_struct *part, dev_t *devt); +void blk_free_devt(dev_t devt); +void blk_invalidate_devt(dev_t devt); +char *disk_name(struct gendisk *hd, int partno, char *buf); +#define ADDPART_FLAG_NONE 0 +#define ADDPART_FLAG_RAID 1 +#define ADDPART_FLAG_WHOLEDISK 2 +struct hd_struct *__must_check add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags, + struct partition_meta_info *info); +void __delete_partition(struct percpu_ref *ref); +void delete_partition(struct gendisk *disk, int partno); +int disk_expand_part_tbl(struct gendisk *disk, int target); + +static inline int hd_ref_init(struct hd_struct *part) +{ + if (percpu_ref_init(&part->ref, __delete_partition, 0, + GFP_KERNEL)) + return -ENOMEM; + return 0; +} + +static inline void hd_struct_get(struct hd_struct *part) +{ + percpu_ref_get(&part->ref); +} + +static inline int hd_struct_try_get(struct hd_struct *part) +{ + return percpu_ref_tryget_live(&part->ref); +} + +static inline void hd_struct_put(struct hd_struct *part) +{ + percpu_ref_put(&part->ref); +} + +static inline void hd_struct_kill(struct hd_struct *part) +{ + percpu_ref_kill(&part->ref); +} + +static inline void hd_free_part(struct hd_struct *part) +{ + free_part_stats(part); + kfree(part->info); + percpu_ref_exit(&part->ref); +} + +/* + * Any access of part->nr_sects which is not protected by partition + * bd_mutex or gendisk bdev bd_mutex, should be done using this + * accessor function. + * + * Code written along the lines of i_size_read() and i_size_write(). + * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption + * on. + */ +static inline sector_t part_nr_sects_read(struct hd_struct *part) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + sector_t nr_sects; + unsigned seq; + do { + seq = read_seqcount_begin(&part->nr_sects_seq); + nr_sects = part->nr_sects; + } while (read_seqcount_retry(&part->nr_sects_seq, seq)); + return nr_sects; +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + sector_t nr_sects; + + preempt_disable(); + nr_sects = part->nr_sects; + preempt_enable(); + return nr_sects; +#else + return part->nr_sects; +#endif +} + +/* + * Should be called with mutex lock held (typically bd_mutex) of partition + * to provide mutual exlusion among writers otherwise seqcount might be + * left in wrong state leaving the readers spinning infinitely. + */ +static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&part->nr_sects_seq); + part->nr_sects = size; + write_seqcount_end(&part->nr_sects_seq); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + part->nr_sects = size; + preempt_enable(); +#else + part->nr_sects = size; +#endif +} + #endif /* BLK_INTERNAL_H */ diff --git a/block/ioctl.c b/block/ioctl.c index 127194b9f9bd..6e827de1a4c4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -11,6 +11,7 @@ #include #include #include +#include "blk.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) diff --git a/block/partitions/check.h b/block/partitions/check.h index f845355489ec..c577e9ee67f0 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -2,6 +2,7 @@ #include #include #include +#include "../blk.h" /* * add_gd_partition adds a partitions details to the devices partition diff --git a/block/partitions/core.c b/block/partitions/core.c index b442bc209b86..b79c4513629b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -10,7 +10,6 @@ #include #include #include -#include "../blk.h" #include "check.h" static int (*check_part[])(struct parsed_partitions *) = { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c0c5bb51fa56..14354a6e89c2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -298,9 +298,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter, extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); -extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, - sector_t sector); - /* * Macros to operate on percpu disk statistics: * @@ -409,13 +406,6 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_local_read_cpu(gendiskp, field, cpu) \ local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) -void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, - int rw); -void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, - int rw); - -void update_io_ticks(struct hd_struct *part, unsigned long now, bool end); - /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); @@ -465,27 +455,11 @@ static inline void set_capacity(struct gendisk *disk, sector_t size) disk->part0.nr_sects = size; } -#define ADDPART_FLAG_NONE 0 -#define ADDPART_FLAG_RAID 1 -#define ADDPART_FLAG_WHOLEDISK 2 - -extern int blk_alloc_devt(struct hd_struct *part, dev_t *devt); -extern void blk_free_devt(dev_t devt); -extern void blk_invalidate_devt(dev_t devt); extern dev_t blk_lookup_devt(const char *name, int partno); -extern char *disk_name (struct gendisk *hd, int partno, char *buf); int bdev_disk_changed(struct block_device *bdev, bool invalidate); int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev); -extern int disk_expand_part_tbl(struct gendisk *disk, int target); -extern struct hd_struct * __must_check add_partition(struct gendisk *disk, - int partno, sector_t start, - sector_t len, int flags, - struct partition_meta_info - *info); -extern void __delete_partition(struct percpu_ref *); -extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); extern struct gendisk *__alloc_disk_node(int minors, int node_id); @@ -517,100 +491,6 @@ extern void blk_unregister_region(dev_t devt, unsigned long range); #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) -static inline int hd_ref_init(struct hd_struct *part) -{ - if (percpu_ref_init(&part->ref, __delete_partition, 0, - GFP_KERNEL)) - return -ENOMEM; - return 0; -} - -static inline void hd_struct_get(struct hd_struct *part) -{ - percpu_ref_get(&part->ref); -} - -static inline int hd_struct_try_get(struct hd_struct *part) -{ - return percpu_ref_tryget_live(&part->ref); -} - -static inline void hd_struct_put(struct hd_struct *part) -{ - percpu_ref_put(&part->ref); -} - -static inline void hd_struct_kill(struct hd_struct *part) -{ - percpu_ref_kill(&part->ref); -} - -static inline void hd_free_part(struct hd_struct *part) -{ - free_part_stats(part); - kfree(part->info); - percpu_ref_exit(&part->ref); -} - -/* - * Any access of part->nr_sects which is not protected by partition - * bd_mutex or gendisk bdev bd_mutex, should be done using this - * accessor function. - * - * Code written along the lines of i_size_read() and i_size_write(). - * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption - * on. - */ -static inline sector_t part_nr_sects_read(struct hd_struct *part) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - sector_t nr_sects; - unsigned seq; - do { - seq = read_seqcount_begin(&part->nr_sects_seq); - nr_sects = part->nr_sects; - } while (read_seqcount_retry(&part->nr_sects_seq, seq)); - return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - sector_t nr_sects; - - preempt_disable(); - nr_sects = part->nr_sects; - preempt_enable(); - return nr_sects; -#else - return part->nr_sects; -#endif -} - -/* - * Should be called with mutex lock held (typically bd_mutex) of partition - * to provide mutual exlusion among writers otherwise seqcount might be - * left in wrong state leaving the readers spinning infinitely. - */ -static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - write_seqcount_begin(&part->nr_sects_seq); - part->nr_sects = size; - write_seqcount_end(&part->nr_sects_seq); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - preempt_disable(); - part->nr_sects = size; - preempt_enable(); -#else - part->nr_sects = size; -#endif -} - -#if defined(CONFIG_BLK_DEV_INTEGRITY) -extern void blk_integrity_add(struct gendisk *); -extern void blk_integrity_del(struct gendisk *); -#else /* CONFIG_BLK_DEV_INTEGRITY */ -static inline void blk_integrity_add(struct gendisk *disk) { } -static inline void blk_integrity_del(struct gendisk *disk) { } -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } From c6a564ffadc9105880329710164ee493f0de103c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:42 +0100 Subject: [PATCH 64/72] block: move the part_stat* helpers from genhd.h to a new header These macros are just used by a few files. Move them out of genhd.h, which is included everywhere into a new standalone header. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 1 + drivers/block/drbd/drbd_receiver.c | 1 + drivers/block/drbd/drbd_worker.c | 1 + drivers/block/zram/zram_drv.c | 1 + drivers/md/dm.c | 1 + drivers/md/md.c | 1 + drivers/nvme/target/admin-cmd.c | 1 + fs/ext4/super.c | 2 +- fs/ext4/sysfs.c | 1 + fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + include/linux/genhd.h | 108 --------------------------- include/linux/part_stat.h | 115 +++++++++++++++++++++++++++++ 13 files changed, 126 insertions(+), 109 deletions(-) create mode 100644 include/linux/part_stat.h diff --git a/block/blk.h b/block/blk.h index ac20f972842e..d9673164a145 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include #include +#include #include #include "blk-mq.h" #include "blk-mq-sched.h" diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 79e216446030..c15e7083b13a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index b7f605c6e231..0dc019da1f8d 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "drbd_int.h" #include "drbd_protocol.h" diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1bdb5793842b..76e75097e9ab 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "zram_drv.h" diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0413018c8305..0d881cfa160b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -25,6 +25,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "core" diff --git a/drivers/md/md.c b/drivers/md/md.c index 1c7193715f07..f6cf3b53f6c1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include "md.h" diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 72a7e41f3018..cca759c918a4 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include #include diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4cbd1cc34dfa..c8dff4c68141 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -43,7 +43,7 @@ #include #include #include - +#include #include #include diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d218ebdafa4a..04bfaf63752c 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5355be6b6755..088c3e7a1080 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 65a7a432dfee..d398b2d90c6c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 14354a6e89c2..927eed0be179 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -298,114 +298,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter, extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); -/* - * Macros to operate on percpu disk statistics: - * - * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters - * and should be called between disk_stat_lock() and - * disk_stat_unlock(). - * - * part_stat_read() can be called at any time. - * - * part_stat_{add|set_all}() and {init|free}_part_stats are for - * internal use only. - */ -#ifdef CONFIG_SMP -#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) -#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) - -#define part_stat_get_cpu(part, field, cpu) \ - (per_cpu_ptr((part)->dkstats, (cpu))->field) - -#define part_stat_get(part, field) \ - part_stat_get_cpu(part, field, smp_processor_id()) - -#define part_stat_read(part, field) \ -({ \ - typeof((part)->dkstats->field) res = 0; \ - unsigned int _cpu; \ - for_each_possible_cpu(_cpu) \ - res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ - res; \ -}) - -static inline void part_stat_set_all(struct hd_struct *part, int value) -{ - int i; - - for_each_possible_cpu(i) - memset(per_cpu_ptr(part->dkstats, i), value, - sizeof(struct disk_stats)); -} - -static inline int init_part_stats(struct hd_struct *part) -{ - part->dkstats = alloc_percpu(struct disk_stats); - if (!part->dkstats) - return 0; - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ - free_percpu(part->dkstats); -} - -#else /* !CONFIG_SMP */ -#define part_stat_lock() ({ rcu_read_lock(); 0; }) -#define part_stat_unlock() rcu_read_unlock() - -#define part_stat_get(part, field) ((part)->dkstats.field) -#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field) -#define part_stat_read(part, field) part_stat_get(part, field) - -static inline void part_stat_set_all(struct hd_struct *part, int value) -{ - memset(&part->dkstats, value, sizeof(struct disk_stats)); -} - -static inline int init_part_stats(struct hd_struct *part) -{ - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ -} - -#endif /* CONFIG_SMP */ - -#define part_stat_read_accum(part, field) \ - (part_stat_read(part, field[STAT_READ]) + \ - part_stat_read(part, field[STAT_WRITE]) + \ - part_stat_read(part, field[STAT_DISCARD])) - -#define __part_stat_add(part, field, addnd) \ - (part_stat_get(part, field) += (addnd)) - -#define part_stat_add(part, field, addnd) do { \ - __part_stat_add((part), field, addnd); \ - if ((part)->partno) \ - __part_stat_add(&part_to_disk((part))->part0, \ - field, addnd); \ -} while (0) - -#define part_stat_dec(gendiskp, field) \ - part_stat_add(gendiskp, field, -1) -#define part_stat_inc(gendiskp, field) \ - part_stat_add(gendiskp, field, 1) -#define part_stat_sub(gendiskp, field, subnd) \ - part_stat_add(gendiskp, field, -subnd) - -#define part_stat_local_dec(gendiskp, field) \ - local_dec(&(part_stat_get(gendiskp, field))) -#define part_stat_local_inc(gendiskp, field) \ - local_inc(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read(gendiskp, field) \ - local_read(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read_cpu(gendiskp, field, cpu) \ - local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) - /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h new file mode 100644 index 000000000000..ece607607a86 --- /dev/null +++ b/include/linux/part_stat.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PART_STAT_H +#define _LINUX_PART_STAT_H + +#include + +/* + * Macros to operate on percpu disk statistics: + * + * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters + * and should be called between disk_stat_lock() and + * disk_stat_unlock(). + * + * part_stat_read() can be called at any time. + * + * part_stat_{add|set_all}() and {init|free}_part_stats are for + * internal use only. + */ +#ifdef CONFIG_SMP +#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) +#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) + +#define part_stat_get_cpu(part, field, cpu) \ + (per_cpu_ptr((part)->dkstats, (cpu))->field) + +#define part_stat_get(part, field) \ + part_stat_get_cpu(part, field, smp_processor_id()) + +#define part_stat_read(part, field) \ +({ \ + typeof((part)->dkstats->field) res = 0; \ + unsigned int _cpu; \ + for_each_possible_cpu(_cpu) \ + res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ + res; \ +}) + +static inline void part_stat_set_all(struct hd_struct *part, int value) +{ + int i; + + for_each_possible_cpu(i) + memset(per_cpu_ptr(part->dkstats, i), value, + sizeof(struct disk_stats)); +} + +static inline int init_part_stats(struct hd_struct *part) +{ + part->dkstats = alloc_percpu(struct disk_stats); + if (!part->dkstats) + return 0; + return 1; +} + +static inline void free_part_stats(struct hd_struct *part) +{ + free_percpu(part->dkstats); +} + +#else /* !CONFIG_SMP */ +#define part_stat_lock() ({ rcu_read_lock(); 0; }) +#define part_stat_unlock() rcu_read_unlock() + +#define part_stat_get(part, field) ((part)->dkstats.field) +#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field) +#define part_stat_read(part, field) part_stat_get(part, field) + +static inline void part_stat_set_all(struct hd_struct *part, int value) +{ + memset(&part->dkstats, value, sizeof(struct disk_stats)); +} + +static inline int init_part_stats(struct hd_struct *part) +{ + return 1; +} + +static inline void free_part_stats(struct hd_struct *part) +{ +} + +#endif /* CONFIG_SMP */ + +#define part_stat_read_accum(part, field) \ + (part_stat_read(part, field[STAT_READ]) + \ + part_stat_read(part, field[STAT_WRITE]) + \ + part_stat_read(part, field[STAT_DISCARD])) + +#define __part_stat_add(part, field, addnd) \ + (part_stat_get(part, field) += (addnd)) + +#define part_stat_add(part, field, addnd) do { \ + __part_stat_add((part), field, addnd); \ + if ((part)->partno) \ + __part_stat_add(&part_to_disk((part))->part0, \ + field, addnd); \ +} while (0) + +#define part_stat_dec(gendiskp, field) \ + part_stat_add(gendiskp, field, -1) +#define part_stat_inc(gendiskp, field) \ + part_stat_add(gendiskp, field, 1) +#define part_stat_sub(gendiskp, field, subnd) \ + part_stat_add(gendiskp, field, -subnd) + +#define part_stat_local_dec(gendiskp, field) \ + local_dec(&(part_stat_get(gendiskp, field))) +#define part_stat_local_inc(gendiskp, field) \ + local_inc(&(part_stat_get(gendiskp, field))) +#define part_stat_local_read(gendiskp, field) \ + local_read(&(part_stat_get(gendiskp, field))) +#define part_stat_local_read_cpu(gendiskp, field, cpu) \ + local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) + +#endif /* _LINUX_PART_STAT_H */ From 348e114bbd4dce430eae70f01a04c8fc259b4cf1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:07:17 +0100 Subject: [PATCH 65/72] block: move the ->devnode callback to struct block_device_operations There really isn't any good reason to stash a method directly into struct gendisk. Move it together with the other block device operations. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- drivers/block/pktcdvd.c | 12 ++++++------ include/linux/blkdev.h | 1 + include/linux/genhd.h | 1 - 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 6323cc789efa..14cf395a1479 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1497,8 +1497,8 @@ static char *block_devnode(struct device *dev, umode_t *mode, { struct gendisk *disk = dev_to_disk(dev); - if (disk->devnode) - return disk->devnode(disk, mode); + if (disk->fops->devnode) + return disk->fops->devnode(disk, mode); return NULL; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 5f970a7d32c0..0d286a87e647 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2679,6 +2679,11 @@ static unsigned int pkt_check_events(struct gendisk *disk, return attached_disk->fops->check_events(attached_disk, clearing); } +static char *pkt_devnode(struct gendisk *disk, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name); +} + static const struct block_device_operations pktcdvd_ops = { .owner = THIS_MODULE, .open = pkt_open, @@ -2686,13 +2691,9 @@ static const struct block_device_operations pktcdvd_ops = { .ioctl = pkt_ioctl, .compat_ioctl = blkdev_compat_ptr_ioctl, .check_events = pkt_check_events, + .devnode = pkt_devnode, }; -static char *pktcdvd_devnode(struct gendisk *gd, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name); -} - /* * Set up mapping from pktcdvd device to CD-ROM device. */ @@ -2748,7 +2749,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->fops = &pktcdvd_ops; disk->flags = GENHD_FL_REMOVABLE; strcpy(disk->disk_name, pd->name); - disk->devnode = pktcdvd_devnode; disk->private_data = pd; disk->queue = blk_alloc_queue(GFP_KERNEL); if (!disk->queue) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 53a1325efbc3..e8defd718d62 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1697,6 +1697,7 @@ struct block_device_operations { void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); + char *(*devnode)(struct gendisk *disk, umode_t *mode); struct module *owner; const struct pr_ops *pr_ops; }; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 927eed0be179..85b9e253cd39 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -191,7 +191,6 @@ struct gendisk { * disks that can't be partitioned. */ char disk_name[DISK_NAME_LEN]; /* name of major driver */ - char *(*devnode)(struct gendisk *gd, umode_t *mode); unsigned short events; /* supported events */ unsigned short event_flags; /* flags related to event processing */ From 2f227bb99934a4faa6dfe2cda2594bce8897a323 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:08 +0100 Subject: [PATCH 66/72] block: add a blk_mq_init_queue_data helper This allows a driver to pass a queuedata member before ->init_hctx is called. null_blk currently open codes this logic, but I'd rather have it in the core to ease future maintainance. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 +++++++++- include/linux/blk-mq.h | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 745ec592a513..216bf62e88b6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2724,13 +2724,15 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + void *queuedata) { struct request_queue *uninit_q, *q; uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); + uninit_q->queuedata = queuedata; /* * Initialize the queue without an elevator. device_add_disk() will do @@ -2742,6 +2744,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) return q; } +EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); + +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + return blk_mq_init_queue_data(set, NULL); +} EXPORT_SYMBOL(blk_mq_init_queue); /* diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 31344d5f83e2..f389d7c724bd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -412,6 +412,8 @@ enum { << BLK_MQ_F_ALLOC_POLICY_START_BIT) struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + void *queuedata); struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q, bool elevator_init); From 8d96a1117c21faad4f88d3d2df8c62712b3f495d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:09 +0100 Subject: [PATCH 67/72] null_blk: use blk_mq_init_queue_data Use the new blk_mq_init_queue_data instead of open coding the queue allocation and initialization. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 89bb16a99007..2f864782550d 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1698,27 +1698,6 @@ static bool null_setup_fault(void) return true; } -/* - * This function is identical to blk_mq_init_queue() except that it sets - * queuedata before .init_hctx is called. - */ -static struct request_queue *nullb_alloc_queue(struct nullb *nullb) -{ - struct request_queue *uninit_q, *q; - struct blk_mq_tag_set *set = nullb->tag_set; - - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); - if (!uninit_q) - return ERR_PTR(-ENOMEM); - - uninit_q->queuedata = nullb; - q = blk_mq_init_allocated_queue(set, uninit_q, false); - if (IS_ERR(q)) - blk_cleanup_queue(uninit_q); - - return q; -} - static int null_add_dev(struct nullb_device *dev) { struct nullb *nullb; @@ -1758,7 +1737,7 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_queues; nullb->tag_set->timeout = 5 * HZ; - nullb->q = nullb_alloc_queue(nullb); + nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); if (IS_ERR(nullb->q)) { rv = -ENOMEM; goto out_cleanup_tags; From ff27668ce8092c74965f21b5c02ebc6b6764db95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:10 +0100 Subject: [PATCH 68/72] bcache: pass the make_request methods to blk_queue_make_request bcache is the only driver not actually passing its make_request methods to blk_queue_make_request, but instead just sets them up manually a little later. Make bcache follow the common way of setting up make_request based queues. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 7 ++----- drivers/md/bcache/request.h | 3 +++ drivers/md/bcache/super.c | 10 ++++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 820d8402a1dc..71a90fbec314 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1161,8 +1161,7 @@ static void quit_max_writeback_rate(struct cache_set *c, /* Cached devices - read & write stuff */ -static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct bio *bio) +blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; struct bcache_device *d = bio->bi_disk->private_data; @@ -1266,7 +1265,6 @@ void bch_cached_dev_request_init(struct cached_dev *dc) { struct gendisk *g = dc->disk.disk; - g->queue->make_request_fn = cached_dev_make_request; g->queue->backing_dev_info->congested_fn = cached_dev_congested; dc->disk.cache_miss = cached_dev_cache_miss; dc->disk.ioctl = cached_dev_ioctl; @@ -1301,8 +1299,7 @@ static void flash_dev_nodata(struct closure *cl) continue_at(cl, search_free, NULL); } -static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bio *bio) +blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; struct closure *cl; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index c64dbd7a91aa..bb005c93dd72 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -37,7 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); +blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio); + void bch_flash_dev_request_init(struct bcache_device *d); +blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio); extern struct kmem_cache *bch_search_cache; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0c3c5419c52b..5e38a167c85e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -816,7 +816,7 @@ static void bcache_device_free(struct bcache_device *d) } static int bcache_device_init(struct bcache_device *d, unsigned int block_size, - sector_t sectors) + sector_t sectors, make_request_fn make_request_fn) { struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, @@ -870,7 +870,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, if (!q) return -ENOMEM; - blk_queue_make_request(q, NULL); + blk_queue_make_request(q, make_request_fn); d->disk->queue = q; q->queuedata = d; q->backing_dev_info->congested_data = d; @@ -1339,7 +1339,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) q->limits.raid_partial_stripes_expensive; ret = bcache_device_init(&dc->disk, block_size, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + dc->bdev->bd_part->nr_sects - dc->sb.data_offset, + cached_dev_make_request); if (ret) return ret; @@ -1451,7 +1452,8 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); - if (bcache_device_init(d, block_bytes(c), u->sectors)) + if (bcache_device_init(d, block_bytes(c), u->sectors, + flash_dev_make_request)) goto err; bcache_device_attach(d, c, u - c->uuids); From 3d745ea5b095a3985129e162900b7e6c22518a9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:11 +0100 Subject: [PATCH 69/72] block: simplify queue allocation Current make_request based drivers use either blk_alloc_queue_node or blk_alloc_queue to allocate a queue, and then set up the make_request_fn function pointer and a few parameters using the blk_queue_make_request helper. Simplify this by passing the make_request pointer to blk_alloc_queue, and while at it merge the _node variant into the main helper by always passing a node_id, and remove the superfluous gfp_mask parameter. A lower-level __blk_alloc_queue is kept for the blk-mq case. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 3 +-- arch/xtensa/platforms/iss/simdisk.c | 3 +-- block/blk-cgroup.c | 2 +- block/blk-core.c | 39 +++++++++++++++++------------ block/blk-mq.c | 8 ++---- block/blk-settings.c | 36 -------------------------- block/blk.h | 2 ++ drivers/block/brd.c | 4 +-- drivers/block/drbd/drbd_main.c | 3 +-- drivers/block/null_blk_main.c | 3 +-- drivers/block/pktcdvd.c | 3 +-- drivers/block/ps3vram.c | 3 +-- drivers/block/rsxx/dev.c | 3 +-- drivers/block/umem.c | 4 +-- drivers/block/zram/zram_drv.c | 4 +-- drivers/lightnvm/core.c | 3 +-- drivers/md/bcache/super.c | 3 +-- drivers/md/dm.c | 9 +++---- drivers/md/md.c | 3 +-- drivers/nvdimm/blk.c | 3 +-- drivers/nvdimm/btt.c | 3 +-- drivers/nvdimm/pmem.c | 3 +-- drivers/nvme/host/multipath.c | 3 +-- drivers/s390/block/dcssblk.c | 4 +-- drivers/s390/block/xpram.c | 4 +-- include/linux/blkdev.h | 4 +-- 26 files changed, 54 insertions(+), 108 deletions(-) diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 40712e49381b..c3a630440512 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -118,12 +118,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->queue = blk_alloc_queue(GFP_KERNEL); + dev->queue = blk_alloc_queue(nfhd_make_request, NUMA_NO_NODE); if (dev->queue == NULL) goto free_dev; dev->queue->queuedata = dev; - blk_queue_make_request(dev->queue, nfhd_make_request); blk_queue_logical_block_size(dev->queue, bsize); dev->disk = alloc_disk(16); diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 833109880165..49322b66cda9 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -267,13 +267,12 @@ static int __init simdisk_setup(struct simdisk *dev, int which, spin_lock_init(&dev->lock); dev->users = 0; - dev->queue = blk_alloc_queue(GFP_KERNEL); + dev->queue = blk_alloc_queue(simdisk_make_request, NUMA_NO_NODE); if (dev->queue == NULL) { pr_err("blk_alloc_queue failed\n"); goto out_alloc_queue; } - blk_queue_make_request(dev->queue, simdisk_make_request); dev->queue->queuedata = dev; dev->gd = alloc_disk(SIMDISK_MINORS); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a229b94d5390..c15a26096038 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1010,7 +1010,7 @@ unlock: * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize * - * Called from blk_alloc_queue_node(). Responsible for initializing blkcg + * Called from __blk_alloc_queue(). Responsible for initializing blkcg * part of new request_queue @q. * * RETURNS: diff --git a/block/blk-core.c b/block/blk-core.c index eaf6cb3887e6..18b8c09d093e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -388,12 +388,6 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -struct request_queue *blk_alloc_queue(gfp_t gfp_mask) -{ - return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); -} -EXPORT_SYMBOL(blk_alloc_queue); - /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -470,24 +464,19 @@ static void blk_timeout_work(struct work_struct *work) { } -/** - * blk_alloc_queue_node - allocate a request queue - * @gfp_mask: memory allocation flags - * @node_id: NUMA node to allocate memory from - */ -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) +struct request_queue *__blk_alloc_queue(int node_id) { struct request_queue *q; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, - gfp_mask | __GFP_ZERO, node_id); + GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; q->last_merge = NULL; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) goto fail_q; @@ -495,7 +484,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); + q->backing_dev_info = bdi_alloc_node(GFP_KERNEL, node_id); if (!q->backing_dev_info) goto fail_split; @@ -541,6 +530,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (blkcg_init_queue(q)) goto fail_ref; + blk_queue_dma_alignment(q, 511); + blk_set_default_limits(&q->limits); + return q; fail_ref: @@ -557,7 +549,22 @@ fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } -EXPORT_SYMBOL(blk_alloc_queue_node); + +struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) +{ + struct request_queue *q; + + if (WARN_ON_ONCE(!make_request)) + return -EINVAL; + + q = __blk_alloc_queue(node_id); + if (!q) + return NULL; + q->make_request_fn = make_request; + q->nr_requests = BLKDEV_MAX_RQ; + return q; +} +EXPORT_SYMBOL(blk_alloc_queue); bool blk_get_queue(struct request_queue *q) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 216bf62e88b6..f6291ceedee4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2729,7 +2729,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); + uninit_q = __blk_alloc_queue(set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); uninit_q->queuedata = queuedata; @@ -2939,11 +2939,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - blk_queue_make_request(q, blk_mq_make_request); - - /* - * Do this after blk_queue_make_request() overrides it... - */ + q->make_request_fn = blk_mq_make_request; q->nr_requests = set->queue_depth; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index c8eda2e7b91e..126d216a2db6 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -86,42 +86,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_stacking_limits); -/** - * blk_queue_make_request - define an alternate make_request function for a device - * @q: the request queue for the device to be affected - * @mfn: the alternate make_request function - * - * Description: - * The normal way for &struct bios to be passed to a device - * driver is for them to be collected into requests on a request - * queue, and then to allow the device driver to select requests - * off that queue when it is ready. This works well for many block - * devices. However some block devices (typically virtual devices - * such as md or lvm) do not benefit from the processing on the - * request queue, and are served best by having the requests passed - * directly to them. This can be achieved by providing a function - * to blk_queue_make_request(). - * - * Caveat: - * The driver that does this *must* be able to deal appropriately - * with buffers in "highmemory". This can be accomplished by either calling - * kmap_atomic() to get a temporary kernel mapping, or by calling - * blk_queue_bounce() to create a buffer in normal memory. - **/ -void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) -{ - /* - * set defaults - */ - q->nr_requests = BLKDEV_MAX_RQ; - - q->make_request_fn = mfn; - blk_queue_dma_alignment(q, 511); - - blk_set_default_limits(&q->limits); -} -EXPORT_SYMBOL(blk_queue_make_request); - /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device diff --git a/block/blk.h b/block/blk.h index d9673164a145..491e52fc0aa6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -482,4 +482,6 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #endif } +struct request_queue *__blk_alloc_queue(int node_id); + #endif /* BLK_INTERNAL_H */ diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 220c5e18aba0..2fb25c348d53 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -381,12 +381,10 @@ static struct brd_device *brd_alloc(int i) spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); - brd->brd_queue = blk_alloc_queue(GFP_KERNEL); + brd->brd_queue = blk_alloc_queue(brd_make_request, NUMA_NO_NODE); if (!brd->brd_queue) goto out_free_dev; - blk_queue_make_request(brd->brd_queue, brd_make_request); - /* This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN * (This is only a problem on very small devices <= 4M, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a18155cdce41..72a7c3ea2ce3 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2801,7 +2801,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + q = blk_alloc_queue(drbd_make_request, NUMA_NO_NODE); if (!q) goto out_no_q; device->rq_queue = q; @@ -2828,7 +2828,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig q->backing_dev_info->congested_fn = drbd_congested; q->backing_dev_info->congested_data = device; - blk_queue_make_request(q, drbd_make_request); blk_queue_write_cache(q, true, true); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 2f864782550d..5f13793d35ee 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1743,12 +1743,11 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_tags; } } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node); + nullb->q = blk_alloc_queue(null_queue_bio, dev->home_node); if (!nullb->q) { rv = -ENOMEM; goto out_cleanup_queues; } - blk_queue_make_request(nullb->q, null_queue_bio); rv = init_driver_queues(nullb); if (rv) goto out_cleanup_blk_queue; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 0d286a87e647..0b944ac96d6b 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2493,7 +2493,6 @@ static void pkt_init_queue(struct pktcdvd_device *pd) { struct request_queue *q = pd->disk->queue; - blk_queue_make_request(q, pkt_make_request); blk_queue_logical_block_size(q, CD_FRAMESIZE); blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); q->queuedata = pd; @@ -2750,7 +2749,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->flags = GENHD_FL_REMOVABLE; strcpy(disk->disk_name, pd->name); disk->private_data = pd; - disk->queue = blk_alloc_queue(GFP_KERNEL); + disk->queue = blk_alloc_queue(pkt_make_request, NUMA_NO_NODE); if (!disk->queue) goto out_mem2; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 4628e1a27a2b..821d4d8b1d76 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -737,7 +737,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) ps3vram_proc_init(dev); - queue = blk_alloc_queue(GFP_KERNEL); + queue = blk_alloc_queue(ps3vram_make_request, NUMA_NO_NODE); if (!queue) { dev_err(&dev->core, "blk_alloc_queue failed\n"); error = -ENOMEM; @@ -746,7 +746,6 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) priv->queue = queue; queue->queuedata = dev; - blk_queue_make_request(queue, ps3vram_make_request); blk_queue_max_segments(queue, BLK_MAX_SEGMENTS); blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE); blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS); diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index c47d28b2ce44..8ffa8260dcaf 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -248,7 +248,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) return -ENOMEM; } - card->queue = blk_alloc_queue(GFP_KERNEL); + card->queue = blk_alloc_queue(rsxx_make_request, NUMA_NO_NODE); if (!card->queue) { dev_err(CARD_TO_DEV(card), "Failed queue alloc\n"); unregister_blkdev(card->major, DRIVER_NAME); @@ -269,7 +269,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) blk_queue_logical_block_size(card->queue, blk_size); } - blk_queue_make_request(card->queue, rsxx_make_request); blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 4eaf97d7a170..d84e8a878df2 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -885,11 +885,9 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) card->biotail = &card->bio; spin_lock_init(&card->lock); - card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + card->queue = blk_alloc_queue(mm_make_request, NUMA_NO_NODE); if (!card->queue) goto failed_alloc; - - blk_queue_make_request(card->queue, mm_make_request); card->queue->queuedata = card; tasklet_init(&card->tasklet, process_page, (unsigned long)card); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 76e75097e9ab..ebb234f36909 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1895,7 +1895,7 @@ static int zram_add(void) #ifdef CONFIG_ZRAM_WRITEBACK spin_lock_init(&zram->wb_limit_lock); #endif - queue = blk_alloc_queue(GFP_KERNEL); + queue = blk_alloc_queue(zram_make_request, NUMA_NO_NODE); if (!queue) { pr_err("Error allocating disk queue for device %d\n", device_id); @@ -1903,8 +1903,6 @@ static int zram_add(void) goto out_free_idr; } - blk_queue_make_request(queue, zram_make_request); - /* gendisk structure */ zram->disk = alloc_disk(1); if (!zram->disk) { diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 7543e395a2c6..db38a68abb6c 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -380,12 +380,11 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) goto err_dev; } - tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); + tqueue = blk_alloc_queue(tt->make_rq, dev->q->node); if (!tqueue) { ret = -ENOMEM; goto err_disk; } - blk_queue_make_request(tqueue, tt->make_rq); strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); tdisk->flags = GENHD_FL_EXT_DEVT; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5e38a167c85e..d98354fa28e3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -866,11 +866,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->fops = &bcache_ops; d->disk->private_data = d; - q = blk_alloc_queue(GFP_KERNEL); + q = blk_alloc_queue(make_request_fn, NUMA_NO_NODE); if (!q) return -ENOMEM; - blk_queue_make_request(q, make_request_fn); d->disk->queue = q; q->queuedata = d; q->backing_dev_info->congested_data = d; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0d881cfa160b..753302e83910 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1939,16 +1939,15 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); - if (!md->queue) - goto bad; - md->queue->queuedata = md; /* * default to bio-based required ->make_request_fn until DM * table is loaded and md->type established. If request-based * table is loaded: blk-mq will override accordingly. */ - blk_queue_make_request(md->queue, dm_make_request); + md->queue = blk_alloc_queue(dm_make_request, numa_node_id); + if (!md->queue) + goto bad; + md->queue->queuedata = md; md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) diff --git a/drivers/md/md.c b/drivers/md/md.c index f6cf3b53f6c1..cd1210a0d957 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5623,12 +5623,11 @@ static int md_alloc(dev_t dev, char *name) mddev->hold_active = UNTIL_STOP; error = -ENOMEM; - mddev->queue = blk_alloc_queue(GFP_KERNEL); + mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE); if (!mddev->queue) goto abort; mddev->queue->queuedata = mddev; - blk_queue_make_request(mddev->queue, md_make_request); blk_set_stacking_limits(&mddev->queue->limits); disk = alloc_disk(1 << shift); diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 677d6f45b5c4..43751fab9d36 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -249,13 +249,12 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk)); available_disk_size = internal_nlba * nsblk_sector_size(nsblk); - q = blk_alloc_queue(GFP_KERNEL); + q = blk_alloc_queue(nd_blk_make_request, NUMA_NO_NODE); if (!q) return -ENOMEM; if (devm_add_action_or_reset(dev, nd_blk_release_queue, q)) return -ENOMEM; - blk_queue_make_request(q, nd_blk_make_request); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_logical_block_size(q, nsblk_sector_size(nsblk)); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 0d04ea3d9fd7..3b09419218d6 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1521,7 +1521,7 @@ static int btt_blk_init(struct btt *btt) struct nd_namespace_common *ndns = nd_btt->ndns; /* create a new disk and request queue for btt */ - btt->btt_queue = blk_alloc_queue(GFP_KERNEL); + btt->btt_queue = blk_alloc_queue(btt_make_request, NUMA_NO_NODE); if (!btt->btt_queue) return -ENOMEM; @@ -1540,7 +1540,6 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; - blk_queue_make_request(btt->btt_queue, btt_make_request); blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_queue); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4eae441f86c9..4ffc6f7ca131 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -395,7 +395,7 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); + q = blk_alloc_queue(pmem_make_request, dev_to_node(dev)); if (!q) return -ENOMEM; @@ -433,7 +433,6 @@ static int pmem_attach_disk(struct device *dev, pmem->virt_addr = addr; blk_queue_write_cache(q, true, fua); - blk_queue_make_request(q, pmem_make_request); blk_queue_physical_block_size(q, PAGE_SIZE); blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_max_hw_sectors(q, UINT_MAX); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a11900cf3a36..a38d7f196aba 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -377,11 +377,10 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) return 0; - q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node); + q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node); if (!q) goto out; q->queuedata = head; - blk_queue_make_request(q, nvme_ns_head_make_request); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(q, 512); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 63502ca537eb..80d22290f268 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -636,10 +636,10 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char } dev_info->gd->major = dcssblk_major; dev_info->gd->fops = &dcssblk_devops; - dev_info->dcssblk_queue = blk_alloc_queue(GFP_KERNEL); + dev_info->dcssblk_queue = + blk_alloc_queue(dcssblk_make_request, NUMA_NO_NODE); dev_info->gd->queue = dev_info->dcssblk_queue; dev_info->gd->private_data = dev_info; - blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request); blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->dcssblk_queue); diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 3df5d68d09f0..45a04daec89e 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -343,14 +343,14 @@ static int __init xpram_setup_blkdev(void) xpram_disks[i] = alloc_disk(1); if (!xpram_disks[i]) goto out; - xpram_queues[i] = blk_alloc_queue(GFP_KERNEL); + xpram_queues[i] = blk_alloc_queue(xpram_make_request, + NUMA_NO_NODE); if (!xpram_queues[i]) { put_disk(xpram_disks[i]); goto out; } blk_queue_flag_set(QUEUE_FLAG_NONROT, xpram_queues[i]); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, xpram_queues[i]); - blk_queue_make_request(xpram_queues[i], xpram_make_request); blk_queue_logical_block_size(xpram_queues[i], 4096); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e8defd718d62..3f27ff08483e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1063,7 +1063,6 @@ extern void blk_abort_request(struct request *); * Access functions for manipulating queue properties */ extern void blk_cleanup_queue(struct request_queue *); -extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); @@ -1140,8 +1139,7 @@ extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); bool __must_check blk_get_queue(struct request_queue *); -struct request_queue *blk_alloc_queue(gfp_t); -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id); +struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); From f01b411f41f91fc3196eae4317cf8b4d872830a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:12 +0100 Subject: [PATCH 70/72] Revert "blkdev: check for valid request queue before issuing flush" This reverts commit f10d9f617a65905c556c3b37c9b9646ae7d04ed7. We can't have queues without a make_request_fn any more (and the loop device uses blk-mq these days anyway..). Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 843d25683691..c7f396e3d5e2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -454,15 +454,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, if (!q) return -ENXIO; - /* - * some block devices may not have their queue correctly set up here - * (e.g. loop device without a backing file) and so issuing a flush - * here will panic. Ensure there is a request function before issuing - * the flush. - */ - if (!q->make_request_fn) - return -ENXIO; - bio = bio_alloc(gfp_mask, 0); bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; From 130879f1ee0e25b0391b8c78b3baac6fe41f4d38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 18:48:37 +0100 Subject: [PATCH 71/72] block: move bio_map_* to blk-map.c The bio_map_* helpers are just the low-level helpers for the blk_rq_map_* APIs. Move them together for better logical grouping, as no there isn't much overlap with other code in bio.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 510 +------------------------------------------- block/blk-map.c | 508 +++++++++++++++++++++++++++++++++++++++++++ block/blk.h | 4 + include/linux/bio.h | 14 -- 4 files changed, 513 insertions(+), 523 deletions(-) diff --git a/block/bio.c b/block/bio.c index 11e6aac35092..21cbaa6a1c20 100644 --- a/block/bio.c +++ b/block/bio.c @@ -780,7 +780,7 @@ static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio, * * This should only be used by passthrough bios. */ -static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, +int __bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, bool *same_page) { @@ -1194,90 +1194,6 @@ void bio_list_copy_data(struct bio *dst, struct bio *src) } EXPORT_SYMBOL(bio_list_copy_data); -struct bio_map_data { - int is_our_pages; - struct iov_iter iter; - struct iovec iov[]; -}; - -static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - if (data->nr_segs > UIO_MAXIOV) - return NULL; - - bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); - if (!bmd) - return NULL; - memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); - bmd->iter = *data; - bmd->iter.iov = bmd->iov; - return bmd; -} - -/** - * bio_copy_from_iter - copy all pages from iov_iter to bio - * @bio: The &struct bio which describes the I/O as destination - * @iter: iov_iter as source - * - * Copy all pages from iov_iter to bio. - * Returns 0 on success, or error on failure. - */ -static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) -{ - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - ssize_t ret; - - ret = copy_page_from_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - iter); - - if (!iov_iter_count(iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - -/** - * bio_copy_to_iter - copy all pages from bio to iov_iter - * @bio: The &struct bio which describes the I/O as source - * @iter: iov_iter as destination - * - * Copy all pages from bio to iov_iter. - * Returns 0 on success, or error on failure. - */ -static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) -{ - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - ssize_t ret; - - ret = copy_page_to_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - &iter); - - if (!iov_iter_count(&iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; @@ -1288,430 +1204,6 @@ void bio_free_pages(struct bio *bio) } EXPORT_SYMBOL(bio_free_pages); -/** - * bio_uncopy_user - finish previously mapped bio - * @bio: bio being terminated - * - * Free pages allocated from bio_copy_user_iov() and write back data - * to user space in case of a read. - */ -int bio_uncopy_user(struct bio *bio) -{ - struct bio_map_data *bmd = bio->bi_private; - int ret = 0; - - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { - /* - * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free - * and return -EINTR so user space doesn't expect any data. - */ - if (!current->mm) - ret = -EINTR; - else if (bio_data_dir(bio) == READ) - ret = bio_copy_to_iter(bio, bmd->iter); - if (bmd->is_our_pages) - bio_free_pages(bio); - } - kfree(bmd); - bio_put(bio); - return ret; -} - -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - struct page *page; - struct bio *bio; - int i = 0, ret; - int nr_pages; - unsigned int len = iter->count; - unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; - - bmd = bio_alloc_map_data(iter, gfp_mask); - if (!bmd) - return ERR_PTR(-ENOMEM); - - /* - * We need to do a deep copy of the iov_iter including the iovecs. - * The caller provided iov might point to an on-stack or otherwise - * shortlived one. - */ - bmd->is_our_pages = map_data ? 0 : 1; - - nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - if (nr_pages > BIO_MAX_PAGES) - nr_pages = BIO_MAX_PAGES; - - ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - goto out_bmd; - - ret = 0; - - if (map_data) { - nr_pages = 1 << map_data->page_order; - i = map_data->offset / PAGE_SIZE; - } - while (len) { - unsigned int bytes = PAGE_SIZE; - - bytes -= offset; - - if (bytes > len) - bytes = len; - - if (map_data) { - if (i == map_data->nr_entries * nr_pages) { - ret = -ENOMEM; - break; - } - - page = map_data->pages[i / nr_pages]; - page += (i % nr_pages); - - i++; - } else { - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; - } - } - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { - if (!map_data) - __free_page(page); - break; - } - - len -= bytes; - offset = 0; - } - - if (ret) - goto cleanup; - - if (map_data) - map_data->offset += bio->bi_iter.bi_size; - - /* - * success - */ - if ((iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { - ret = bio_copy_from_iter(bio, iter); - if (ret) - goto cleanup; - } else { - if (bmd->is_our_pages) - zero_fill_bio(bio); - iov_iter_advance(iter, bio->bi_iter.bi_size); - } - - bio->bi_private = bmd; - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - return bio; -cleanup: - if (!map_data) - bio_free_pages(bio); - bio_put(bio); -out_bmd: - kfree(bmd); - return ERR_PTR(ret); -} - -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - int j; - struct bio *bio; - int ret; - - if (!iov_iter_count(iter)) - return ERR_PTR(-EINVAL); - - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (iov_iter_count(iter)) { - struct page **pages; - ssize_t bytes; - size_t offs, added = 0; - int npages; - - bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); - if (unlikely(bytes <= 0)) { - ret = bytes ? bytes : -EFAULT; - goto out_unmap; - } - - npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - - if (unlikely(offs & queue_dma_alignment(q))) { - ret = -EINVAL; - j = 0; - } else { - for (j = 0; j < npages; j++) { - struct page *page = pages[j]; - unsigned int n = PAGE_SIZE - offs; - bool same_page = false; - - if (n > bytes) - n = bytes; - - if (!__bio_add_pc_page(q, bio, page, n, offs, - &same_page)) { - if (same_page) - put_page(page); - break; - } - - added += n; - bytes -= n; - offs = 0; - } - iov_iter_advance(iter, added); - } - /* - * release the pages we didn't map into the bio, if any - */ - while (j < npages) - put_page(pages[j++]); - kvfree(pages); - /* couldn't stuff something into bio? */ - if (bytes) - break; - } - - bio_set_flag(bio, BIO_USER_MAPPED); - - /* - * subtle -- if bio_map_user_iov() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - return bio; - - out_unmap: - bio_release_pages(bio, false); - bio_put(bio); - return ERR_PTR(ret); -} - -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from - * process context. - * - * bio_unmap_user() may sleep. - */ -void bio_unmap_user(struct bio *bio) -{ - bio_release_pages(bio, bio_data_dir(bio) == READ); - bio_put(bio); - bio_put(bio); -} - -static void bio_invalidate_vmalloc_pages(struct bio *bio) -{ -#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE - if (bio->bi_private && !op_is_write(bio_op(bio))) { - unsigned long i, len = 0; - - for (i = 0; i < bio->bi_vcnt; i++) - len += bio->bi_io_vec[i].bv_len; - invalidate_kernel_vmap_range(bio->bi_private, len); - } -#endif -} - -static void bio_map_kern_endio(struct bio *bio) -{ - bio_invalidate_vmalloc_pages(bio); - bio_put(bio); -} - -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - bool is_vmalloc = is_vmalloc_addr(data); - struct page *page; - int offset, i; - struct bio *bio; - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - if (is_vmalloc) { - flush_kernel_vmap_range(data, len); - bio->bi_private = data; - } - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (!is_vmalloc) - page = virt_to_page(data); - else - page = vmalloc_to_page(data); - if (bio_add_pc_page(q, bio, page, bytes, - offset) < bytes) { - /* we don't support partial mappings */ - bio_put(bio); - return ERR_PTR(-EINVAL); - } - - data += bytes; - len -= bytes; - offset = 0; - } - - bio->bi_end_io = bio_map_kern_endio; - return bio; -} - -static void bio_copy_kern_endio(struct bio *bio) -{ - bio_free_pages(bio); - bio_put(bio); -} - -static void bio_copy_kern_endio_read(struct bio *bio) -{ - char *p = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); - p += bvec->bv_len; - } - - bio_copy_kern_endio(bio); -} - -/** - * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to copy - * @len: length in bytes - * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ - * - * copy the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask, int reading) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - struct bio *bio; - void *p = data; - int nr_pages = 0; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages = end - start; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (len) { - struct page *page; - unsigned int bytes = PAGE_SIZE; - - if (bytes > len) - bytes = len; - - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) - goto cleanup; - - if (!reading) - memcpy(page_address(page), p, bytes); - - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) - break; - - len -= bytes; - p += bytes; - } - - if (reading) { - bio->bi_end_io = bio_copy_kern_endio_read; - bio->bi_private = data; - } else { - bio->bi_end_io = bio_copy_kern_endio; - } - - return bio; - -cleanup: - bio_free_pages(bio); - bio_put(bio); - return ERR_PTR(-ENOMEM); -} - /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. diff --git a/block/blk-map.c b/block/blk-map.c index b0790268ed9d..b72c361911a4 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -11,6 +11,514 @@ #include "blk.h" +struct bio_map_data { + int is_our_pages; + struct iov_iter iter; + struct iovec iov[]; +}; + +static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + + if (data->nr_segs > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); + if (!bmd) + return NULL; + memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); + bmd->iter = *data; + bmd->iter.iov = bmd->iov; + return bmd; +} + +/** + * bio_copy_from_iter - copy all pages from iov_iter to bio + * @bio: The &struct bio which describes the I/O as destination + * @iter: iov_iter as source + * + * Copy all pages from iov_iter to bio. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_from_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + iter); + + if (!iov_iter_count(iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_copy_to_iter - copy all pages from bio to iov_iter + * @bio: The &struct bio which describes the I/O as source + * @iter: iov_iter as destination + * + * Copy all pages from bio to iov_iter. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_to_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_uncopy_user - finish previously mapped bio + * @bio: bio being terminated + * + * Free pages allocated from bio_copy_user_iov() and write back data + * to user space in case of a read. + */ +static int bio_uncopy_user(struct bio *bio) +{ + struct bio_map_data *bmd = bio->bi_private; + int ret = 0; + + if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free + * and return -EINTR so user space doesn't expect any data. + */ + if (!current->mm) + ret = -EINTR; + else if (bio_data_dir(bio) == READ) + ret = bio_copy_to_iter(bio, bmd->iter); + if (bmd->is_our_pages) + bio_free_pages(bio); + } + kfree(bmd); + bio_put(bio); + return ret; +} + +/** + * bio_copy_user_iov - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. + */ +static struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + struct page *page; + struct bio *bio; + int i = 0, ret; + int nr_pages; + unsigned int len = iter->count; + unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; + + bmd = bio_alloc_map_data(iter, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + + /* + * We need to do a deep copy of the iov_iter including the iovecs. + * The caller provided iov might point to an on-stack or otherwise + * shortlived one. + */ + bmd->is_our_pages = map_data ? 0 : 1; + + nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + if (nr_pages > BIO_MAX_PAGES) + nr_pages = BIO_MAX_PAGES; + + ret = -ENOMEM; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + goto out_bmd; + + ret = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } + while (len) { + unsigned int bytes = PAGE_SIZE; + + bytes -= offset; + + if (bytes > len) + bytes = len; + + if (map_data) { + if (i == map_data->nr_entries * nr_pages) { + ret = -ENOMEM; + break; + } + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { + if (!map_data) + __free_page(page); + break; + } + + len -= bytes; + offset = 0; + } + + if (ret) + goto cleanup; + + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + /* + * success + */ + if ((iov_iter_rw(iter) == WRITE && + (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = bio_copy_from_iter(bio, iter); + if (ret) + goto cleanup; + } else { + if (bmd->is_our_pages) + zero_fill_bio(bio); + iov_iter_advance(iter, bio->bi_iter.bi_size); + } + + bio->bi_private = bmd; + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + return bio; +cleanup: + if (!map_data) + bio_free_pages(bio); + bio_put(bio); +out_bmd: + kfree(bmd); + return ERR_PTR(ret); +} + +/** + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_map_user_iov(struct request_queue *q, + struct iov_iter *iter, gfp_t gfp_mask) +{ + int j; + struct bio *bio; + int ret; + + if (!iov_iter_count(iter)) + return ERR_PTR(-EINVAL); + + bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (iov_iter_count(iter)) { + struct page **pages; + ssize_t bytes; + size_t offs, added = 0; + int npages; + + bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); + if (unlikely(bytes <= 0)) { + ret = bytes ? bytes : -EFAULT; + goto out_unmap; + } + + npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); + + if (unlikely(offs & queue_dma_alignment(q))) { + ret = -EINVAL; + j = 0; + } else { + for (j = 0; j < npages; j++) { + struct page *page = pages[j]; + unsigned int n = PAGE_SIZE - offs; + bool same_page = false; + + if (n > bytes) + n = bytes; + + if (!__bio_add_pc_page(q, bio, page, n, offs, + &same_page)) { + if (same_page) + put_page(page); + break; + } + + added += n; + bytes -= n; + offs = 0; + } + iov_iter_advance(iter, added); + } + /* + * release the pages we didn't map into the bio, if any + */ + while (j < npages) + put_page(pages[j++]); + kvfree(pages); + /* couldn't stuff something into bio? */ + if (bytes) + break; + } + + bio_set_flag(bio, BIO_USER_MAPPED); + + /* + * subtle -- if bio_map_user_iov() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); + return bio; + + out_unmap: + bio_release_pages(bio, false); + bio_put(bio); + return ERR_PTR(ret); +} + +/** + * bio_unmap_user - unmap a bio + * @bio: the bio being unmapped + * + * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from + * process context. + * + * bio_unmap_user() may sleep. + */ +static void bio_unmap_user(struct bio *bio) +{ + bio_release_pages(bio, bio_data_dir(bio) == READ); + bio_put(bio); + bio_put(bio); +} + +static void bio_invalidate_vmalloc_pages(struct bio *bio) +{ +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE + if (bio->bi_private && !op_is_write(bio_op(bio))) { + unsigned long i, len = 0; + + for (i = 0; i < bio->bi_vcnt; i++) + len += bio->bi_io_vec[i].bv_len; + invalidate_kernel_vmap_range(bio->bi_private, len); + } +#endif +} + +static void bio_map_kern_endio(struct bio *bio) +{ + bio_invalidate_vmalloc_pages(bio); + bio_put(bio); +} + +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_map_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + bool is_vmalloc = is_vmalloc_addr(data); + struct page *page; + int offset, i; + struct bio *bio; + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + if (is_vmalloc) { + flush_kernel_vmap_range(data, len); + bio->bi_private = data; + } + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (!is_vmalloc) + page = virt_to_page(data); + else + page = vmalloc_to_page(data); + if (bio_add_pc_page(q, bio, page, bytes, + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_map_kern_endio; + return bio; +} + +static void bio_copy_kern_endio(struct bio *bio) +{ + bio_free_pages(bio); + bio_put(bio); +} + +static void bio_copy_kern_endio_read(struct bio *bio) +{ + char *p = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + p += bvec->bv_len; + } + + bio_copy_kern_endio(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_copy_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask, int reading) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + struct bio *bio; + void *p = data; + int nr_pages = 0; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages = end - start; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; + + if (bytes > len) + bytes = len; + + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) + goto cleanup; + + if (!reading) + memcpy(page_address(page), p, bytes); + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + break; + + len -= bytes; + p += bytes; + } + + if (reading) { + bio->bi_end_io = bio_copy_kern_endio_read; + bio->bi_private = data; + } else { + bio->bi_end_io = bio_copy_kern_endio; + } + + return bio; + +cleanup: + bio_free_pages(bio); + bio_put(bio); + return ERR_PTR(-ENOMEM); +} + /* * Append a bio to a passthrough request. Only works if the bio can be merged * into the request based on the driver constraints. diff --git a/block/blk.h b/block/blk.h index 491e52fc0aa6..0a94ec68af32 100644 --- a/block/blk.h +++ b/block/blk.h @@ -484,4 +484,8 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) struct request_queue *__blk_alloc_queue(int node_id); +int __bio_add_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + bool *same_page); + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index a430e9c1c2d2..c1c0f9ea4e63 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -441,14 +441,6 @@ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); void bio_release_pages(struct bio *bio, bool mark_dirty); -struct rq_map_data; -extern struct bio *bio_map_user_iov(struct request_queue *, - struct iov_iter *, gfp_t); -extern void bio_unmap_user(struct bio *); -extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, - gfp_t); -extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, - gfp_t, int); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); @@ -463,12 +455,6 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_list_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); - -extern struct bio *bio_copy_user_iov(struct request_queue *, - struct rq_map_data *, - struct iov_iter *, - gfp_t); -extern int bio_uncopy_user(struct bio *); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); void bio_truncate(struct bio *bio, unsigned new_size); void guard_bio_eod(struct bio *bio); From 654a3667df364f778b9b5bcdfb32e545aceb6a51 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 29 Mar 2020 10:08:26 -0600 Subject: [PATCH 72/72] block: return NULL in blk_alloc_queue() on error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes follwoing warning: block/blk-core.c: In function ‘blk_alloc_queue’: block/blk-core.c:558:10: warning: returning ‘int’ from a function with return type ‘struct request_queue *’ makes pointer from integer without a cast [-Wint-conversion] return -EINVAL; Fixes: 3d745ea5b095a ("block: simplify queue allocation") Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 18b8c09d093e..7e4a1da0715e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -555,7 +555,7 @@ struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) struct request_queue *q; if (WARN_ON_ONCE(!make_request)) - return -EINVAL; + return NULL; q = __blk_alloc_queue(node_id); if (!q)