From 1aecfe4887713838c79bc52f774609a57db4f988 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sun, 1 Jun 2014 00:43:36 +0800
Subject: [PATCH 1/7] blk-mq: move blk_mq_get_ctx/blk_mq_put_ctx to mq private
 header

The blk-mq tag code needs these helpers.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 22 ----------------------
 block/blk-mq.h | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0f5879c42dcd..b9230c522c6b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -33,28 +33,6 @@ static LIST_HEAD(all_q_list);
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
 
-static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
-					   unsigned int cpu)
-{
-	return per_cpu_ptr(q->queue_ctx, cpu);
-}
-
-/*
- * This assumes per-cpu software queueing queues. They could be per-node
- * as well, for instance. For now this is hardcoded as-is. Note that we don't
- * care about preemption, since we know the ctx's are persistent. This does
- * mean that we can't rely on ctx always matching the currently running CPU.
- */
-static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
-{
-	return __blk_mq_get_ctx(q, get_cpu());
-}
-
-static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
-{
-	put_cpu();
-}
-
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
diff --git a/block/blk-mq.h b/block/blk-mq.h
index de7b3bbd5bd6..57a7968e47b3 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -69,4 +69,26 @@ struct blk_align_bitmap {
 	unsigned long depth;
 } ____cacheline_aligned_in_smp;
 
+static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
+						  unsigned int cpu)
+{
+	return per_cpu_ptr(q->queue_ctx, cpu);
+}
+
+/*
+ * This assumes per-cpu software queueing queues. They could be per-node
+ * as well, for instance. For now this is hardcoded as-is. Note that we don't
+ * care about preemption, since we know the ctx's are persistent. This does
+ * mean that we can't rely on ctx always matching the currently running CPU.
+ */
+static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
+{
+	return __blk_mq_get_ctx(q, get_cpu());
+}
+
+static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
+{
+	put_cpu();
+}
+
 #endif
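A note on the helpers the patch above moves: blk_mq_get_ctx() calls get_cpu(), which disables preemption, so the caller must not sleep until the matching blk_mq_put_ctx(). A minimal sketch of the implied calling pattern follows (the function name is made up for illustration; only the two helpers are from the patch). This constraint is exactly what patch 2 below fixes in bt_get().

/*
 * Illustrative-only sketch of the calling pattern implied by the
 * helpers moved above; example_use_ctx() is hypothetical.
 */
static void example_use_ctx(struct request_queue *q)
{
	struct blk_mq_ctx *ctx;

	ctx = blk_mq_get_ctx(q);	/* implicit get_cpu(): preemption now off */

	/*
	 * Work on this CPU's software queue. Anything here must be
	 * atomic -- no io_schedule(), no sleeping allocations.
	 */

	blk_mq_put_ctx(ctx);		/* put_cpu(): preemption back on */
}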
From cb96a42cc1f50ba1c7b1e9b2343bec80b926107f Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sun, 1 Jun 2014 00:43:37 +0800
Subject: [PATCH 2/7] blk-mq: fix schedule from atomic context

blk_mq_put_ctx() has to be called before io_schedule() in bt_get().

This patch fixes the problem by taking a similar approach to the one
used in percpu_ida allocation.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c | 48 ++++++++++++++++++++++++++++++----------------
 block/blk-mq-tag.h |  2 +-
 block/blk-mq.c     | 36 +++++++++++++++++++++-------------
 block/blk-mq.h     | 23 ++++++++++++++++++++++
 4 files changed, 78 insertions(+), 31 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index d90c4aeb7dd3..1aab39f71d95 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -221,8 +221,10 @@ static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
 	return bs;
 }
 
-static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
-		  unsigned int *last_tag, gfp_t gfp)
+static int bt_get(struct blk_mq_alloc_data *data,
+		  struct blk_mq_bitmap_tags *bt,
+		  struct blk_mq_hw_ctx *hctx,
+		  unsigned int *last_tag)
 {
 	struct bt_wait_state *bs;
 	DEFINE_WAIT(wait);
@@ -232,7 +234,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
 	if (tag != -1)
 		return tag;
 
-	if (!(gfp & __GFP_WAIT))
+	if (!(data->gfp & __GFP_WAIT))
 		return -1;
 
 	bs = bt_wait_ptr(bt, hctx);
@@ -249,50 +251,62 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
 		if (was_empty)
 			atomic_set(&bs->wait_cnt, bt->wake_cnt);
 
+		blk_mq_put_ctx(data->ctx);
+
 		io_schedule();
+
+		data->ctx = blk_mq_get_ctx(data->q);
+		data->hctx = data->q->mq_ops->map_queue(data->q,
+				data->ctx->cpu);
+		if (data->reserved) {
+			bt = &data->hctx->tags->breserved_tags;
+		} else {
+			last_tag = &data->ctx->last_tag;
+			hctx = data->hctx;
+			bt = &hctx->tags->bitmap_tags;
+		}
+		finish_wait(&bs->wait, &wait);
+		bs = bt_wait_ptr(bt, hctx);
 	} while (1);
 
 	finish_wait(&bs->wait, &wait);
 	return tag;
 }
 
-static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags,
-				     struct blk_mq_hw_ctx *hctx,
-				     unsigned int *last_tag, gfp_t gfp)
+static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
 	int tag;
 
-	tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp);
+	tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
+			&data->ctx->last_tag);
 	if (tag >= 0)
-		return tag + tags->nr_reserved_tags;
+		return tag + data->hctx->tags->nr_reserved_tags;
 
 	return BLK_MQ_TAG_FAIL;
 }
 
-static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
-					      gfp_t gfp)
+static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
 {
 	int tag, zero = 0;
 
-	if (unlikely(!tags->nr_reserved_tags)) {
+	if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
 		WARN_ON_ONCE(1);
 		return BLK_MQ_TAG_FAIL;
 	}
 
-	tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp);
+	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero);
 	if (tag < 0)
 		return BLK_MQ_TAG_FAIL;
 
 	return tag;
 }
 
-unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
-			    gfp_t gfp, bool reserved)
+unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
-	if (!reserved)
-		return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
+	if (!data->reserved)
+		return __blk_mq_get_tag(data);
 
-	return __blk_mq_get_reserved_tag(hctx->tags, gfp);
+	return __blk_mq_get_reserved_tag(data);
 }
 
 static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index c959de58d2a5..98696a65d4d4 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -48,7 +48,7 @@ struct blk_mq_tags {
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
-extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
+extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
 extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b9230c522c6b..43eb3156e110 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -210,24 +210,23 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 }
 
 static struct request *
-__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
-		       struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved)
+__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
 {
 	struct request *rq;
 	unsigned int tag;
 
-	tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
+	tag = blk_mq_get_tag(data);
 	if (tag != BLK_MQ_TAG_FAIL) {
-		rq = hctx->tags->rqs[tag];
+		rq = data->hctx->tags->rqs[tag];
 
 		rq->cmd_flags = 0;
-		if (blk_mq_tag_busy(hctx)) {
+		if (blk_mq_tag_busy(data->hctx)) {
 			rq->cmd_flags = REQ_MQ_INFLIGHT;
-			atomic_inc(&hctx->nr_active);
+			atomic_inc(&data->hctx->nr_active);
 		}
 
 		rq->tag = tag;
-		blk_mq_rq_ctx_init(q, ctx, rq, rw);
+		blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
 		return rq;
 	}
 
@@ -240,22 +239,27 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq;
+	struct blk_mq_alloc_data alloc_data;
 
 	if (blk_mq_queue_enter(q))
 		return NULL;
 
 	ctx = blk_mq_get_ctx(q);
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
+			reserved, ctx, hctx);
 
-	rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT,
-				    reserved);
+	rq = __blk_mq_alloc_request(&alloc_data, rw);
 	if (!rq && (gfp & __GFP_WAIT)) {
 		__blk_mq_run_hw_queue(hctx);
 		blk_mq_put_ctx(ctx);
 
 		ctx = blk_mq_get_ctx(q);
 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
-		rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved);
+		blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
+				hctx);
+		rq = __blk_mq_alloc_request(&alloc_data, rw);
+		ctx = alloc_data.ctx;
 	}
 	blk_mq_put_ctx(ctx);
 	return rq;
@@ -1136,6 +1140,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	struct blk_mq_ctx *ctx;
 	struct request *rq;
 	int rw = bio_data_dir(bio);
+	struct blk_mq_alloc_data alloc_data;
 
 	if (unlikely(blk_mq_queue_enter(q))) {
 		bio_endio(bio, -EIO);
@@ -1149,7 +1154,9 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 		rw |= REQ_SYNC;
 
 	trace_block_getrq(q, bio, rw);
-	rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false);
+	blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
+			hctx);
+	rq = __blk_mq_alloc_request(&alloc_data, rw);
 	if (unlikely(!rq)) {
 		__blk_mq_run_hw_queue(hctx);
 		blk_mq_put_ctx(ctx);
@@ -1157,8 +1164,11 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 
 		ctx = blk_mq_get_ctx(q);
 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
-		rq = __blk_mq_alloc_request(q, hctx, ctx, rw,
-					    __GFP_WAIT|GFP_ATOMIC, false);
+		blk_mq_set_alloc_data(&alloc_data, q,
+				__GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+		rq = __blk_mq_alloc_request(&alloc_data, rw);
+		ctx = alloc_data.ctx;
+		hctx = alloc_data.hctx;
 	}
 
 	hctx->queued++;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 57a7968e47b3..26460884c6cd 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -91,4 +91,27 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
 	put_cpu();
 }
 
+struct blk_mq_alloc_data {
+	/* input parameter */
+	struct request_queue *q;
+	gfp_t gfp;
+	bool reserved;
+
+	/* input & output parameter */
+	struct blk_mq_ctx *ctx;
+	struct blk_mq_hw_ctx *hctx;
+};
+
+static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
+		struct request_queue *q, gfp_t gfp, bool reserved,
+		struct blk_mq_ctx *ctx,
+		struct blk_mq_hw_ctx *hctx)
+{
+	data->q = q;
+	data->gfp = gfp;
+	data->reserved = reserved;
+	data->ctx = ctx;
+	data->hctx = hctx;
+}
+
 #endif
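The shape of the fix above is worth spelling out: bt_get() now drops the ctx before io_schedule() and re-acquires ctx/hctx on wakeup, publishing the new values through struct blk_mq_alloc_data. A minimal sketch of what a waiting caller must therefore do (the function is hypothetical and elides queue-enter and retry logic; blk_mq_set_alloc_data() and the alloc-data fields are from the patch):

/*
 * Sketch, not kernel code: ctx/hctx are inputs *and* outputs of the
 * allocation, because bt_get() may sleep and wake up on another CPU.
 */
static struct request *example_alloc(struct request_queue *q, int rw)
{
	struct blk_mq_alloc_data data;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
	struct request *rq;

	/* GFP_KERNEL includes __GFP_WAIT here, so the call may sleep */
	blk_mq_set_alloc_data(&data, q, GFP_KERNEL, false, ctx, hctx);
	rq = __blk_mq_alloc_request(&data, rw);

	/* never reuse the stale ctx: it may have changed while we slept */
	blk_mq_put_ctx(data.ctx);
	return rq;
}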
From e6cdb0929fe6726ba5203fc5529b74564d98a9e9 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Tue, 3 Jun 2014 11:24:06 +0800
Subject: [PATCH 3/7] blk-mq: fix sparse warning on missed __percpu annotation

'struct blk_mq_ctx' is __percpu, so add the annotation and fix the
sparse warning reported by Fengguang:

	[block:for-linus 2/3] block/blk-mq.h:75:16: sparse: incorrect
	type in initializer (different address spaces)

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 2 +-
 include/linux/blkdev.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 43eb3156e110..3bb4cfec276b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1767,7 +1767,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
 	struct blk_mq_hw_ctx **hctxs;
-	struct blk_mq_ctx *ctx;
+	struct blk_mq_ctx __percpu *ctx;
 	struct request_queue *q;
 	unsigned int *map;
 	int i;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8aba35f46f87..5c6f836afa1b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -335,7 +335,7 @@ struct request_queue {
 	unsigned int		*mq_map;
 
 	/* sw queues */
-	struct blk_mq_ctx	*queue_ctx;
+	struct blk_mq_ctx __percpu	*queue_ctx;
 	unsigned int		nr_queues;
 
 	/* hw dispatch queues */
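For readers unfamiliar with the annotation fixed above: sparse treats __percpu pointers as living in a separate address space, so every declaration that holds the per-cpu cookie needs the attribute, and per_cpu_ptr() is what converts the cookie into a normal pointer. A self-contained sketch (all example_* names are hypothetical):

#include <linux/percpu.h>

struct example_ctx {
	int count;
};

struct example_queue {
	struct example_ctx __percpu *queue_ctx;	/* from alloc_percpu() */
};

static int example_read(struct example_queue *q, unsigned int cpu)
{
	/*
	 * Without __percpu on the local variable, sparse warns:
	 * "incorrect type in initializer (different address spaces)".
	 */
	struct example_ctx __percpu *ctx = q->queue_ctx;

	return per_cpu_ptr(ctx, cpu)->count;
}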
From ff87bcec197774f938fbd1fe996068005f3dfb3c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 3 Jun 2014 11:59:49 -0600
Subject: [PATCH 4/7] blk-mq: handle NULL req return from blk_map_request in
 single queue mode

blk_mq_map_request() can return NULL if we fail entering the queue
(dying, or removed), in which case it has already ended IO on the
bio. So nothing more to do, except just return.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3bb4cfec276b..96e6eb638f00 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1276,6 +1276,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 
 	rq = blk_mq_map_request(q, bio, &data);
+	if (unlikely(!rq))
+		return;
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
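The rule patch 4 encodes generalizes: if a helper can fail having already completed the bio (here via bio_endio(bio, -EIO) inside blk_mq_map_request()), the caller owns nothing on the failure path and must simply return. A sketch of the resulting shape (names other than bio_endio() are hypothetical):

/*
 * Illustrative only. The helper ends the bio itself on failure, so
 * the caller must not complete, free, or even inspect it again.
 */
static void example_make_request(struct request_queue *q, struct bio *bio)
{
	struct request *rq;

	rq = example_map_request(q, bio);	/* may bio_endio(bio, -EIO) */
	if (unlikely(!rq))
		return;		/* IO already ended; nothing left to do */

	/* ... normal request setup and submission ... */
}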
From f899fed4421d6b098ed6a9d69303c70e590bf2c0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 4 Jun 2014 09:11:53 -0600
Subject: [PATCH 5/7] blk-mq: fix regression from commit 624dbe475416

When the code was collapsed to avoid duplication, the recent fix from
commit 19c5d84f14d2, which ensures that a queue is idled before it is
freed, was dropped. Add back the blk_mq_tag_idle() call, so we don't
leak a reference to an active queue when it is freed.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 96e6eb638f00..4e8e8cf00815 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1552,6 +1552,8 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 		if (i == nr_queue)
 			break;
 
+		blk_mq_tag_idle(hctx);
+
 		if (set->ops->exit_hctx)
 			set->ops->exit_hctx(hctx, i);

From 0e62f51f8753b048f391ee2d7f2af1f7297b0be5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 4 Jun 2014 10:23:49 -0600
Subject: [PATCH 6/7] blk-mq: let blk_mq_tag_to_rq() take blk_mq_tags as the
 main parameter

We currently pass in the hardware queue, and get the tags from there.
But for scsi-mq, with a shared tag space, it's a lot more convenient
to pass in the blk_mq_tags instead, as the hardware queue isn't always
directly available. So instead of having to re-map to a given hardware
queue from rq->mq_ctx, just pass in the tags structure.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c                    | 19 ++++++++++++-------
 drivers/block/mtip32xx/mtip32xx.c |  4 +++-
 include/linux/blk-mq.h            |  2 +-
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e8e8cf00815..4e4cd6208052 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -529,15 +529,20 @@ void blk_mq_kick_requeue_list(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
-struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static inline bool is_flush_request(struct request *rq, unsigned int tag)
 {
-	struct request_queue *q = hctx->queue;
+	return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
+			rq->q->flush_rq->tag == tag);
+}
 
-	if ((q->flush_rq->cmd_flags & REQ_FLUSH_SEQ) &&
-	    q->flush_rq->tag == tag)
-		return q->flush_rq;
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
+{
+	struct request *rq = tags->rqs[tag];
 
-	return hctx->tags->rqs[tag];
+	if (!is_flush_request(rq, tag))
+		return rq;
+
+	return rq->q->flush_rq;
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
@@ -566,7 +571,7 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
 		if (tag >= hctx->tags->nr_tags)
 			break;
 
-		rq = blk_mq_tag_to_rq(hctx, tag++);
+		rq = blk_mq_tag_to_rq(hctx->tags, tag++);
 		if (rq->q != hctx->queue)
 			continue;
 		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index abc858b3528b..74abd49fabdc 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -193,7 +193,9 @@ static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd)
 static struct request *mtip_rq_from_tag(struct driver_data *dd,
 					unsigned int tag)
 {
-	return blk_mq_tag_to_rq(dd->queue->queue_hw_ctx[0], tag);
+	struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0];
+
+	return blk_mq_tag_to_rq(hctx->tags, tag);
 }
 
 static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c15128833100..0feedebfde48 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -155,7 +155,7 @@ void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);
-struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag);
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
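From a driver's point of view, the new blk_mq_tag_to_rq() convention looks like the mtip32xx change above: resolve the tags structure once, then look tags up through it. A hypothetical driver helper under the same assumptions (single hardware queue, tags reachable from it):

/*
 * Sketch of a driver-side lookup after patch 6; the example_* name is
 * made up, blk_mq_tag_to_rq() is the real (new) interface.
 */
static struct request *example_rq_from_tag(struct request_queue *q,
					   unsigned int tag)
{
	struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[0];

	/*
	 * With a shared tag set, hctx->tags could equally come from
	 * the tag set itself -- that is the point of the new API.
	 */
	return blk_mq_tag_to_rq(hctx->tags, tag);
}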
From 14b83e172f0bc83b8dcf78ee8b1844beeffb418d Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 5 Jun 2014 00:23:55 +0800
Subject: [PATCH 7/7] block: mq flush: clear flush_rq's tag in flush_end_io()

blk_mq_tag_to_rq() needs to be able to tell if it should return the
original request, or the flush request if we are doing a flush
sequence. Clear the flush tag when IO completes for a flush, since
that is what we are comparing against.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index ff87c664b7df..8ffee4b5f93d 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -225,7 +225,7 @@ static void flush_end_io(struct request *flush_rq, int error)
 
 	if (q->mq_ops) {
 		spin_lock_irqsave(&q->mq_flush_lock, flags);
-		q->flush_rq->cmd_flags = 0;
+		q->flush_rq->tag = -1;
 	}
 
 	running = &q->flush_queue[q->flush_running_idx];
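To see why clearing the tag is sufficient, trace a lookup through the code from patch 6: is_flush_request() compares q->flush_rq->tag against the tag being resolved, so once flush_end_io() stores -1 there, no valid tag can match and blk_mq_tag_to_rq() returns the original request again. A condensed walk-through, for illustration only (logic mirrors patches 6 and 7, with the two helpers inlined into one hypothetical function):

static struct request *example_tag_to_rq(struct blk_mq_tags *tags,
					 unsigned int tag)
{
	struct request *rq = tags->rqs[tag];

	/*
	 * During a flush sequence the flush request borrows the data
	 * request's tag, so the comparison below matches ...
	 */
	if ((rq->cmd_flags & REQ_FLUSH_SEQ) && rq->q->flush_rq->tag == tag)
		return rq->q->flush_rq;

	/*
	 * ... and after flush_end_io() sets flush_rq->tag = -1, it can
	 * no longer match, so the original request is returned again.
	 */
	return rq;
}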