blk-mq: move srcu from blk_mq_hw_ctx to request_queue
In case of BLK_MQ_F_BLOCKING, a per-hctx SRCU instance is used to protect
the dispatch critical section. However, this srcu instance sits at the end
of the hctx, so it usually occupies a standalone, often cold, cacheline.

Inside srcu_read_lock() and srcu_read_unlock(), the write always goes to an
indirectly referenced percpu variable that is allocated from the heap
rather than embedded in the srcu structure, and srcu->srcu_idx is only read
in srcu_read_lock(). So it doesn't matter whether the srcu structure lives
in the hctx or in the request queue.

Switch to a per-request-queue srcu for protecting dispatch. This simplifies
quiesce a lot, not to mention that quiesce is always done queue-wide
anyway.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20211203131534.3668411-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 2a904d0085
commit 704b914f15
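Why the trailing srcu[] works at all: both the old per-hctx layout and the new per-queue layout declare the SRCU state as a flexible array member at the very end of the struct, so the base structure carries no SRCU storage and callers that need it simply allocate a few extra bytes. A minimal standalone sketch of the pattern, in userspace C with illustrative names (not the kernel's):

	#include <stdlib.h>

	struct srcu_state { long idx; };           /* stand-in for struct srcu_struct */

	struct queue_like {
		int id;
		/* ... many other members ... */
		struct srcu_state srcu[];          /* no storage unless over-allocated */
	};

	static struct queue_like *alloc_queue_like(int with_srcu)
	{
		size_t sz = sizeof(struct queue_like);

		if (with_srcu)
			sz += sizeof(struct srcu_state);   /* backs srcu[0] */
		return calloc(1, sz);              /* zeroed, like __GFP_ZERO */
	}

This is exactly what the two slab caches in the blk-core.c hunks below encode: "request_queue" is sizeof(struct request_queue), "request_queue_srcu" is that plus sizeof(struct srcu_struct).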
block/blk-core.c

@@ -66,6 +66,7 @@ DEFINE_IDA(blk_queue_ida);
 /*
  * For queue allocation
  */
 struct kmem_cache *blk_requestq_cachep;
+struct kmem_cache *blk_requestq_srcu_cachep;
 
 /*
  * Controlling structure to kblockd
@@ -437,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work)
 {
 }
 
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 {
 	struct request_queue *q;
 	int ret;
 
-	q = kmem_cache_alloc_node(blk_requestq_cachep,
-				GFP_KERNEL | __GFP_ZERO, node_id);
+	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
+			GFP_KERNEL | __GFP_ZERO, node_id);
 	if (!q)
 		return NULL;
 
+	if (alloc_srcu) {
+		blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
+		if (init_srcu_struct(q->srcu) != 0)
+			goto fail_q;
+	}
+
 	q->last_merge = NULL;
 
 	q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
 	if (q->id < 0)
-		goto fail_q;
+		goto fail_srcu;
 
 	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
 	if (ret)
@@ -508,8 +515,11 @@ fail_split:
 	bioset_exit(&q->bio_split);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
+fail_srcu:
+	if (alloc_srcu)
+		cleanup_srcu_struct(q->srcu);
 fail_q:
-	kmem_cache_free(blk_requestq_cachep, q);
+	kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
 	return NULL;
 }
 
@@ -1301,6 +1311,9 @@ int __init blk_dev_init(void)
 			sizeof_field(struct request, cmd_flags));
 	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
 			sizeof_field(struct bio, bi_opf));
+	BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
+			   __alignof__(struct request_queue)) !=
+		     sizeof(struct request_queue));
 
 	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
 	kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1311,6 +1324,10 @@ int __init blk_dev_init(void)
 	blk_requestq_cachep = kmem_cache_create("request_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
+	blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
+			sizeof(struct request_queue) +
+			sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
+
 	blk_debugfs_root = debugfs_create_dir("block", NULL);
 
 	return 0;
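The new BUILD_BUG_ON in blk_dev_init() is what makes the "sizeof + sizeof" cache size above trustworthy: it asserts that the srcu[] flexible member begins exactly where the base struct ends, i.e. the compiler inserted no tail padding that would shift srcu[0]. A hedged userspace rendering of the same compile-time check, using a toy struct and C11 static_assert:

	#include <assert.h>
	#include <stddef.h>

	struct toy {
		long a;
		char b;
		struct { long x; } srcu[];  /* flexible tail, as in request_queue */
	};

	/* ALIGN() open-coded: round offsetof up to the struct's alignment.
	 * If this ever differed from sizeof(struct toy), allocating
	 * "sizeof(struct toy) + tail" would misplace srcu[0]. */
	#define ALIGN_UP(v, a) ((((v) + (a) - 1) / (a)) * (a))
	static_assert(ALIGN_UP(offsetof(struct toy, srcu), _Alignof(struct toy))
		      == sizeof(struct toy), "srcu[] is not flush with struct end");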
block/blk-mq-sysfs.c

@@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
 	struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
 						  kobj);
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		cleanup_srcu_struct(hctx->srcu);
 	blk_free_flush_queue(hctx->fq);
 	sbitmap_free(&hctx->ctx_map);
 	free_cpumask_var(hctx->cpumask);
block/blk-mq.c

@@ -260,17 +260,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
  */
 void blk_mq_wait_quiesce_done(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-	bool rcu = false;
-
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->flags & BLK_MQ_F_BLOCKING)
-			synchronize_srcu(hctx->srcu);
-		else
-			rcu = true;
-	}
-	if (rcu)
+	if (blk_queue_has_srcu(q))
+		synchronize_srcu(q->srcu);
+	else
 		synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
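With a single queue-wide SRCU instance, draining dispatch is one grace period instead of a loop over every hardware queue. The caller-side pattern is unchanged; a sketch of the usual driver sequence around reconfiguration (the quiesce APIs are the real exported ones, the surrounding context is illustrative):

	blk_mq_quiesce_queue_nowait(q);	/* stop new dispatch attempts */
	blk_mq_wait_quiesce_done(q);	/* now a single synchronize_srcu() or
					 * synchronize_rcu(), per the code above */
	/* no ->queue_rq() is running past this point */
	reconfigure_hardware();		/* hypothetical driver work */
	blk_mq_unquiesce_queue(q);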
@@ -3400,20 +3392,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 	}
 }
 
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
-	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
-	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
-			   __alignof__(struct blk_mq_hw_ctx)) !=
-		     sizeof(struct blk_mq_hw_ctx));
-
-	if (tag_set->flags & BLK_MQ_F_BLOCKING)
-		hw_ctx_size += sizeof(struct srcu_struct);
-
-	return hw_ctx_size;
-}
-
 static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -3451,7 +3429,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	struct blk_mq_hw_ctx *hctx;
 	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
 
-	hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
 	if (!hctx)
 		goto fail_alloc_hctx;
 
@@ -3493,8 +3471,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	if (!hctx->fq)
 		goto free_bitmap;
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		init_srcu_struct(hctx->srcu);
 	blk_mq_hctx_kobj_init(hctx);
 
 	return hctx;
@@ -3830,7 +3806,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 	struct request_queue *q;
 	int ret;
 
-	q = blk_alloc_queue(set->numa_node);
+	q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
 	if (!q)
 		return ERR_PTR(-ENOMEM);
 	q->queuedata = queuedata;
@@ -3979,6 +3955,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		struct request_queue *q)
 {
+	WARN_ON_ONCE(blk_queue_has_srcu(q) !=
+			!!(set->flags & BLK_MQ_F_BLOCKING));
+
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
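The WARN_ON_ONCE pins down the invariant that blk_mq_init_queue_data() establishes above: the queue must have been allocated with SRCU if and only if the tag set is BLK_MQ_F_BLOCKING. A sketch of block-layer-internal allocation kept in sync with that check (error handling elided):

	bool blocking = set->flags & BLK_MQ_F_BLOCKING;
	struct request_queue *q;
	int ret;

	q = blk_alloc_queue(set->numa_node, blocking);	/* SRCU iff blocking */
	if (!q)
		return ERR_PTR(-ENOMEM);
	ret = blk_mq_init_allocated_queue(set, q);	/* WARN stays silent */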
block/blk-mq.h

@@ -385,9 +385,9 @@ do {								\
 		int srcu_idx;					\
 								\
 		might_sleep();					\
-		srcu_idx = srcu_read_lock((hctx)->srcu);	\
+		srcu_idx = srcu_read_lock((hctx)->queue->srcu);	\
 		(dispatch_ops);					\
-		srcu_read_unlock((hctx)->srcu, srcu_idx);	\
+		srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \
 	}							\
 } while (0)
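The local srcu_idx matters: srcu_read_lock() returns which of the two per-CPU counter sets it bumped, and that same index must be handed back to srcu_read_unlock(). A paraphrase of the macro's two arms (the non-blocking arm is unchanged by this patch and reconstructed from context, not from the hunk):

	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		rcu_read_lock();
		(dispatch_ops);			/* must not sleep under RCU */
		rcu_read_unlock();
	} else {
		int srcu_idx;

		might_sleep();
		srcu_idx = srcu_read_lock(hctx->queue->srcu);
		(dispatch_ops);			/* may sleep; SRCU allows it */
		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
	}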
block/blk-sysfs.c

@@ -735,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 {
 	struct request_queue *q = container_of(rcu_head, struct request_queue,
 					       rcu_head);
-	kmem_cache_free(blk_requestq_cachep, q);
+
+	kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
 }
 
 /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
block/blk.h

@@ -27,6 +27,7 @@ struct blk_flush_queue {
 };
 
 extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *blk_requestq_srcu_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
@@ -424,7 +425,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
 		unsigned int max_sectors, bool *same_page);
 
-struct request_queue *blk_alloc_queue(int node_id);
+static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
+{
+	if (srcu)
+		return blk_requestq_srcu_cachep;
+	return blk_requestq_cachep;
+}
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
+
 int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
 
 int disk_alloc_events(struct gendisk *disk);
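blk_get_queue_kmem_cache() exists to keep allocation and free symmetric: an object must be returned to the slab cache it came from. At alloc time the bool is the caller's choice; at free time it is recovered from QUEUE_FLAG_HAS_SRCU, which blk_alloc_queue() set. Both call sites from this patch, side by side:

	/* alloc path (blk_alloc_queue): the caller decides */
	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
			GFP_KERNEL | __GFP_ZERO, node_id);

	/* free path (blk_free_queue_rcu): the flag remembers the decision */
	kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);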
block/genhd.c

@@ -1338,7 +1338,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
 	struct request_queue *q;
 	struct gendisk *disk;
 
-	q = blk_alloc_queue(node);
+	q = blk_alloc_queue(node, false);
 	if (!q)
 		return NULL;
 
include/linux/blk-mq.h

@@ -4,7 +4,6 @@
 
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
-#include <linux/srcu.h>
 #include <linux/lockdep.h>
 #include <linux/scatterlist.h>
 #include <linux/prefetch.h>
@@ -375,13 +374,6 @@ struct blk_mq_hw_ctx {
 	 * q->unused_hctx_list.
 	 */
 	struct list_head hctx_list;
-
-	/**
-	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
-	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
-	 * blk_mq_hw_ctx_size().
-	 */
-	struct srcu_struct srcu[];
 };
 
 /**
include/linux/blkdev.h

@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/blkzoned.h>
 #include <linux/sbitmap.h>
+#include <linux/srcu.h>
 
 struct module;
 struct request_queue;
@@ -373,11 +374,18 @@ struct request_queue {
 	 * devices that do not have multiple independent access ranges.
 	 */
 	struct blk_independent_access_ranges *ia_ranges;
+
+	/**
+	 * @srcu: Sleepable RCU. Use as lock when type of the request queue
+	 * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
+	 */
+	struct srcu_struct srcu[];
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
 #define QUEUE_FLAG_STOPPED	0	/* queue is stopped */
 #define QUEUE_FLAG_DYING	1	/* queue being torn down */
+#define QUEUE_FLAG_HAS_SRCU	2	/* SRCU is allocated */
 #define QUEUE_FLAG_NOMERGES	3	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	4	/* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO	5	/* fake timeout */
@@ -415,6 +423,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
+#define blk_queue_has_srcu(q)	test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
 #define blk_queue_init_done(q)	test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
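The "Must be the last member" note on srcu[] is not just convention: C forbids any member after a flexible array member, so the compiler itself enforces the layout that the slab sizing depends on. For example, this toy declaration does not compile:

	struct bad {
		struct srcu_struct srcu[];
		int later;	/* error: flexible array member not at end of struct */
	};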