The kernfs built-in lock of 'kn->count' is held in sysfs .show/.store
path. Meantime, inside block's .show/.store callback, q->sysfs_lock is
required.
However, when mq & iosched kobjects are removed via
blk_mq_unregister_dev() & elv_unregister_queue(), q->sysfs_lock is held
too. This way causes AB-BA lock because the kernfs built-in lock of
'kn-count' is required inside kobject_del() too, see the lockdep warning[1].
On the other hand, it isn't necessary to acquire q->sysfs_lock for
both blk_mq_unregister_dev() & elv_unregister_queue() because
clearing REGISTERED flag prevents storing to 'queue/scheduler'
from being happened. Also sysfs write(store) is exclusive, so no
necessary to hold the lock for elv_unregister_queue() when it is
called in switching elevator path.
So split .sysfs_lock into two: one is still named as .sysfs_lock for
covering sync .store, the other one is named as .sysfs_dir_lock
for covering kobjects and related status change.
sysfs itself can handle the race between add/remove kobjects and
showing/storing attributes under kobjects. For switching scheduler
via storing to 'queue/scheduler', we use the queue flag of
QUEUE_FLAG_REGISTERED with .sysfs_lock for avoiding the race, then
we can avoid to hold .sysfs_lock during removing/adding kobjects.
[1] lockdep warning
======================================================
WARNING: possible circular locking dependency detected
5.3.0-rc3-00044-g73277fc75ea0 #1380 Not tainted
------------------------------------------------------
rmmod/777 is trying to acquire lock:
00000000ac50e981 (kn->count#202){++++}, at: kernfs_remove_by_name_ns+0x59/0x72
but task is already holding lock:
00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&q->sysfs_lock){+.+.}:
__lock_acquire+0x95f/0xa2f
lock_acquire+0x1b4/0x1e8
__mutex_lock+0x14a/0xa9b
blk_mq_hw_sysfs_show+0x63/0xb6
sysfs_kf_seq_show+0x11f/0x196
seq_read+0x2cd/0x5f2
vfs_read+0xc7/0x18c
ksys_read+0xc4/0x13e
do_syscall_64+0xa7/0x295
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (kn->count#202){++++}:
check_prev_add+0x5d2/0xc45
validate_chain+0xed3/0xf94
__lock_acquire+0x95f/0xa2f
lock_acquire+0x1b4/0x1e8
__kernfs_remove+0x237/0x40b
kernfs_remove_by_name_ns+0x59/0x72
remove_files+0x61/0x96
sysfs_remove_group+0x81/0xa4
sysfs_remove_groups+0x3b/0x44
kobject_del+0x44/0x94
blk_mq_unregister_dev+0x83/0xdd
blk_unregister_queue+0xa0/0x10b
del_gendisk+0x259/0x3fa
null_del_dev+0x8b/0x1c3 [null_blk]
null_exit+0x5c/0x95 [null_blk]
__se_sys_delete_module+0x204/0x337
do_syscall_64+0xa7/0x295
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&q->sysfs_lock);
lock(kn->count#202);
lock(&q->sysfs_lock);
lock(kn->count#202);
*** DEADLOCK ***
2 locks held by rmmod/777:
#0: 00000000e69bd9de (&lock){+.+.}, at: null_exit+0x2e/0x95 [null_blk]
#1: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b
stack backtrace:
CPU: 0 PID: 777 Comm: rmmod Not tainted 5.3.0-rc3-00044-g73277fc75ea0 #1380
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180724_192412-buildhw-07.phx4
Call Trace:
dump_stack+0x9a/0xe6
check_noncircular+0x207/0x251
? print_circular_bug+0x32a/0x32a
? find_usage_backwards+0x84/0xb0
check_prev_add+0x5d2/0xc45
validate_chain+0xed3/0xf94
? check_prev_add+0xc45/0xc45
? mark_lock+0x11b/0x804
? check_usage_forwards+0x1ca/0x1ca
__lock_acquire+0x95f/0xa2f
lock_acquire+0x1b4/0x1e8
? kernfs_remove_by_name_ns+0x59/0x72
__kernfs_remove+0x237/0x40b
? kernfs_remove_by_name_ns+0x59/0x72
? kernfs_next_descendant_post+0x7d/0x7d
? strlen+0x10/0x23
? strcmp+0x22/0x44
kernfs_remove_by_name_ns+0x59/0x72
remove_files+0x61/0x96
sysfs_remove_group+0x81/0xa4
sysfs_remove_groups+0x3b/0x44
kobject_del+0x44/0x94
blk_mq_unregister_dev+0x83/0xdd
blk_unregister_queue+0xa0/0x10b
del_gendisk+0x259/0x3fa
? disk_events_poll_msecs_store+0x12b/0x12b
? check_flags+0x1ea/0x204
? mark_held_locks+0x1f/0x7a
null_del_dev+0x8b/0x1c3 [null_blk]
null_exit+0x5c/0x95 [null_blk]
__se_sys_delete_module+0x204/0x337
? free_module+0x39f/0x39f
? blkcg_maybe_throttle_current+0x8a/0x718
? rwlock_bug+0x62/0x62
? __blkcg_punt_bio_submit+0xd0/0xd0
? trace_hardirqs_on_thunk+0x1a/0x20
? mark_held_locks+0x1f/0x7a
? do_syscall_64+0x4c/0x295
do_syscall_64+0xa7/0x295
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7fb696cdbe6b
Code: 73 01 c3 48 8b 0d 1d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 008
RSP: 002b:00007ffec9588788 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559e589137c0 RCX: 00007fb696cdbe6b
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559e58913828
RBP: 0000000000000000 R08: 00007ffec9587701 R09: 0000000000000000
R10: 00007fb696d4eae0 R11: 0000000000000206 R12: 00007ffec95889b0
R13: 00007ffec95896b3 R14: 0000559e58913260 R15: 0000559e589137c0
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
388 lines
8.4 KiB
C
388 lines
8.4 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/kernel.h>
|
|
#include <linux/module.h>
|
|
#include <linux/backing-dev.h>
|
|
#include <linux/bio.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/init.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/blk-mq.h>
|
|
#include "blk.h"
|
|
#include "blk-mq.h"
|
|
#include "blk-mq-tag.h"
|
|
|
|
static void blk_mq_sysfs_release(struct kobject *kobj)
|
|
{
|
|
struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj);
|
|
|
|
free_percpu(ctxs->queue_ctx);
|
|
kfree(ctxs);
|
|
}
|
|
|
|
static void blk_mq_ctx_sysfs_release(struct kobject *kobj)
|
|
{
|
|
struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj);
|
|
|
|
/* ctx->ctxs won't be released until all ctx are freed */
|
|
kobject_put(&ctx->ctxs->kobj);
|
|
}
|
|
|
|
static void blk_mq_hw_sysfs_release(struct kobject *kobj)
|
|
{
|
|
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
|
|
kobj);
|
|
|
|
cancel_delayed_work_sync(&hctx->run_work);
|
|
|
|
if (hctx->flags & BLK_MQ_F_BLOCKING)
|
|
cleanup_srcu_struct(hctx->srcu);
|
|
blk_free_flush_queue(hctx->fq);
|
|
sbitmap_free(&hctx->ctx_map);
|
|
free_cpumask_var(hctx->cpumask);
|
|
kfree(hctx->ctxs);
|
|
kfree(hctx);
|
|
}
|
|
|
|
struct blk_mq_ctx_sysfs_entry {
|
|
struct attribute attr;
|
|
ssize_t (*show)(struct blk_mq_ctx *, char *);
|
|
ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
|
|
};
|
|
|
|
struct blk_mq_hw_ctx_sysfs_entry {
|
|
struct attribute attr;
|
|
ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
|
|
ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
|
|
};
|
|
|
|
static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
|
|
char *page)
|
|
{
|
|
struct blk_mq_ctx_sysfs_entry *entry;
|
|
struct blk_mq_ctx *ctx;
|
|
struct request_queue *q;
|
|
ssize_t res;
|
|
|
|
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
|
|
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
|
|
q = ctx->queue;
|
|
|
|
if (!entry->show)
|
|
return -EIO;
|
|
|
|
res = -ENOENT;
|
|
mutex_lock(&q->sysfs_lock);
|
|
if (!blk_queue_dying(q))
|
|
res = entry->show(ctx, page);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
return res;
|
|
}
|
|
|
|
static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
|
|
const char *page, size_t length)
|
|
{
|
|
struct blk_mq_ctx_sysfs_entry *entry;
|
|
struct blk_mq_ctx *ctx;
|
|
struct request_queue *q;
|
|
ssize_t res;
|
|
|
|
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
|
|
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
|
|
q = ctx->queue;
|
|
|
|
if (!entry->store)
|
|
return -EIO;
|
|
|
|
res = -ENOENT;
|
|
mutex_lock(&q->sysfs_lock);
|
|
if (!blk_queue_dying(q))
|
|
res = entry->store(ctx, page, length);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
return res;
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
|
|
struct attribute *attr, char *page)
|
|
{
|
|
struct blk_mq_hw_ctx_sysfs_entry *entry;
|
|
struct blk_mq_hw_ctx *hctx;
|
|
struct request_queue *q;
|
|
ssize_t res;
|
|
|
|
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
|
|
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
|
|
q = hctx->queue;
|
|
|
|
if (!entry->show)
|
|
return -EIO;
|
|
|
|
res = -ENOENT;
|
|
mutex_lock(&q->sysfs_lock);
|
|
if (!blk_queue_dying(q))
|
|
res = entry->show(hctx, page);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
return res;
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
|
|
struct attribute *attr, const char *page,
|
|
size_t length)
|
|
{
|
|
struct blk_mq_hw_ctx_sysfs_entry *entry;
|
|
struct blk_mq_hw_ctx *hctx;
|
|
struct request_queue *q;
|
|
ssize_t res;
|
|
|
|
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
|
|
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
|
|
q = hctx->queue;
|
|
|
|
if (!entry->store)
|
|
return -EIO;
|
|
|
|
res = -ENOENT;
|
|
mutex_lock(&q->sysfs_lock);
|
|
if (!blk_queue_dying(q))
|
|
res = entry->store(hctx, page, length);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
return res;
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx,
|
|
char *page)
|
|
{
|
|
return sprintf(page, "%u\n", hctx->tags->nr_tags);
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx,
|
|
char *page)
|
|
{
|
|
return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags);
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
|
|
{
|
|
unsigned int i, first = 1;
|
|
ssize_t ret = 0;
|
|
|
|
for_each_cpu(i, hctx->cpumask) {
|
|
if (first)
|
|
ret += sprintf(ret + page, "%u", i);
|
|
else
|
|
ret += sprintf(ret + page, ", %u", i);
|
|
|
|
first = 0;
|
|
}
|
|
|
|
ret += sprintf(ret + page, "\n");
|
|
return ret;
|
|
}
|
|
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
|
|
.attr = {.name = "nr_tags", .mode = 0444 },
|
|
.show = blk_mq_hw_sysfs_nr_tags_show,
|
|
};
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
|
|
.attr = {.name = "nr_reserved_tags", .mode = 0444 },
|
|
.show = blk_mq_hw_sysfs_nr_reserved_tags_show,
|
|
};
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
|
|
.attr = {.name = "cpu_list", .mode = 0444 },
|
|
.show = blk_mq_hw_sysfs_cpus_show,
|
|
};
|
|
|
|
static struct attribute *default_hw_ctx_attrs[] = {
|
|
&blk_mq_hw_sysfs_nr_tags.attr,
|
|
&blk_mq_hw_sysfs_nr_reserved_tags.attr,
|
|
&blk_mq_hw_sysfs_cpus.attr,
|
|
NULL,
|
|
};
|
|
ATTRIBUTE_GROUPS(default_hw_ctx);
|
|
|
|
static const struct sysfs_ops blk_mq_sysfs_ops = {
|
|
.show = blk_mq_sysfs_show,
|
|
.store = blk_mq_sysfs_store,
|
|
};
|
|
|
|
static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
|
|
.show = blk_mq_hw_sysfs_show,
|
|
.store = blk_mq_hw_sysfs_store,
|
|
};
|
|
|
|
static struct kobj_type blk_mq_ktype = {
|
|
.sysfs_ops = &blk_mq_sysfs_ops,
|
|
.release = blk_mq_sysfs_release,
|
|
};
|
|
|
|
static struct kobj_type blk_mq_ctx_ktype = {
|
|
.sysfs_ops = &blk_mq_sysfs_ops,
|
|
.release = blk_mq_ctx_sysfs_release,
|
|
};
|
|
|
|
static struct kobj_type blk_mq_hw_ktype = {
|
|
.sysfs_ops = &blk_mq_hw_sysfs_ops,
|
|
.default_groups = default_hw_ctx_groups,
|
|
.release = blk_mq_hw_sysfs_release,
|
|
};
|
|
|
|
static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
|
|
{
|
|
struct blk_mq_ctx *ctx;
|
|
int i;
|
|
|
|
if (!hctx->nr_ctx)
|
|
return;
|
|
|
|
hctx_for_each_ctx(hctx, ctx, i)
|
|
kobject_del(&ctx->kobj);
|
|
|
|
kobject_del(&hctx->kobj);
|
|
}
|
|
|
|
static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
|
|
{
|
|
struct request_queue *q = hctx->queue;
|
|
struct blk_mq_ctx *ctx;
|
|
int i, ret;
|
|
|
|
if (!hctx->nr_ctx)
|
|
return 0;
|
|
|
|
ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num);
|
|
if (ret)
|
|
return ret;
|
|
|
|
hctx_for_each_ctx(hctx, ctx, i) {
|
|
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
|
|
if (ret)
|
|
break;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
|
|
{
|
|
struct blk_mq_hw_ctx *hctx;
|
|
int i;
|
|
|
|
lockdep_assert_held(&q->sysfs_dir_lock);
|
|
|
|
queue_for_each_hw_ctx(q, hctx, i)
|
|
blk_mq_unregister_hctx(hctx);
|
|
|
|
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
|
|
kobject_del(q->mq_kobj);
|
|
kobject_put(&dev->kobj);
|
|
|
|
q->mq_sysfs_init_done = false;
|
|
}
|
|
|
|
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
|
|
{
|
|
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
|
|
}
|
|
|
|
void blk_mq_sysfs_deinit(struct request_queue *q)
|
|
{
|
|
struct blk_mq_ctx *ctx;
|
|
int cpu;
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
ctx = per_cpu_ptr(q->queue_ctx, cpu);
|
|
kobject_put(&ctx->kobj);
|
|
}
|
|
kobject_put(q->mq_kobj);
|
|
}
|
|
|
|
void blk_mq_sysfs_init(struct request_queue *q)
|
|
{
|
|
struct blk_mq_ctx *ctx;
|
|
int cpu;
|
|
|
|
kobject_init(q->mq_kobj, &blk_mq_ktype);
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
ctx = per_cpu_ptr(q->queue_ctx, cpu);
|
|
|
|
kobject_get(q->mq_kobj);
|
|
kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
|
|
}
|
|
}
|
|
|
|
int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
|
|
{
|
|
struct blk_mq_hw_ctx *hctx;
|
|
int ret, i;
|
|
|
|
WARN_ON_ONCE(!q->kobj.parent);
|
|
lockdep_assert_held(&q->sysfs_dir_lock);
|
|
|
|
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
|
|
if (ret < 0)
|
|
goto out;
|
|
|
|
kobject_uevent(q->mq_kobj, KOBJ_ADD);
|
|
|
|
queue_for_each_hw_ctx(q, hctx, i) {
|
|
ret = blk_mq_register_hctx(hctx);
|
|
if (ret)
|
|
goto unreg;
|
|
}
|
|
|
|
q->mq_sysfs_init_done = true;
|
|
|
|
out:
|
|
return ret;
|
|
|
|
unreg:
|
|
while (--i >= 0)
|
|
blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
|
|
|
|
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
|
|
kobject_del(q->mq_kobj);
|
|
kobject_put(&dev->kobj);
|
|
return ret;
|
|
}
|
|
|
|
void blk_mq_sysfs_unregister(struct request_queue *q)
|
|
{
|
|
struct blk_mq_hw_ctx *hctx;
|
|
int i;
|
|
|
|
mutex_lock(&q->sysfs_dir_lock);
|
|
if (!q->mq_sysfs_init_done)
|
|
goto unlock;
|
|
|
|
queue_for_each_hw_ctx(q, hctx, i)
|
|
blk_mq_unregister_hctx(hctx);
|
|
|
|
unlock:
|
|
mutex_unlock(&q->sysfs_dir_lock);
|
|
}
|
|
|
|
int blk_mq_sysfs_register(struct request_queue *q)
|
|
{
|
|
struct blk_mq_hw_ctx *hctx;
|
|
int i, ret = 0;
|
|
|
|
mutex_lock(&q->sysfs_dir_lock);
|
|
if (!q->mq_sysfs_init_done)
|
|
goto unlock;
|
|
|
|
queue_for_each_hw_ctx(q, hctx, i) {
|
|
ret = blk_mq_register_hctx(hctx);
|
|
if (ret)
|
|
break;
|
|
}
|
|
|
|
unlock:
|
|
mutex_unlock(&q->sysfs_dir_lock);
|
|
|
|
return ret;
|
|
}
|