blktrace: Protect q->blk_trace with RCU

KASAN is reporting that __blk_add_trace() has a use-after-free issue
when accessing q->blk_trace. Indeed the switching of block tracing (and
thus eventual freeing of q->blk_trace) is completely unsynchronized with
the currently running tracing and thus it can happen that the blk_trace
structure is being freed just while __blk_add_trace() works on it.
Protect accesses to q->blk_trace by RCU during tracing and make sure we
wait for the end of RCU grace period when shutting down tracing. Luckily
that is rare enough event that we can afford that. Note that postponing
the freeing of blk_trace to an RCU callback should better be avoided as
it could have unexpected user visible side-effects as debugfs files
would be still existing for a short while block tracing has been shut
down.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=205711
CC: stable@vger.kernel.org
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reported-by: Tristan Madani <tristmd@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Jan Kara 2020-02-06 15:28:12 +01:00 committed by Jens Axboe
parent 01e99aeca3
commit c780e86dd4
3 changed files with 97 additions and 37 deletions

View File

@ -524,7 +524,7 @@ struct request_queue {
unsigned int sg_reserved_size; unsigned int sg_reserved_size;
int node; int node;
#ifdef CONFIG_BLK_DEV_IO_TRACE #ifdef CONFIG_BLK_DEV_IO_TRACE
struct blk_trace *blk_trace; struct blk_trace __rcu *blk_trace;
struct mutex blk_trace_mutex; struct mutex blk_trace_mutex;
#endif #endif
/* /*

View File

@ -51,9 +51,13 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f
**/ **/
#define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ #define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \
do { \ do { \
struct blk_trace *bt = (q)->blk_trace; \ struct blk_trace *bt; \
\
rcu_read_lock(); \
bt = rcu_dereference((q)->blk_trace); \
if (unlikely(bt)) \ if (unlikely(bt)) \
__trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\
rcu_read_unlock(); \
} while (0) } while (0)
#define blk_add_trace_msg(q, fmt, ...) \ #define blk_add_trace_msg(q, fmt, ...) \
blk_add_cgroup_trace_msg(q, NULL, fmt, ##__VA_ARGS__) blk_add_cgroup_trace_msg(q, NULL, fmt, ##__VA_ARGS__)
@ -61,10 +65,14 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f
static inline bool blk_trace_note_message_enabled(struct request_queue *q) static inline bool blk_trace_note_message_enabled(struct request_queue *q)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
if (likely(!bt)) bool ret;
return false;
return bt->act_mask & BLK_TC_NOTIFY; rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
ret = bt && (bt->act_mask & BLK_TC_NOTIFY);
rcu_read_unlock();
return ret;
} }
extern void blk_add_driver_data(struct request_queue *q, struct request *rq, extern void blk_add_driver_data(struct request_queue *q, struct request *rq,

View File

@ -335,6 +335,7 @@ static void put_probe_ref(void)
static void blk_trace_cleanup(struct blk_trace *bt) static void blk_trace_cleanup(struct blk_trace *bt)
{ {
synchronize_rcu();
blk_trace_free(bt); blk_trace_free(bt);
put_probe_ref(); put_probe_ref();
} }
@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
static int __blk_trace_startstop(struct request_queue *q, int start) static int __blk_trace_startstop(struct request_queue *q, int start)
{ {
int ret; int ret;
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL) if (bt == NULL)
return -EINVAL; return -EINVAL;
@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
void blk_trace_shutdown(struct request_queue *q) void blk_trace_shutdown(struct request_queue *q)
{ {
mutex_lock(&q->blk_trace_mutex); mutex_lock(&q->blk_trace_mutex);
if (rcu_dereference_protected(q->blk_trace,
if (q->blk_trace) { lockdep_is_held(&q->blk_trace_mutex))) {
__blk_trace_startstop(q, 0); __blk_trace_startstop(q, 0);
__blk_trace_remove(q); __blk_trace_remove(q);
} }
@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q)
#ifdef CONFIG_BLK_CGROUP #ifdef CONFIG_BLK_CGROUP
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
/* We don't use the 'bt' value here except as an optimization... */
bt = rcu_dereference_protected(q->blk_trace, 1);
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return 0; return 0;
@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
static void blk_add_trace_rq(struct request *rq, int error, static void blk_add_trace_rq(struct request *rq, int error,
unsigned int nr_bytes, u32 what, u64 cgid) unsigned int nr_bytes, u32 what, u64 cgid)
{ {
struct blk_trace *bt = rq->q->blk_trace; struct blk_trace *bt;
if (likely(!bt)) rcu_read_lock();
bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return; return;
}
if (blk_rq_is_passthrough(rq)) if (blk_rq_is_passthrough(rq))
what |= BLK_TC_ACT(BLK_TC_PC); what |= BLK_TC_ACT(BLK_TC_PC);
@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error,
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
rq->cmd_flags, what, error, 0, NULL, cgid); rq->cmd_flags, what, error, 0, NULL, cgid);
rcu_read_unlock();
} }
static void blk_add_trace_rq_insert(void *ignore, static void blk_add_trace_rq_insert(void *ignore,
@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error) u32 what, int error)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
if (likely(!bt)) rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return; return;
}
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, what, error, 0, NULL, bio_op(bio), bio->bi_opf, what, error, 0, NULL,
blk_trace_bio_get_cgid(q, bio)); blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
} }
static void blk_add_trace_bio_bounce(void *ignore, static void blk_add_trace_bio_bounce(void *ignore,
@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore,
if (bio) if (bio)
blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else { else {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt) if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
NULL, 0); NULL, 0);
rcu_read_unlock();
} }
} }
@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bio) if (bio)
blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else { else {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt) if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
0, 0, NULL, 0); 0, 0, NULL, 0);
rcu_read_unlock();
} }
} }
static void blk_add_trace_plug(void *ignore, struct request_queue *q) static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt) if (bt)
__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
rcu_read_unlock();
} }
static void blk_add_trace_unplug(void *ignore, struct request_queue *q, static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
unsigned int depth, bool explicit) unsigned int depth, bool explicit)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt) { if (bt) {
__be64 rpdu = cpu_to_be64(depth); __be64 rpdu = cpu_to_be64(depth);
u32 what; u32 what;
@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
} }
rcu_read_unlock();
} }
static void blk_add_trace_split(void *ignore, static void blk_add_trace_split(void *ignore,
struct request_queue *q, struct bio *bio, struct request_queue *q, struct bio *bio,
unsigned int pdu) unsigned int pdu)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt) { if (bt) {
__be64 rpdu = cpu_to_be64(pdu); __be64 rpdu = cpu_to_be64(pdu);
@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore,
BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
&rpdu, blk_trace_bio_get_cgid(q, bio)); &rpdu, blk_trace_bio_get_cgid(q, bio));
} }
rcu_read_unlock();
} }
/** /**
@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore,
struct request_queue *q, struct bio *bio, struct request_queue *q, struct bio *bio,
dev_t dev, sector_t from) dev_t dev, sector_t from)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
struct blk_io_trace_remap r; struct blk_io_trace_remap r;
if (likely(!bt)) rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return; return;
}
r.device_from = cpu_to_be32(dev); r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(bio_dev(bio)); r.device_to = cpu_to_be32(bio_dev(bio));
@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
} }
/** /**
@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore,
struct request *rq, dev_t dev, struct request *rq, dev_t dev,
sector_t from) sector_t from)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
struct blk_io_trace_remap r; struct blk_io_trace_remap r;
if (likely(!bt)) rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return; return;
}
r.device_from = cpu_to_be32(dev); r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0, rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
rcu_read_unlock();
} }
/** /**
@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q,
struct request *rq, struct request *rq,
void *data, size_t len) void *data, size_t len)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt;
if (likely(!bt)) rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return; return;
}
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, 0, len, data, BLK_TA_DRV_DATA, 0, len, data,
blk_trace_request_get_cgid(q, rq)); blk_trace_request_get_cgid(q, rq));
rcu_read_unlock();
} }
EXPORT_SYMBOL_GPL(blk_add_driver_data); EXPORT_SYMBOL_GPL(blk_add_driver_data);
@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
return -EINVAL; return -EINVAL;
put_probe_ref(); put_probe_ref();
synchronize_rcu();
blk_trace_free(bt); blk_trace_free(bt);
return 0; return 0;
} }
@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct hd_struct *p = dev_to_part(dev); struct hd_struct *p = dev_to_part(dev);
struct request_queue *q; struct request_queue *q;
struct block_device *bdev; struct block_device *bdev;
struct blk_trace *bt;
ssize_t ret = -ENXIO; ssize_t ret = -ENXIO;
bdev = bdget(part_devt(p)); bdev = bdget(part_devt(p));
@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
mutex_lock(&q->blk_trace_mutex); mutex_lock(&q->blk_trace_mutex);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) { if (attr == &dev_attr_enable) {
ret = sprintf(buf, "%u\n", !!q->blk_trace); ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev; goto out_unlock_bdev;
} }
if (q->blk_trace == NULL) if (bt == NULL)
ret = sprintf(buf, "disabled\n"); ret = sprintf(buf, "disabled\n");
else if (attr == &dev_attr_act_mask) else if (attr == &dev_attr_act_mask)
ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); ret = blk_trace_mask2str(buf, bt->act_mask);
else if (attr == &dev_attr_pid) else if (attr == &dev_attr_pid)
ret = sprintf(buf, "%u\n", q->blk_trace->pid); ret = sprintf(buf, "%u\n", bt->pid);
else if (attr == &dev_attr_start_lba) else if (attr == &dev_attr_start_lba)
ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); ret = sprintf(buf, "%llu\n", bt->start_lba);
else if (attr == &dev_attr_end_lba) else if (attr == &dev_attr_end_lba)
ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev: out_unlock_bdev:
mutex_unlock(&q->blk_trace_mutex); mutex_unlock(&q->blk_trace_mutex);
@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct block_device *bdev; struct block_device *bdev;
struct request_queue *q; struct request_queue *q;
struct hd_struct *p; struct hd_struct *p;
struct blk_trace *bt;
u64 value; u64 value;
ssize_t ret = -EINVAL; ssize_t ret = -EINVAL;
@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
mutex_lock(&q->blk_trace_mutex); mutex_lock(&q->blk_trace_mutex);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) { if (attr == &dev_attr_enable) {
if (!!value == !!q->blk_trace) { if (!!value == !!bt) {
ret = 0; ret = 0;
goto out_unlock_bdev; goto out_unlock_bdev;
} }
@ -1844,18 +1896,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
} }
ret = 0; ret = 0;
if (q->blk_trace == NULL) if (bt == NULL)
ret = blk_trace_setup_queue(q, bdev); ret = blk_trace_setup_queue(q, bdev);
if (ret == 0) { if (ret == 0) {
if (attr == &dev_attr_act_mask) if (attr == &dev_attr_act_mask)
q->blk_trace->act_mask = value; bt->act_mask = value;
else if (attr == &dev_attr_pid) else if (attr == &dev_attr_pid)
q->blk_trace->pid = value; bt->pid = value;
else if (attr == &dev_attr_start_lba) else if (attr == &dev_attr_start_lba)
q->blk_trace->start_lba = value; bt->start_lba = value;
else if (attr == &dev_attr_end_lba) else if (attr == &dev_attr_end_lba)
q->blk_trace->end_lba = value; bt->end_lba = value;
} }
out_unlock_bdev: out_unlock_bdev: