2008-09-14 12:55:09 +00:00
|
|
|
/*
|
|
|
|
* Functions related to generic timeout handling of requests.
|
|
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/blkdev.h>
|
2008-09-14 12:56:33 +00:00
|
|
|
#include <linux/fault-inject.h>
|
2008-09-14 12:55:09 +00:00
|
|
|
|
|
|
|
#include "blk.h"
|
2014-09-22 16:21:48 +00:00
|
|
|
#include "blk-mq.h"
|
2008-09-14 12:55:09 +00:00
|
|
|
|
2008-09-14 12:56:33 +00:00
|
|
|
#ifdef CONFIG_FAIL_IO_TIMEOUT
|
|
|
|
|
|
|
|
static DECLARE_FAULT_ATTR(fail_io_timeout);
|
|
|
|
|
|
|
|
static int __init setup_fail_io_timeout(char *str)
|
|
|
|
{
|
|
|
|
return setup_fault_attr(&fail_io_timeout, str);
|
|
|
|
}
|
|
|
|
__setup("fail_io_timeout=", setup_fail_io_timeout);
|
|
|
|
|
|
|
|
int blk_should_fake_timeout(struct request_queue *q)
|
|
|
|
{
|
|
|
|
if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return should_fail(&fail_io_timeout, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Create the "fail_io_timeout" fault-injection debugfs attribute (no parent
 * directory is passed) and translate the returned dentry into an errno:
 * 0 on success, PTR_ERR() of the dentry otherwise.
 */
static int __init fail_io_timeout_debugfs(void)
{
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_io_timeout", NULL,
					&fail_io_timeout);

	return PTR_ERR_OR_ZERO(dir);
}

late_initcall(fail_io_timeout_debugfs);
|
|
|
|
|
|
|
|
ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
struct gendisk *disk = dev_to_disk(dev);
|
|
|
|
int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags);
|
|
|
|
|
|
|
|
return sprintf(buf, "%d\n", set != 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * part_timeout_store - set or clear QUEUE_FLAG_FAIL_IO from a text value
 * @dev:   device whose gendisk (via dev_to_disk()) owns the queue
 * @attr:  attribute being written (unused)
 * @buf:   input text, parsed as a base-10 number
 * @count: number of bytes supplied
 *
 * A non-zero parsed value sets the flag, zero clears it; the update is made
 * with the queue lock held.  An empty write (@count == 0) changes nothing.
 * Always returns @count.
 */
ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct request_queue *q;
	int val;

	if (!count)
		return count;

	q = disk->queue;

	/* The end pointer is not needed, only the parsed number. */
	val = simple_strtoul(buf, NULL, 10);

	spin_lock_irq(q->queue_lock);
	if (val)
		queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
	else
		queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
	spin_unlock_irq(q->queue_lock);

	return count;
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_FAIL_IO_TIMEOUT */
|
|
|
|
|
2008-09-14 12:55:09 +00:00
|
|
|
/*
|
|
|
|
* blk_delete_timer - Delete/cancel timer for a given function.
|
|
|
|
* @req: request that we are canceling timer for
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
void blk_delete_timer(struct request *req)
|
|
|
|
{
|
|
|
|
list_del_init(&req->timeout_list);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void blk_rq_timed_out(struct request *req)
|
|
|
|
{
|
|
|
|
struct request_queue *q = req->q;
|
2013-01-30 09:26:17 +00:00
|
|
|
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
|
2008-09-14 12:55:09 +00:00
|
|
|
|
2013-01-30 09:26:17 +00:00
|
|
|
if (q->rq_timed_out_fn)
|
|
|
|
ret = q->rq_timed_out_fn(req);
|
2008-09-14 12:55:09 +00:00
|
|
|
switch (ret) {
|
|
|
|
case BLK_EH_HANDLED:
|
2014-09-13 23:40:12 +00:00
|
|
|
__blk_complete_request(req);
|
2008-09-14 12:55:09 +00:00
|
|
|
break;
|
|
|
|
case BLK_EH_RESET_TIMER:
|
2014-04-24 14:51:47 +00:00
|
|
|
blk_add_timer(req);
|
block: fix race between request completion and timeout handling
crocode i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support shpchp ioatdma dca be2net sg ses enclosure ext4 mbcache jbd2 sd_mod crc_t10dif ahci megaraid_sas(U) dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
Pid: 491, comm: scsi_eh_0 Tainted: G W ---------------- 2.6.32-220.13.1.el6.x86_64 #1 IBM -[8722PAX]-/00D1461
RIP: 0010:[<ffffffff8124e424>] [<ffffffff8124e424>] blk_requeue_request+0x94/0xa0
RSP: 0018:ffff881057eefd60 EFLAGS: 00010012
RAX: ffff881d99e3e8a8 RBX: ffff881d99e3e780 RCX: ffff881d99e3e8a8
RDX: ffff881d99e3e8a8 RSI: ffff881d99e3e780 RDI: ffff881d99e3e780
RBP: ffff881057eefd80 R08: ffff881057eefe90 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff881057f92338
R13: 0000000000000000 R14: ffff881057f92338 R15: ffff883058188000
FS: 0000000000000000(0000) GS:ffff880040200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 00000000006d3ec0 CR3: 000000302cd7d000 CR4: 00000000000406b0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process scsi_eh_0 (pid: 491, threadinfo ffff881057eee000, task ffff881057e29540)
Stack:
0000000000001057 0000000000000286 ffff8810275efdc0 ffff881057f16000
<0> ffff881057eefdd0 ffffffff81362323 ffff881057eefe20 ffffffff8135f393
<0> ffff881057e29af8 ffff8810275efdc0 ffff881057eefe78 ffff881057eefe90
Call Trace:
[<ffffffff81362323>] __scsi_queue_insert+0xa3/0x150
[<ffffffff8135f393>] ? scsi_eh_ready_devs+0x5e3/0x850
[<ffffffff81362a23>] scsi_queue_insert+0x13/0x20
[<ffffffff8135e4d4>] scsi_eh_flush_done_q+0x104/0x160
[<ffffffff8135fb6b>] scsi_error_handler+0x35b/0x660
[<ffffffff8135f810>] ? scsi_error_handler+0x0/0x660
[<ffffffff810908c6>] kthread+0x96/0xa0
[<ffffffff8100c14a>] child_rip+0xa/0x20
[<ffffffff81090830>] ? kthread+0x0/0xa0
[<ffffffff8100c140>] ? child_rip+0x0/0x20
Code: 00 00 eb d1 4c 8b 2d 3c 8f 97 00 4d 85 ed 74 bf 49 8b 45 00 49 83 c5 08 48 89 de 4c 89 e7 ff d0 49 8b 45 00 48 85 c0 75 eb eb a4 <0f> 0b eb fe 0f 1f 84 00 00 00 00 00 55 48 89 e5 0f 1f 44 00 00
RIP [<ffffffff8124e424>] blk_requeue_request+0x94/0xa0
RSP <ffff881057eefd60>
The RIP is this line:
BUG_ON(blk_queued_rq(rq));
After digging through the code, I think there may be a race between the
request completion and the timer handler running.
A timer is started for each request put on the device's queue (see
blk_start_request->blk_add_timer). If the request does not complete
before the timer expires, the timer handler (blk_rq_timed_out_timer)
will mark the request complete atomically:
static inline int blk_mark_rq_complete(struct request *rq)
{
return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}
and then call blk_rq_timed_out. The latter function will call
scsi_times_out, which will return one of BLK_EH_HANDLED,
BLK_EH_RESET_TIMER or BLK_EH_NOT_HANDLED. If BLK_EH_RESET_TIMER is
returned, blk_clear_rq_complete is called, and blk_add_timer is again
called to simply wait longer for the request to complete.
Now, if the request happens to complete while this is going on, what
happens? Given that we know the completion handler will bail if it
finds the REQ_ATOM_COMPLETE bit set, we need to focus on the completion
handler running after that bit is cleared. So, from the above
paragraph, after the call to blk_clear_rq_complete. If the completion
sets REQ_ATOM_COMPLETE before the BUG_ON in blk_add_timer, we go boom
there (I haven't seen this in the cores). Next, if we get the
completion before the call to list_add_tail, then the timer will
eventually fire for an old req, which may either be freed or reallocated
(there is evidence that this might be the case). Finally, if the
completion comes in *after* the addition to the timeout list, I think
it's harmless. The request will be removed from the timeout list,
req_atom_complete will be set, and all will be well.
This will only actually explain the coredumps *IF* the request
structure was freed, reallocated *and* queued before the error handler
thread had a chance to process it. That is possible, but it may make
sense to keep digging for another race. I think that if this is what
was happening, we would see other instances of this problem showing up
as null pointer or garbage pointer dereferences, for example when the
request structure was not re-used. It looks like we actually do run
into that situation in other reports.
This patch moves the BUG_ON(test_bit(REQ_ATOM_COMPLETE,
&req->atomic_flags)); from blk_add_timer to the only caller that could
trip over it (blk_start_request). It then inverts the calls to
blk_clear_rq_complete and blk_add_timer in blk_rq_timed_out to address
the race. I've boot tested this patch, but nothing more.
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Hannes Reinecke <hare@suse.de>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2013-10-08 18:36:41 +00:00
|
|
|
blk_clear_rq_complete(req);
|
2008-09-14 12:55:09 +00:00
|
|
|
break;
|
|
|
|
case BLK_EH_NOT_HANDLED:
|
|
|
|
/*
|
|
|
|
* LLD handles this for now but in the future
|
|
|
|
* we can send a request msg to abort the command
|
|
|
|
* and we can move more of the generic scsi eh code to
|
|
|
|
* the blk layer.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
printk(KERN_ERR "block: bad eh return: %d\n", ret);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-09-13 23:40:12 +00:00
|
|
|
static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
|
blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices:
- The classic request_fn based approach, where drivers use struct
request units for IO. The block layer provides various helper
functionalities to let drivers share code, things like tag
management, timeout handling, queueing, etc.
- The "stacked" approach, where a driver squeezes in between the
block layer and IO submitter. Since this bypasses the IO stack,
driver generally have to manage everything themselves.
With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS was rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.
The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and along with that we get all the problems again
that the shared approach solved.
This commit introduces blk-mq, block multi queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into x number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.
blk-mq provides various helper functions, which include:
- Scalable support for request tagging. Most devices need to
be able to uniquely identify a request both in the driver and
to the hardware. The tagging uses per-cpu caches for freed
tags, to enable cache hot reuse.
- Timeout handling without tracking request on a per-device
basis. Basically the driver should be able to get a notification,
if a request happens to fail.
- Optional support for non 1:1 mappings between issue and
submission queues. blk-mq can redirect IO completions to the
desired location.
- Support for per-request payloads. Drivers almost always need
to associate a request structure with some driver private
command structure. Drivers can tell blk-mq this at init time,
and then any request handed to the driver will have the
required size of memory associated with it.
- Support for merging of IO, and plugging. The stacked model
gets neither of these. Even for high IOPS devices, merging
sequential IO reduces per-command overhead and thus
increases bandwidth.
For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked model. That would get us back to having just 1 real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).
Contributions in this patch from the following people:
Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2013-10-24 08:20:05 +00:00
|
|
|
unsigned int *next_set)
|
|
|
|
{
|
|
|
|
if (time_after_eq(jiffies, rq->deadline)) {
|
|
|
|
list_del_init(&rq->timeout_list);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if we raced with end io completion
|
|
|
|
*/
|
|
|
|
if (!blk_mark_rq_complete(rq))
|
|
|
|
blk_rq_timed_out(rq);
|
|
|
|
} else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
|
|
|
|
*next_timeout = rq->deadline;
|
|
|
|
*next_set = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-30 12:57:30 +00:00
|
|
|
void blk_timeout_work(struct work_struct *work)
|
2008-09-14 12:55:09 +00:00
|
|
|
{
|
2015-10-30 12:57:30 +00:00
|
|
|
struct request_queue *q =
|
|
|
|
container_of(work, struct request_queue, timeout_work);
|
2008-10-30 07:51:58 +00:00
|
|
|
unsigned long flags, next = 0;
|
2008-09-14 12:55:09 +00:00
|
|
|
struct request *rq, *tmp;
|
2010-04-14 18:54:03 +00:00
|
|
|
int next_set = 0;
|
2008-09-14 12:55:09 +00:00
|
|
|
|
|
|
|
spin_lock_irqsave(q->queue_lock, flags);
|
|
|
|
|
blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices:
- The classic request_fn based approach, where drivers use struct
request units for IO. The block layer provides various helper
functionalities to let drivers share code, things like tag
management, timeout handling, queueing, etc.
- The "stacked" approach, where a driver squeezes in between the
block layer and IO submitter. Since this bypasses the IO stack,
driver generally have to manage everything themselves.
With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS was rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.
The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and along with that we get all the problems again
that the shared approach solved.
This commit introduces blk-mq, block multi queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into x number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.
blk-mq provides various helper functions, which include:
- Scalable support for request tagging. Most devices need to
be able to uniquely identify a request both in the driver and
to the hardware. The tagging uses per-cpu caches for freed
tags, to enable cache hot reuse.
- Timeout handling without tracking request on a per-device
basis. Basically the driver should be able to get a notification,
if a request happens to fail.
- Optional support for non 1:1 mappings between issue and
submission queues. blk-mq can redirect IO completions to the
desired location.
- Support for per-request payloads. Drivers almost always need
to associate a request structure with some driver private
command structure. Drivers can tell blk-mq this at init time,
and then any request handed to the driver will have the
required size of memory associated with it.
- Support for merging of IO, and plugging. The stacked model
gets neither of these. Even for high IOPS devices, merging
sequential IO reduces per-command overhead and thus
increases bandwidth.
For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked model. That would get us back to having just 1 real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).
Contributions in this patch from the following people:
Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2013-10-24 08:20:05 +00:00
|
|
|
list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
|
|
|
|
blk_rq_check_expired(rq, &next, &next_set);
|
2008-09-14 12:55:09 +00:00
|
|
|
|
2010-04-14 18:54:03 +00:00
|
|
|
if (next_set)
|
2008-11-06 07:42:49 +00:00
|
|
|
mod_timer(&q->timeout, round_jiffies_up(next));
|
2008-09-14 12:55:09 +00:00
|
|
|
|
|
|
|
spin_unlock_irqrestore(q->queue_lock, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* blk_abort_request -- Request request recovery for the specified command
|
|
|
|
* @req: pointer to the request of interest
|
|
|
|
*
|
|
|
|
* This function requests that the block layer start recovery for the
|
|
|
|
* request by deleting the timer and calling the q's timeout function.
|
|
|
|
* LLDDs who implement their own error recovery MAY ignore the timeout
|
|
|
|
* event if they generated blk_abort_req. Must hold queue lock.
|
|
|
|
*/
|
|
|
|
void blk_abort_request(struct request *req)
|
|
|
|
{
|
2015-10-30 12:47:04 +00:00
|
|
|
if (req->q->mq_ops) {
|
2018-01-09 16:29:50 +00:00
|
|
|
/*
|
|
|
|
* All we need to ensure is that timeout scan takes place
|
|
|
|
* immediately and that scan sees the new timeout value.
|
|
|
|
* No need for fancy synchronizations.
|
|
|
|
*/
|
|
|
|
req->deadline = jiffies;
|
|
|
|
mod_timer(&req->q->timeout, 0);
|
2015-10-30 12:47:04 +00:00
|
|
|
} else {
|
2018-01-09 16:29:50 +00:00
|
|
|
if (blk_mark_rq_complete(req))
|
|
|
|
return;
|
2015-10-30 12:47:04 +00:00
|
|
|
blk_delete_timer(req);
|
2014-09-22 16:21:48 +00:00
|
|
|
blk_rq_timed_out(req);
|
2015-10-30 12:47:04 +00:00
|
|
|
}
|
2008-09-14 12:55:09 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(blk_abort_request);
|
|
|
|
|
2014-05-13 21:10:52 +00:00
|
|
|
unsigned long blk_rq_timeout(unsigned long timeout)
|
|
|
|
{
|
|
|
|
unsigned long maxt;
|
|
|
|
|
|
|
|
maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
|
|
|
|
if (time_after(timeout, maxt))
|
|
|
|
timeout = maxt;
|
|
|
|
|
|
|
|
return timeout;
|
|
|
|
}
|
|
|
|
|
2014-04-25 12:14:48 +00:00
|
|
|
/**
|
|
|
|
* blk_add_timer - Start timeout timer for a single request
|
|
|
|
* @req: request that is about to start running.
|
|
|
|
*
|
|
|
|
* Notes:
|
|
|
|
* Each request has its own timer, and as it is added to the queue, we
|
|
|
|
* set up the timer. When the request completes, we cancel the timer.
|
|
|
|
*/
|
|
|
|
void blk_add_timer(struct request *req)
|
2008-09-14 12:55:09 +00:00
|
|
|
{
|
|
|
|
struct request_queue *q = req->q;
|
|
|
|
unsigned long expiry;
|
|
|
|
|
2017-06-20 18:15:45 +00:00
|
|
|
if (!q->mq_ops)
|
|
|
|
lockdep_assert_held(q->queue_lock);
|
|
|
|
|
2014-09-19 13:53:46 +00:00
|
|
|
/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
|
|
|
|
if (!q->mq_ops && !q->rq_timed_out_fn)
|
2008-09-14 12:55:09 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
BUG_ON(!list_empty(&req->timeout_list));
|
|
|
|
|
2009-04-23 02:05:18 +00:00
|
|
|
/*
|
|
|
|
* Some LLDs, like scsi, peek at the timeout to prevent a
|
|
|
|
* command from being retried forever.
|
|
|
|
*/
|
|
|
|
if (!req->timeout)
|
2008-09-14 12:55:09 +00:00
|
|
|
req->timeout = q->rq_timeout;
|
2009-04-23 02:05:18 +00:00
|
|
|
|
blk-mq: replace timeout synchronization with a RCU and generation based scheme
Currently, blk-mq timeout path synchronizes against the usual
issue/completion path using a complex scheme involving atomic
bitflags, REQ_ATOM_*, memory barriers and subtle memory coherence
rules. Unfortunately, it contains quite a few holes.
There's a complex dancing around REQ_ATOM_STARTED and
REQ_ATOM_COMPLETE between issue/completion and timeout paths; however,
they don't have a synchronization point across request recycle
instances and it isn't clear what the barriers add.
blk_mq_check_expired() can easily read STARTED from N-2'th iteration,
deadline from N-1'th, blk_mark_rq_complete() against Nth instance.
In fact, it's pretty easy to make blk_mq_check_expired() terminate a
later instance of a request. If we induce 5 sec delay before
time_after_eq() test in blk_mq_check_expired(), shorten the timeout to
2s, and issue back-to-back large IOs, blk-mq starts timing out
requests spuriously pretty quickly. Nothing actually timed out. It
just made the call on a recycle instance of a request and then
terminated a later instance long after the original instance finished.
The scenario isn't theoretical either.
This patch replaces the broken synchronization mechanism with a RCU
and generation number based one.
1. Each request has a u64 generation + state value, which can be
updated only by the request owner. Whenever a request becomes
in-flight, the generation number gets bumped up too. This provides
the basis for the timeout path to distinguish different recycle
instances of the request.
Also, marking a request in-flight and setting its deadline are
protected with a seqcount so that the timeout path can fetch both
values coherently.
2. The timeout path fetches the generation, state and deadline. If
the verdict is timeout, it records the generation into a dedicated
request abortion field and does RCU wait.
3. The completion path is also protected by RCU (from the previous
patch) and checks whether the current generation number and state
match the abortion field. If so, it skips completion.
4. The timeout path, after RCU wait, scans requests again and
terminates the ones whose generation and state still match the ones
requested for abortion.
By now, the timeout path knows that either the generation number
and state changed if it lost the race or the completion will yield
to it and can safely timeout the request.
While it's more lines of code, it's conceptually simpler, doesn't
depend on direct use of subtle memory ordering or coherence, and
hopefully doesn't terminate the wrong instance.
While this change makes REQ_ATOM_COMPLETE synchronization unnecessary
between issue/complete and timeout paths, REQ_ATOM_COMPLETE isn't
removed yet as it's still used in other places. Future patches will
move all state tracking to the new mechanism and remove all bitops in
the hot paths.
Note that this patch adds a comment explaining a race condition in
BLK_EH_RESET_TIMER path. The race has always been there and this
patch doesn't change it. It's just documenting the existing race.
v2: - Fixed BLK_EH_RESET_TIMER handling as pointed out by Jianchao.
- s/request->gstate_seqc/request->gstate_seq/ as suggested by Peter.
- READ_ONCE() added in blk_mq_rq_update_state() as suggested by Peter.
v3: - Fixed possible extended seqcount / u64_stats_sync read looping
spotted by Peter.
- MQ_RQ_IDLE was incorrectly being set in complete_request instead
of free_request. Fixed.
v4: - Rebased on top of hctx_lock() refactoring patch.
- Added comment explaining the use of hctx_lock() in completion path.
v5: - Added comments requested by Bart.
- Note the addition of BLK_EH_RESET_TIMER race condition in the
commit message.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <Bart.VanAssche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-01-09 16:29:48 +00:00
|
|
|
req->deadline = jiffies + req->timeout;
|
2015-11-24 22:58:53 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Only the non-mq case needs to add the request to a protected list.
|
|
|
|
* For the mq case we simply scan the tag map.
|
|
|
|
*/
|
2014-04-25 12:14:48 +00:00
|
|
|
if (!q->mq_ops)
|
|
|
|
list_add_tail(&req->timeout_list, &req->q->timeout_list);
|
2008-09-14 12:55:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the timer isn't already pending or this timeout is earlier
|
2008-11-06 07:42:49 +00:00
|
|
|
* than an existing one, modify the timer. Round up to next nearest
|
2008-09-14 12:55:09 +00:00
|
|
|
* second.
|
|
|
|
*/
|
2014-05-13 21:10:52 +00:00
|
|
|
expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
|
2008-09-14 12:55:09 +00:00
|
|
|
|
|
|
|
if (!timer_pending(&q->timeout) ||
|
2014-04-16 17:36:54 +00:00
|
|
|
time_before(expiry, q->timeout.expires)) {
|
|
|
|
unsigned long diff = q->timeout.expires - expiry;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Due to added timer slack to group timers, the timer
|
|
|
|
* will often be a little in front of what we asked for.
|
|
|
|
* So apply some tolerance here too, otherwise we keep
|
|
|
|
* modifying the timer because expires for value X
|
|
|
|
* will be X + something.
|
|
|
|
*/
|
2014-05-30 21:41:39 +00:00
|
|
|
if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
|
2014-04-16 17:36:54 +00:00
|
|
|
mod_timer(&q->timeout, expiry);
|
|
|
|
}
|
blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices:
- The classic request_fn based approach, where drivers use struct
request units for IO. The block layer provides various helper
functionalities to let drivers share code, things like tag
management, timeout handling, queueing, etc.
- The "stacked" approach, where a driver squeezes in between the
block layer and IO submitter. Since this bypasses the IO stack,
driver generally have to manage everything themselves.
With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS was rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.
The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and along with that we get all the problems again
that the shared approach solved.
This commit introduces blk-mq, block multi queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into x number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.
blk-mq provides various helper functions, which include:
- Scalable support for request tagging. Most devices need to
be able to uniquely identify a request both in the driver and
to the hardware. The tagging uses per-cpu caches for freed
tags, to enable cache hot reuse.
- Timeout handling without tracking request on a per-device
basis. Basically the driver should be able to get a notification,
if a request happens to fail.
- Optional support for non 1:1 mappings between issue and
submission queues. blk-mq can redirect IO completions to the
desired location.
- Support for per-request payloads. Drivers almost always need
to associate a request structure with some driver private
command structure. Drivers can tell blk-mq this at init time,
and then any request handed to the driver will have the
required size of memory associated with it.
- Support for merging of IO, and plugging. The stacked model
gets neither of these. Even for high IOPS devices, merging
sequential IO reduces per-command overhead and thus
increases bandwidth.
For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked model. That would get us back to having just 1 real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).
Contributions in this patch from the following people:
Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2013-10-24 08:20:05 +00:00
|
|
|
|
|
|
|
}
|