2019-05-19 12:08:20 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2007-10-22 01:03:38 +00:00
|
|
|
//#define DEBUG
|
|
|
|
#include <linux/spinlock.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
|
|
|
#include <linux/slab.h>
|
2007-10-22 01:03:38 +00:00
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/hdreg.h>
|
2011-07-01 19:56:05 +00:00
|
|
|
#include <linux/module.h>
|
2012-01-12 05:14:44 +00:00
|
|
|
#include <linux/mutex.h>
|
2017-02-05 17:15:25 +00:00
|
|
|
#include <linux/interrupt.h>
|
2007-10-22 01:03:38 +00:00
|
|
|
#include <linux/virtio.h>
|
|
|
|
#include <linux/virtio_blk.h>
|
2007-10-24 11:21:21 +00:00
|
|
|
#include <linux/scatterlist.h>
|
2011-02-01 20:43:48 +00:00
|
|
|
#include <linux/string_helpers.h>
|
2011-10-30 19:29:59 +00:00
|
|
|
#include <linux/idr.h>
|
2013-11-01 16:52:52 +00:00
|
|
|
#include <linux/blk-mq.h>
|
2017-02-05 17:15:25 +00:00
|
|
|
#include <linux/blk-mq-virtio.h>
|
2013-11-01 16:52:52 +00:00
|
|
|
#include <linux/numa.h>
|
2022-10-16 03:41:27 +00:00
|
|
|
#include <linux/vmalloc.h>
|
2020-04-17 07:14:34 +00:00
|
|
|
#include <uapi/linux/virtio_ring.h>
|
2007-10-24 11:21:21 +00:00
|
|
|
|
2008-01-31 14:53:53 +00:00
|
|
|
/* NOTE(review): presumably the number of minor-number bits reserved for partitions of one disk - confirm at the use sites. */
#define PART_BITS 4
|
2014-06-26 09:41:48 +00:00
|
|
|
/* Size in bytes of the name buffer in struct virtio_blk_vq. */
#define VQ_NAME_LEN 16
|
2018-11-01 22:40:35 +00:00
|
|
|
/* NOTE(review): presumably an upper bound on the segments of one discard request - confirm at the use sites. */
#define MAX_DISCARD_SEGMENTS 256u
|
2007-10-22 01:03:38 +00:00
|
|
|
|
2021-05-24 15:40:20 +00:00
|
|
|
/* The maximum number of sg elements that fit into a virtqueue */
|
|
|
|
#define VIRTIO_BLK_MAX_SG_ELEMS 32768
|
|
|
|
|
2021-09-01 13:14:34 +00:00
|
|
|
/*
 * VIRTIO_BLK_INLINE_SG_CNT is 0 when the architecture sets
 * CONFIG_ARCH_NO_SG_CHAIN and 2 otherwise.
 * NOTE(review): presumably the number of scatterlist entries kept inline
 * in each request before chaining is needed - confirm at the use sites.
 */
#ifdef CONFIG_ARCH_NO_SG_CHAIN
|
|
|
|
#define VIRTIO_BLK_INLINE_SG_CNT 0
|
|
|
|
#else
|
|
|
|
#define VIRTIO_BLK_INLINE_SG_CNT 2
|
|
|
|
#endif
|
|
|
|
|
2021-09-02 20:46:22 +00:00
|
|
|
/*
 * Module parameter "num_request_queues" (unsigned int, permission bits 0644).
 * Per its description string: limits the number of request queues used for
 * a blk device, 0 means no limit, and values above nr_cpu_ids are truncated
 * to nr_cpu_ids.
 */
static unsigned int num_request_queues;
|
2021-10-24 13:41:40 +00:00
|
|
|
module_param(num_request_queues, uint, 0644);
|
2021-09-02 20:46:22 +00:00
|
|
|
MODULE_PARM_DESC(num_request_queues,
|
|
|
|
"Limit the number of request queues to use for blk device. "
|
|
|
|
"0 for no limit. "
|
|
|
|
"Values > nr_cpu_ids truncated to nr_cpu_ids.");
|
|
|
|
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver does not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that considers MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
/*
 * Module parameter "poll_queues" (unsigned int, permission bits 0644):
 * the number of dedicated virtqueues for polling I/O, per its description
 * string.
 */
static unsigned int poll_queues;
|
|
|
|
module_param(poll_queues, uint, 0644);
|
|
|
|
MODULE_PARM_DESC(poll_queues, "The number of dedicated virtqueues for polling I/O");
|
|
|
|
|
2011-10-30 19:29:59 +00:00
|
|
|
/* NOTE(review): presumably the block major number registered for this driver - confirm where it is assigned. */
static int major;
|
|
|
|
/* IDA instance; NOTE(review): presumably the allocator behind the per-device "index" field of struct virtio_blk - confirm. */
static DEFINE_IDA(vd_index_ida);
|
|
|
|
|
2013-05-20 00:55:39 +00:00
|
|
|
/* Driver workqueue pointer; NOTE(review): presumably where config_work items are queued - confirm at the queueing site. */
static struct workqueue_struct *virtblk_wq;
|
2008-01-31 14:53:53 +00:00
|
|
|
|
2014-06-26 09:41:48 +00:00
|
|
|
/*
 * State kept for one virtqueue: the virtqueue itself, a spinlock and a
 * fixed-size name buffer. The structure is cache-line aligned on SMP
 * builds (____cacheline_aligned_in_smp).
 */
struct virtio_blk_vq {
|
|
|
|
/* The virtqueue this entry describes. */
struct virtqueue *vq;
|
|
|
|
/* NOTE(review): presumably serializes access to vq - confirm against the users of this lock. */
spinlock_t lock;
|
|
|
|
/* Name buffer of VQ_NAME_LEN bytes for this virtqueue. */
char name[VQ_NAME_LEN];
|
|
|
|
} ____cacheline_aligned_in_smp;
|
|
|
|
|
2015-01-15 11:33:31 +00:00
|
|
|
/*
 * Driver state for one virtio block device: the virtio device and gendisk
 * it is bound to, the blk-mq tag set, config-change work, the allocated
 * index and the virtqueue bookkeeping.
 */
struct virtio_blk {
|
2020-04-30 14:04:42 +00:00
|
|
|
/*
|
|
|
|
* This mutex must be held by anything that may run after
|
|
|
|
* virtblk_remove() sets vblk->vdev to NULL.
|
|
|
|
*
|
|
|
|
* blk-mq, virtqueue processing, and sysfs attribute code paths are
|
|
|
|
* shut down before vblk->vdev is set to NULL and therefore do not need
|
|
|
|
* to hold this mutex.
|
|
|
|
*/
|
|
|
|
struct mutex vdev_mutex;
|
2007-10-22 01:03:38 +00:00
|
|
|
/* The underlying virtio device; set to NULL by virtblk_remove() (see vdev_mutex above). */
struct virtio_device *vdev;
|
|
|
|
|
|
|
|
|
|
/* The disk structure for the kernel. */
|
|
|
|
struct gendisk *disk;
|
|
|
|
|
2014-04-15 20:14:00 +00:00
|
|
|
/* Block layer tags. */
|
|
|
|
struct blk_mq_tag_set tag_set;
|
|
|
|
|
2011-02-01 20:43:48 +00:00
|
|
|
/* Process context for config space updates */
|
|
|
|
struct work_struct config_work;
|
|
|
|
|
2011-10-30 19:29:59 +00:00
|
|
|
/* Ida index - used to track minor number allocations. */
|
|
|
|
int index;
|
2014-06-26 09:41:48 +00:00
|
|
|
|
|
|
|
|
/* Number of virtqueues (NOTE(review): presumably the entry count of vqs below - confirm). */
|
|
|
|
int num_vqs;
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
/* One slot per hardware-context type (HCTX_MAX_TYPES); NOTE(review): presumably the queue count of each type, e.g. default vs. poll - confirm where it is filled. */
int io_queues[HCTX_MAX_TYPES];
|
2014-06-26 09:41:48 +00:00
|
|
|
/* Per-virtqueue state (struct virtio_blk_vq); NOTE(review): presumably an array of num_vqs entries - confirm at the allocation site. */
struct virtio_blk_vq *vqs;
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
|
/* For zoned device */
|
|
|
|
unsigned int zone_sectors;
|
2007-10-22 01:03:38 +00:00
|
|
|
};
|
|
|
|
|
2015-01-15 11:33:31 +00:00
|
|
|
/*
 * Driver data kept for one request: the out header, the in header (a plain
 * status byte, or the extended zone-append form), the length of the in
 * header, and the scatter-gather table with its trailing scatterlist array.
 */
struct virtblk_req {
|
2022-10-16 03:41:27 +00:00
|
|
|
/* Out header */
|
2017-01-28 08:32:53 +00:00
|
|
|
struct virtio_blk_outhdr out_hdr;
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
|
/* In header */
|
|
|
|
union {
|
|
|
|
u8 status;
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The zone append command has an extended in header.
|
2023-03-30 21:49:52 +00:00
|
|
|
* The status field in the zone_append in header must always
|
|
|
|
* be the last byte.
|
2022-10-16 03:41:27 +00:00
|
|
|
*/
|
|
|
|
struct {
|
2023-03-30 21:49:52 +00:00
|
|
|
__virtio64 sector;
|
2022-10-16 03:41:27 +00:00
|
|
|
u8 status;
|
2023-03-30 21:49:52 +00:00
|
|
|
} zone_append;
|
|
|
|
} in_hdr;
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
|
/* Length in bytes of the in header; NOTE(review): presumably differs between the plain status byte and the zone_append form - confirm where it is set. */
size_t in_hdr_len;
|
|
|
|
|
2021-09-01 13:14:34 +00:00
|
|
|
/* Scatter-gather table for this request's data. */
struct sg_table sg_table;
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypasses the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
/* Flexible array member of scatterlist entries at the end of the request; NOTE(review): presumably sized via VIRTIO_BLK_INLINE_SG_CNT at allocation - confirm. */
struct scatterlist sg[];
|
2007-10-22 01:03:38 +00:00
|
|
|
};
|
|
|
|
|
2022-10-16 03:41:27 +00:00
|
|
|
static inline blk_status_t virtblk_result(u8 status)
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypass the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
{
|
2022-10-16 03:41:27 +00:00
|
|
|
switch (status) {
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypass the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
case VIRTIO_BLK_S_OK:
|
2017-06-03 07:38:04 +00:00
|
|
|
return BLK_STS_OK;
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypass the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
case VIRTIO_BLK_S_UNSUPP:
|
2017-06-03 07:38:04 +00:00
|
|
|
return BLK_STS_NOTSUPP;
|
2022-10-16 03:41:27 +00:00
|
|
|
case VIRTIO_BLK_S_ZONE_OPEN_RESOURCE:
|
|
|
|
return BLK_STS_ZONE_OPEN_RESOURCE;
|
|
|
|
case VIRTIO_BLK_S_ZONE_ACTIVE_RESOURCE:
|
|
|
|
return BLK_STS_ZONE_ACTIVE_RESOURCE;
|
|
|
|
case VIRTIO_BLK_S_IOERR:
|
|
|
|
case VIRTIO_BLK_S_ZONE_UNALIGNED_WP:
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypass the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
default:
|
2017-06-03 07:38:04 +00:00
|
|
|
return BLK_STS_IOERR;
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypasses the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-08-10 16:09:48 +00:00
|
|
|
static inline struct virtio_blk_vq *get_virtio_blk_vq(struct blk_mq_hw_ctx *hctx)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = hctx->queue->queuedata;
|
|
|
|
struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
|
|
|
|
|
|
|
|
return vq;
|
|
|
|
}
|
|
|
|
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr)
|
2017-01-28 08:32:53 +00:00
|
|
|
{
|
2022-10-16 03:41:27 +00:00
|
|
|
struct scatterlist out_hdr, in_hdr, *sgs[3];
|
2017-01-28 08:32:53 +00:00
|
|
|
unsigned int num_out = 0, num_in = 0;
|
|
|
|
|
2022-10-16 03:41:27 +00:00
|
|
|
sg_init_one(&out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
|
|
|
|
sgs[num_out++] = &out_hdr;
|
2013-03-20 05:14:27 +00:00
|
|
|
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
if (vbr->sg_table.nents) {
|
2014-10-07 14:39:49 +00:00
|
|
|
if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
sgs[num_out++] = vbr->sg_table.sgl;
|
2013-03-20 05:14:27 +00:00
|
|
|
else
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
sgs[num_out + num_in++] = vbr->sg_table.sgl;
|
2013-03-20 05:14:27 +00:00
|
|
|
}
|
|
|
|
|
2023-03-30 21:49:52 +00:00
|
|
|
sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len);
|
2022-10-16 03:41:27 +00:00
|
|
|
sgs[num_out + num_in++] = &in_hdr;
|
2013-03-20 05:14:27 +00:00
|
|
|
|
|
|
|
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
|
2013-03-20 05:14:27 +00:00
|
|
|
}
|
|
|
|
|
2022-09-21 08:27:29 +00:00
|
|
|
/*
 * Build the range payload for a discard, write-zeroes or secure-erase
 * request and attach it to @req as its special payload.
 *
 * @req:   request being prepared
 * @unmap: when true, every range carries VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP
 *
 * One struct virtio_blk_discard_write_zeroes is allocated per discard
 * segment with GFP_ATOMIC.  The array is published through
 * req->special_vec and RQF_SPECIAL_PAYLOAD is set on the request;
 * virtblk_cleanup_cmd() frees it again.
 *
 * Returns 0 on success or -ENOMEM if the range array cannot be allocated.
 */
static int virtblk_setup_discard_write_zeroes_erase(struct request *req, bool unmap)
{
	const unsigned short nr_ranges = blk_rq_nr_discard_segments(req);
	const u32 flags = unmap ? VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP : 0;
	struct virtio_blk_discard_write_zeroes *ranges;
	unsigned short filled = 0;
	struct bio *bio;

	ranges = kmalloc_array(nr_ranges, sizeof(*ranges), GFP_ATOMIC);
	if (!ranges)
		return -ENOMEM;

	if (queue_max_discard_segments(req->q) != 1) {
		/* Each bio of the request contributes exactly one range. */
		__rq_for_each_bio(bio, req) {
			ranges[filled].sector = cpu_to_le64(bio->bi_iter.bi_sector);
			ranges[filled].num_sectors =
				cpu_to_le32(bio->bi_iter.bi_size >> SECTOR_SHIFT);
			ranges[filled].flags = cpu_to_le32(flags);
			filled++;
		}
	} else {
		/*
		 * A single max discard segment means multi-range discard isn't
		 * supported, and the block layer only runs contiguity merges
		 * as for a normal RW request.  So we can't rely on the bios
		 * for retrieving each range; describe the whole request as
		 * one range instead.
		 */
		ranges[0].sector = cpu_to_le64(blk_rq_pos(req));
		ranges[0].num_sectors = cpu_to_le32(blk_rq_sectors(req));
		ranges[0].flags = cpu_to_le32(flags);
		filled = 1;
	}

	/* The number of ranges written must match what was allocated. */
	WARN_ON_ONCE(filled != nr_ranges);

	bvec_set_virt(&req->special_vec, ranges, sizeof(*ranges) * nr_ranges);
	req->rq_flags |= RQF_SPECIAL_PAYLOAD;

	return 0;
}
|
|
|
|
|
2021-09-01 13:14:34 +00:00
|
|
|
static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr)
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypasses the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
{
|
2021-09-01 13:14:34 +00:00
|
|
|
if (blk_rq_nr_phys_segments(req))
|
|
|
|
sg_free_table_chained(&vbr->sg_table,
|
|
|
|
VIRTIO_BLK_INLINE_SG_CNT);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req,
|
|
|
|
struct virtblk_req *vbr)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (!blk_rq_nr_phys_segments(req))
|
|
|
|
return 0;
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypasses the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
|
2021-09-01 13:14:34 +00:00
|
|
|
vbr->sg_table.sgl = vbr->sg;
|
|
|
|
err = sg_alloc_table_chained(&vbr->sg_table,
|
|
|
|
blk_rq_nr_phys_segments(req),
|
|
|
|
vbr->sg_table.sgl,
|
|
|
|
VIRTIO_BLK_INLINE_SG_CNT);
|
|
|
|
if (unlikely(err))
|
|
|
|
return -ENOMEM;
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypasses the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
|
2021-09-01 13:14:34 +00:00
|
|
|
return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void virtblk_cleanup_cmd(struct request *req)
|
|
|
|
{
|
2021-08-04 09:56:27 +00:00
|
|
|
if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
|
|
|
|
kfree(bvec_virt(&req->special_vec));
|
2021-09-01 13:14:34 +00:00
|
|
|
}
|
|
|
|
|
2021-10-25 07:54:03 +00:00
|
|
|
static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
|
|
|
|
struct request *req,
|
|
|
|
struct virtblk_req *vbr)
|
2021-09-01 13:14:34 +00:00
|
|
|
{
|
2023-03-30 21:49:52 +00:00
|
|
|
size_t in_hdr_len = sizeof(vbr->in_hdr.status);
|
2021-09-01 13:14:34 +00:00
|
|
|
bool unmap = false;
|
|
|
|
u32 type;
|
2022-10-16 03:41:27 +00:00
|
|
|
u64 sector = 0;
|
2021-09-01 13:14:34 +00:00
|
|
|
|
2023-03-30 21:49:52 +00:00
|
|
|
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req)))
|
|
|
|
return BLK_STS_NOTSUPP;
|
|
|
|
|
2022-10-16 03:41:27 +00:00
|
|
|
/* Set fields for all request types */
|
|
|
|
vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
|
2021-09-01 13:14:34 +00:00
|
|
|
|
|
|
|
switch (req_op(req)) {
|
|
|
|
case REQ_OP_READ:
|
|
|
|
type = VIRTIO_BLK_T_IN;
|
2022-10-16 03:41:27 +00:00
|
|
|
sector = blk_rq_pos(req);
|
2021-09-01 13:14:34 +00:00
|
|
|
break;
|
|
|
|
case REQ_OP_WRITE:
|
|
|
|
type = VIRTIO_BLK_T_OUT;
|
2022-10-16 03:41:27 +00:00
|
|
|
sector = blk_rq_pos(req);
|
2021-09-01 13:14:34 +00:00
|
|
|
break;
|
|
|
|
case REQ_OP_FLUSH:
|
|
|
|
type = VIRTIO_BLK_T_FLUSH;
|
|
|
|
break;
|
|
|
|
case REQ_OP_DISCARD:
|
|
|
|
type = VIRTIO_BLK_T_DISCARD;
|
|
|
|
break;
|
|
|
|
case REQ_OP_WRITE_ZEROES:
|
|
|
|
type = VIRTIO_BLK_T_WRITE_ZEROES;
|
|
|
|
unmap = !(req->cmd_flags & REQ_NOUNMAP);
|
|
|
|
break;
|
2022-09-21 08:27:29 +00:00
|
|
|
case REQ_OP_SECURE_ERASE:
|
|
|
|
type = VIRTIO_BLK_T_SECURE_ERASE;
|
|
|
|
break;
|
2022-10-16 03:41:27 +00:00
|
|
|
case REQ_OP_ZONE_OPEN:
|
|
|
|
type = VIRTIO_BLK_T_ZONE_OPEN;
|
|
|
|
sector = blk_rq_pos(req);
|
|
|
|
break;
|
|
|
|
case REQ_OP_ZONE_CLOSE:
|
|
|
|
type = VIRTIO_BLK_T_ZONE_CLOSE;
|
|
|
|
sector = blk_rq_pos(req);
|
|
|
|
break;
|
|
|
|
case REQ_OP_ZONE_FINISH:
|
|
|
|
type = VIRTIO_BLK_T_ZONE_FINISH;
|
|
|
|
sector = blk_rq_pos(req);
|
2021-09-01 13:14:34 +00:00
|
|
|
break;
|
2022-10-16 03:41:27 +00:00
|
|
|
case REQ_OP_ZONE_APPEND:
|
|
|
|
type = VIRTIO_BLK_T_ZONE_APPEND;
|
|
|
|
sector = blk_rq_pos(req);
|
2023-03-30 21:49:52 +00:00
|
|
|
in_hdr_len = sizeof(vbr->in_hdr.zone_append);
|
2021-09-01 13:14:34 +00:00
|
|
|
break;
|
2022-10-16 03:41:27 +00:00
|
|
|
case REQ_OP_ZONE_RESET:
|
|
|
|
type = VIRTIO_BLK_T_ZONE_RESET;
|
|
|
|
sector = blk_rq_pos(req);
|
|
|
|
break;
|
|
|
|
case REQ_OP_ZONE_RESET_ALL:
|
|
|
|
type = VIRTIO_BLK_T_ZONE_RESET_ALL;
|
|
|
|
break;
|
|
|
|
case REQ_OP_DRV_IN:
|
2023-03-30 21:49:52 +00:00
|
|
|
/*
|
|
|
|
* Out header has already been prepared by the caller (virtblk_get_id()
|
|
|
|
* or virtblk_submit_zone_report()), nothing to do here.
|
|
|
|
*/
|
2022-10-16 03:41:27 +00:00
|
|
|
return 0;
|
2021-09-01 13:14:34 +00:00
|
|
|
default:
|
|
|
|
WARN_ON_ONCE(1);
|
|
|
|
return BLK_STS_IOERR;
|
|
|
|
}
|
|
|
|
|
2022-10-16 03:41:27 +00:00
|
|
|
/* Set fields for non-REQ_OP_DRV_IN request types */
|
|
|
|
vbr->in_hdr_len = in_hdr_len;
|
2021-09-01 13:14:34 +00:00
|
|
|
vbr->out_hdr.type = cpu_to_virtio32(vdev, type);
|
2022-10-16 03:41:27 +00:00
|
|
|
vbr->out_hdr.sector = cpu_to_virtio64(vdev, sector);
|
2021-09-01 13:14:34 +00:00
|
|
|
|
2022-09-21 08:27:29 +00:00
|
|
|
if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES ||
|
|
|
|
type == VIRTIO_BLK_T_SECURE_ERASE) {
|
|
|
|
if (virtblk_setup_discard_write_zeroes_erase(req, unmap))
|
2021-09-01 13:14:34 +00:00
|
|
|
return BLK_STS_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-03-30 21:49:52 +00:00
|
|
|
/*
|
|
|
|
* The status byte is always the last byte of the virtblk request
|
|
|
|
* in-header. This helper fetches its value for all in-header formats
|
|
|
|
* that are currently defined.
|
|
|
|
*/
|
|
|
|
static inline u8 virtblk_vbr_status(struct virtblk_req *vbr)
|
|
|
|
{
|
|
|
|
return *((u8 *)&vbr->in_hdr + vbr->in_hdr_len - 1);
|
|
|
|
}
|
|
|
|
|
2021-09-01 13:14:34 +00:00
|
|
|
static inline void virtblk_request_done(struct request *req)
|
|
|
|
{
|
|
|
|
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
|
2023-03-30 21:49:52 +00:00
|
|
|
blk_status_t status = virtblk_result(virtblk_vbr_status(vbr));
|
|
|
|
struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
|
2021-09-01 13:14:34 +00:00
|
|
|
|
|
|
|
virtblk_unmap_data(req, vbr);
|
|
|
|
virtblk_cleanup_cmd(req);
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
if (req_op(req) == REQ_OP_ZONE_APPEND)
|
2023-03-30 21:49:52 +00:00
|
|
|
req->__sector = virtio64_to_cpu(vblk->vdev,
|
|
|
|
vbr->in_hdr.zone_append.sector);
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
blk_mq_end_request(req, status);
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypasses the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void virtblk_done(struct virtqueue *vq)
|
2007-10-22 01:03:38 +00:00
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = vq->vdev->priv;
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch() call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
bool req_done = false;
|
|
|
|
int qid = vq->index;
|
|
|
|
struct virtblk_req *vbr;
|
2007-10-22 01:03:38 +00:00
|
|
|
unsigned long flags;
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch() call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
unsigned int len;
|
2007-10-22 01:03:38 +00:00
|
|
|
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch() call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
|
virtio-blk: Disable callback in virtblk_done()
This reduces unnecessary interrupts that host could send to guest while
guest is in the process of irq handling.
If one vcpu is handling the irq, while another interrupt comes, in
handle_edge_irq(), the guest will mask the interrupt via mask_msi_irq()
which is a very heavy operation that goes all the way down to host.
Here are some performance numbers on qemu:
Before:
-------------------------------------
seq-read : io=0 B, bw=269730KB/s, iops=67432 , runt= 62200msec
seq-write : io=0 B, bw=339716KB/s, iops=84929 , runt= 49386msec
rand-read : io=0 B, bw=270435KB/s, iops=67608 , runt= 62038msec
rand-write: io=0 B, bw=354436KB/s, iops=88608 , runt= 47335msec
clat (usec): min=101 , max=138052 , avg=14822.09, stdev=11771.01
clat (usec): min=96 , max=81543 , avg=11798.94, stdev=7735.60
clat (usec): min=128 , max=140043 , avg=14835.85, stdev=11765.33
clat (usec): min=109 , max=147207 , avg=11337.09, stdev=5990.35
cpu : usr=15.93%, sys=60.37%, ctx=7764972, majf=0, minf=54
cpu : usr=32.73%, sys=120.49%, ctx=7372945, majf=0, minf=1
cpu : usr=18.84%, sys=58.18%, ctx=7775420, majf=0, minf=1
cpu : usr=24.20%, sys=59.85%, ctx=8307886, majf=0, minf=0
vdb: ios=8389107/8368136, merge=0/0, ticks=19457874/14616506,
in_queue=34206098, util=99.68%
43: interrupt in total: 887320
fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting
--ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16
--ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0
--filename=/dev/vdb --name=seq-read --stonewall --rw=read
--name=seq-write --stonewall --rw=write --name=rnd-read --stonewall
--rw=randread --name=rnd-write --stonewall --rw=randwrite
After:
-------------------------------------
seq-read : io=0 B, bw=309503KB/s, iops=77375 , runt= 54207msec
seq-write : io=0 B, bw=448205KB/s, iops=112051 , runt= 37432msec
rand-read : io=0 B, bw=311254KB/s, iops=77813 , runt= 53902msec
rand-write: io=0 B, bw=377152KB/s, iops=94287 , runt= 44484msec
clat (usec): min=81 , max=90588 , avg=12946.06, stdev=9085.94
clat (usec): min=57 , max=72264 , avg=8967.97, stdev=5951.04
clat (usec): min=29 , max=101046 , avg=12889.95, stdev=9067.91
clat (usec): min=52 , max=106152 , avg=10660.56, stdev=4778.19
cpu : usr=15.05%, sys=57.92%, ctx=7710941, majf=0, minf=54
cpu : usr=26.78%, sys=101.40%, ctx=7387891, majf=0, minf=2
cpu : usr=19.03%, sys=58.17%, ctx=7681976, majf=0, minf=8
cpu : usr=24.65%, sys=58.34%, ctx=8442632, majf=0, minf=4
vdb: ios=8389086/8361888, merge=0/0, ticks=17243780/12742010,
in_queue=30078377, util=99.59%
43: interrupt in total: 1259639
fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting
--ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16
--ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0
--filename=/dev/vdb --name=seq-read --stonewall --rw=read
--name=seq-write --stonewall --rw=write --name=rnd-read --stonewall
--rw=randread --name=rnd-write --stonewall --rw=randwrite
Signed-off-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-09-25 02:36:17 +00:00
|
|
|
do {
|
|
|
|
virtqueue_disable_cb(vq);
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch() call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
|
|
|
|
struct request *req = blk_mq_rq_from_pdu(vbr);
|
2017-01-28 08:32:52 +00:00
|
|
|
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch() call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
if (likely(!blk_should_fake_timeout(req->q)))
|
|
|
|
blk_mq_complete_request(req);
|
|
|
|
req_done = true;
|
|
|
|
}
|
virtio-blk: Disable callback in virtblk_done()
This reduces unnecessary interrupts that host could send to guest while
guest is in the process of irq handling.
If one vcpu is handling the irq, while another interrupt comes, in
handle_edge_irq(), the guest will mask the interrupt via mask_msi_irq()
which is a very heavy operation that goes all the way down to host.
Here are some performance numbers on qemu:
Before:
-------------------------------------
seq-read : io=0 B, bw=269730KB/s, iops=67432 , runt= 62200msec
seq-write : io=0 B, bw=339716KB/s, iops=84929 , runt= 49386msec
rand-read : io=0 B, bw=270435KB/s, iops=67608 , runt= 62038msec
rand-write: io=0 B, bw=354436KB/s, iops=88608 , runt= 47335msec
clat (usec): min=101 , max=138052 , avg=14822.09, stdev=11771.01
clat (usec): min=96 , max=81543 , avg=11798.94, stdev=7735.60
clat (usec): min=128 , max=140043 , avg=14835.85, stdev=11765.33
clat (usec): min=109 , max=147207 , avg=11337.09, stdev=5990.35
cpu : usr=15.93%, sys=60.37%, ctx=7764972, majf=0, minf=54
cpu : usr=32.73%, sys=120.49%, ctx=7372945, majf=0, minf=1
cpu : usr=18.84%, sys=58.18%, ctx=7775420, majf=0, minf=1
cpu : usr=24.20%, sys=59.85%, ctx=8307886, majf=0, minf=0
vdb: ios=8389107/8368136, merge=0/0, ticks=19457874/14616506,
in_queue=34206098, util=99.68%
43: interrupt in total: 887320
fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting
--ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16
--ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0
--filename=/dev/vdb --name=seq-read --stonewall --rw=read
--name=seq-write --stonewall --rw=write --name=rnd-read --stonewall
--rw=randread --name=rnd-write --stonewall --rw=randwrite
After:
-------------------------------------
seq-read : io=0 B, bw=309503KB/s, iops=77375 , runt= 54207msec
seq-write : io=0 B, bw=448205KB/s, iops=112051 , runt= 37432msec
rand-read : io=0 B, bw=311254KB/s, iops=77813 , runt= 53902msec
rand-write: io=0 B, bw=377152KB/s, iops=94287 , runt= 44484msec
clat (usec): min=81 , max=90588 , avg=12946.06, stdev=9085.94
clat (usec): min=57 , max=72264 , avg=8967.97, stdev=5951.04
clat (usec): min=29 , max=101046 , avg=12889.95, stdev=9067.91
clat (usec): min=52 , max=106152 , avg=10660.56, stdev=4778.19
cpu : usr=15.05%, sys=57.92%, ctx=7710941, majf=0, minf=54
cpu : usr=26.78%, sys=101.40%, ctx=7387891, majf=0, minf=2
cpu : usr=19.03%, sys=58.17%, ctx=7681976, majf=0, minf=8
cpu : usr=24.65%, sys=58.34%, ctx=8442632, majf=0, minf=4
vdb: ios=8389086/8361888, merge=0/0, ticks=17243780/12742010,
in_queue=30078377, util=99.59%
43: interrupt in total: 1259639
fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting
--ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16
--ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0
--filename=/dev/vdb --name=seq-read --stonewall --rw=read
--name=seq-write --stonewall --rw=write --name=rnd-read --stonewall
--rw=randread --name=rnd-write --stonewall --rw=randwrite
Signed-off-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-09-25 02:36:17 +00:00
|
|
|
} while (!virtqueue_enable_cb(vq));
|
2013-11-01 16:52:52 +00:00
|
|
|
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch() call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
/* In case queue is stopped waiting for more buffers. */
|
|
|
|
if (req_done)
|
2014-04-16 07:44:54 +00:00
|
|
|
blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch() call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypass the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
}
|
|
|
|
|
2018-11-26 18:00:12 +00:00
|
|
|
static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = hctx->queue->queuedata;
|
|
|
|
struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
|
|
|
|
bool kick;
|
|
|
|
|
|
|
|
spin_lock_irq(&vq->lock);
|
|
|
|
kick = virtqueue_kick_prepare(vq->vq);
|
|
|
|
spin_unlock_irq(&vq->lock);
|
|
|
|
|
|
|
|
if (kick)
|
|
|
|
virtqueue_notify(vq->vq);
|
|
|
|
}
|
|
|
|
|
2022-10-16 03:41:26 +00:00
|
|
|
static blk_status_t virtblk_fail_to_queue(struct request *req, int rc)
|
|
|
|
{
|
|
|
|
virtblk_cleanup_cmd(req);
|
|
|
|
switch (rc) {
|
|
|
|
case -ENOSPC:
|
|
|
|
return BLK_STS_DEV_RESOURCE;
|
|
|
|
case -ENOMEM:
|
|
|
|
return BLK_STS_RESOURCE;
|
|
|
|
default:
|
|
|
|
return BLK_STS_IOERR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx,
|
|
|
|
struct virtio_blk *vblk,
|
|
|
|
struct request *req,
|
|
|
|
struct virtblk_req *vbr)
|
|
|
|
{
|
|
|
|
blk_status_t status;
|
2022-10-21 20:41:26 +00:00
|
|
|
int num;
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
|
|
|
|
status = virtblk_setup_cmd(vblk->vdev, req, vbr);
|
|
|
|
if (unlikely(status))
|
|
|
|
return status;
|
|
|
|
|
2022-10-21 20:41:26 +00:00
|
|
|
num = virtblk_map_data(hctx, req, vbr);
|
|
|
|
if (unlikely(num < 0))
|
2022-10-16 03:41:26 +00:00
|
|
|
return virtblk_fail_to_queue(req, -ENOMEM);
|
2022-10-21 20:41:26 +00:00
|
|
|
vbr->sg_table.nents = num;
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
|
2022-08-30 15:01:53 +00:00
|
|
|
blk_mq_start_request(req);
|
|
|
|
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
return BLK_STS_OK;
|
|
|
|
}
|
|
|
|
|
2017-06-03 07:38:05 +00:00
|
|
|
static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
|
2014-10-29 17:14:52 +00:00
|
|
|
const struct blk_mq_queue_data *bd)
|
2007-10-22 01:03:38 +00:00
|
|
|
{
|
2013-11-01 16:52:52 +00:00
|
|
|
struct virtio_blk *vblk = hctx->queue->queuedata;
|
2014-10-29 17:14:52 +00:00
|
|
|
struct request *req = bd->rq;
|
2014-04-14 08:30:07 +00:00
|
|
|
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
|
2013-11-01 16:52:52 +00:00
|
|
|
unsigned long flags;
|
2014-06-26 09:41:48 +00:00
|
|
|
int qid = hctx->queue_num;
|
2014-05-30 02:49:29 +00:00
|
|
|
bool notify = false;
|
2021-10-25 07:54:03 +00:00
|
|
|
blk_status_t status;
|
|
|
|
int err;
|
2007-10-22 01:03:38 +00:00
|
|
|
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
status = virtblk_prep_rq(hctx, vblk, req, vbr);
|
2021-10-25 07:54:03 +00:00
|
|
|
if (unlikely(status))
|
|
|
|
return status;
|
2017-01-31 15:57:31 +00:00
|
|
|
|
2014-06-26 09:41:48 +00:00
|
|
|
spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
err = virtblk_add_req(vblk->vqs[qid].vq, vbr);
|
2014-03-13 00:53:39 +00:00
|
|
|
if (err) {
|
2014-06-26 09:41:48 +00:00
|
|
|
virtqueue_kick(vblk->vqs[qid].vq);
|
2020-02-13 12:37:27 +00:00
|
|
|
/* Don't stop the queue if -ENOMEM: we may have failed to
|
|
|
|
* bounce the buffer due to global resource outage.
|
|
|
|
*/
|
|
|
|
if (err == -ENOSPC)
|
|
|
|
blk_mq_stop_hw_queue(hctx);
|
2014-06-26 09:41:48 +00:00
|
|
|
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
|
2021-09-01 13:14:34 +00:00
|
|
|
virtblk_unmap_data(req, vbr);
|
2022-10-16 03:41:26 +00:00
|
|
|
return virtblk_fail_to_queue(req, err);
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypass the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
}
|
|
|
|
|
2014-10-29 17:14:52 +00:00
|
|
|
if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
|
2014-05-30 02:49:29 +00:00
|
|
|
notify = true;
|
2014-06-26 09:41:48 +00:00
|
|
|
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
|
2014-05-30 02:49:29 +00:00
|
|
|
|
|
|
|
if (notify)
|
2014-06-26 09:41:48 +00:00
|
|
|
virtqueue_notify(vblk->vqs[qid].vq);
|
2017-06-03 07:38:05 +00:00
|
|
|
return BLK_STS_OK;
|
virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.
Compared to the request-based IO path, the bio-based IO path uses the driver
provided ->make_request_fn() method to bypass the IO scheduler. It
hands the bio to the device directly without allocating a request in the block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.
When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.
Using a slow device e.g. normal SATA disk, the bio-based IO path for
sequential read and write are slower than req-based IO path due to lack
of merge in guest kernel. So we make the bio-based path optional.
Performance evaluation:
-----------------------------
1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 28%, 24%, 21%, 16%
Latency improvement: 32%, 17%, 21%, 16%
Long version:
With bio-based IO path:
seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec
clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
With request-based IO path:
seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985
2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using
kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : 11%, 11%, 13%, 10%
Latency improvement: 10%, 10%, 12%, 10%
Long Version:
With bio-based IO path:
read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
With request-based IO path:
read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409
3) Fio test is performed in a 8 vcpu guest with normal SATA based guest
using kvm tool.
Short version:
With bio-based IO path, sequential read/write, random read/write
IOPS boost : -10%, -10%, 4.4%, 0.5%
Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
With bio-based IO path:
read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec
write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec
read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0
With request-based IO path:
read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec
write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec
read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0
How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 08:07:04 +00:00
|
|
|
}
|
|
|
|
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
static bool virtblk_prep_rq_batch(struct request *req)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
|
|
|
|
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
|
|
|
|
|
|
|
|
return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool virtblk_add_req_batch(struct virtio_blk_vq *vq,
|
2022-08-30 15:01:53 +00:00
|
|
|
struct request **rqlist)
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
int err;
|
|
|
|
bool kick;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&vq->lock, flags);
|
|
|
|
|
|
|
|
while (!rq_list_empty(*rqlist)) {
|
|
|
|
struct request *req = rq_list_pop(rqlist);
|
|
|
|
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
|
|
|
|
|
|
|
|
err = virtblk_add_req(vq->vq, vbr);
|
|
|
|
if (err) {
|
|
|
|
virtblk_unmap_data(req, vbr);
|
|
|
|
virtblk_cleanup_cmd(req);
|
2022-08-30 15:01:53 +00:00
|
|
|
blk_mq_requeue_request(req, true);
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
kick = virtqueue_kick_prepare(vq->vq);
|
|
|
|
spin_unlock_irqrestore(&vq->lock, flags);
|
|
|
|
|
|
|
|
return kick;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void virtio_queue_rqs(struct request **rqlist)
|
|
|
|
{
|
|
|
|
struct request *req, *next, *prev = NULL;
|
|
|
|
struct request *requeue_list = NULL;
|
|
|
|
|
|
|
|
rq_list_for_each_safe(rqlist, req, next) {
|
2022-08-10 16:09:48 +00:00
|
|
|
struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx);
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
bool kick;
|
|
|
|
|
|
|
|
if (!virtblk_prep_rq_batch(req)) {
|
|
|
|
rq_list_move(rqlist, &requeue_list, req, prev);
|
|
|
|
req = prev;
|
|
|
|
if (!req)
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!next || req->mq_hctx != next->mq_hctx) {
|
|
|
|
req->rq_next = NULL;
|
2022-08-30 15:01:53 +00:00
|
|
|
kick = virtblk_add_req_batch(vq, rqlist);
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
if (kick)
|
|
|
|
virtqueue_notify(vq->vq);
|
|
|
|
|
|
|
|
*rqlist = next;
|
|
|
|
prev = NULL;
|
|
|
|
} else
|
|
|
|
prev = req;
|
|
|
|
}
|
|
|
|
|
|
|
|
*rqlist = requeue_list;
|
|
|
|
}
|
|
|
|
|
2022-10-16 03:41:27 +00:00
|
|
|
#ifdef CONFIG_BLK_DEV_ZONED
|
|
|
|
static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk,
|
|
|
|
unsigned int nr_zones,
|
|
|
|
size_t *buflen)
|
|
|
|
{
|
|
|
|
struct request_queue *q = vblk->disk->queue;
|
|
|
|
size_t bufsize;
|
|
|
|
void *buf;
|
|
|
|
|
|
|
|
nr_zones = min_t(unsigned int, nr_zones,
|
2023-03-30 21:49:52 +00:00
|
|
|
get_capacity(vblk->disk) >> ilog2(vblk->zone_sectors));
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
bufsize = sizeof(struct virtio_blk_zone_report) +
|
|
|
|
nr_zones * sizeof(struct virtio_blk_zone_descriptor);
|
|
|
|
bufsize = min_t(size_t, bufsize,
|
|
|
|
queue_max_hw_sectors(q) << SECTOR_SHIFT);
|
|
|
|
bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
|
|
|
|
|
|
|
|
while (bufsize >= sizeof(struct virtio_blk_zone_report)) {
|
|
|
|
buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
|
|
|
|
if (buf) {
|
|
|
|
*buflen = bufsize;
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
bufsize >>= 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int virtblk_submit_zone_report(struct virtio_blk *vblk,
|
|
|
|
char *report_buf, size_t report_len,
|
|
|
|
sector_t sector)
|
|
|
|
{
|
|
|
|
struct request_queue *q = vblk->disk->queue;
|
|
|
|
struct request *req;
|
|
|
|
struct virtblk_req *vbr;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
|
|
|
|
if (IS_ERR(req))
|
|
|
|
return PTR_ERR(req);
|
|
|
|
|
|
|
|
vbr = blk_mq_rq_to_pdu(req);
|
2023-03-30 21:49:52 +00:00
|
|
|
vbr->in_hdr_len = sizeof(vbr->in_hdr.status);
|
2022-10-16 03:41:27 +00:00
|
|
|
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
|
|
|
|
vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
|
|
|
|
|
|
|
|
err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL);
|
|
|
|
if (err)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
blk_execute_rq(req, false);
|
2023-03-30 21:49:52 +00:00
|
|
|
err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status));
|
2022-10-16 03:41:27 +00:00
|
|
|
out:
|
|
|
|
blk_mq_free_request(req);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Translate one virtio zone descriptor into a struct blk_zone and pass it
 * to the report callback.
 *
 * @vblk:  device the descriptor came from
 * @entry: descriptor as returned by the device
 * @idx:   zone index handed to @cb
 * @cb:    report_zones callback
 * @data:  opaque argument for @cb
 *
 * The zone length is the device zone size, shortened for a zone that would
 * otherwise extend past the disk capacity.  Returns -EIO for an unknown
 * zone type or state, otherwise whatever @cb returns.
 */
static int virtblk_parse_zone(struct virtio_blk *vblk,
			       struct virtio_blk_zone_descriptor *entry,
			       unsigned int idx, report_zones_cb cb, void *data)
{
	sector_t disk_cap = get_capacity(vblk->disk);
	struct blk_zone zone = { };

	zone.start = virtio64_to_cpu(vblk->vdev, entry->z_start);
	if (zone.start + vblk->zone_sectors > disk_cap)
		zone.len = disk_cap - zone.start;
	else
		zone.len = vblk->zone_sectors;
	zone.capacity = virtio64_to_cpu(vblk->vdev, entry->z_cap);
	zone.wp = virtio64_to_cpu(vblk->vdev, entry->z_wp);

	switch (entry->z_type) {
	case VIRTIO_BLK_ZT_CONV:
		zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
		break;
	case VIRTIO_BLK_ZT_SWR:
		zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
		break;
	case VIRTIO_BLK_ZT_SWP:
		zone.type = BLK_ZONE_TYPE_SEQWRITE_PREF;
		break;
	default:
		dev_err(&vblk->vdev->dev, "zone %llu: invalid type %#x\n",
			zone.start, entry->z_type);
		return -EIO;
	}

	/* Some states override the write pointer read from the descriptor. */
	switch (entry->z_state) {
	case VIRTIO_BLK_ZS_NOT_WP:
		zone.cond = BLK_ZONE_COND_NOT_WP;
		break;
	case VIRTIO_BLK_ZS_EMPTY:
		zone.cond = BLK_ZONE_COND_EMPTY;
		break;
	case VIRTIO_BLK_ZS_IOPEN:
		zone.cond = BLK_ZONE_COND_IMP_OPEN;
		break;
	case VIRTIO_BLK_ZS_EOPEN:
		zone.cond = BLK_ZONE_COND_EXP_OPEN;
		break;
	case VIRTIO_BLK_ZS_CLOSED:
		zone.cond = BLK_ZONE_COND_CLOSED;
		break;
	case VIRTIO_BLK_ZS_FULL:
		zone.cond = BLK_ZONE_COND_FULL;
		zone.wp = zone.start + zone.len;
		break;
	case VIRTIO_BLK_ZS_RDONLY:
		zone.cond = BLK_ZONE_COND_READONLY;
		zone.wp = ULONG_MAX;
		break;
	case VIRTIO_BLK_ZS_OFFLINE:
		zone.cond = BLK_ZONE_COND_OFFLINE;
		zone.wp = ULONG_MAX;
		break;
	default:
		dev_err(&vblk->vdev->dev, "zone %llu: invalid condition %#x\n",
			zone.start, entry->z_state);
		return -EIO;
	}

	/*
	 * The callback below checks the validity of the reported
	 * entry data, no need to further validate it here.
	 */
	return cb(&zone, idx, data);
}
|
|
|
|
|
|
|
|
/*
 * report_zones handler for the disk.
 *
 * @disk:     disk to report on
 * @sector:   sector to start reporting from
 * @nr_zones: maximum number of zones to report
 * @cb:       callback invoked once per reported zone
 * @data:     opaque argument for @cb
 *
 * Repeatedly asks the device for a zone report, starting at @sector and
 * advancing past each zone handed to @cb, until @nr_zones zones were
 * reported, the end of the disk is reached, or the device returns an empty
 * report.  The loop runs with vblk->vdev_mutex held.
 *
 * Returns the number of zones reported, -EINVAL if none were, -EOPNOTSUPP
 * if no zone size is set, -ENOMEM if no report buffer could be allocated,
 * -ENXIO if vblk->vdev is no longer set, or the error from the report
 * request or from @cb.
 */
static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
				 unsigned int nr_zones, report_zones_cb cb,
				 void *data)
{
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_blk_zone_report *report;
	unsigned int zone_idx = 0;
	unsigned long long nz, i;
	size_t buflen;
	int ret;

	if (WARN_ON_ONCE(!vblk->zone_sectors))
		return -EOPNOTSUPP;

	report = virtblk_alloc_report_buffer(vblk, nr_zones, &buflen);
	if (!report)
		return -ENOMEM;

	mutex_lock(&vblk->vdev_mutex);

	if (!vblk->vdev) {
		ret = -ENXIO;
		goto out_unlock;
	}

	while (zone_idx < nr_zones && sector < get_capacity(vblk->disk)) {
		memset(report, 0, buflen);

		ret = virtblk_submit_zone_report(vblk, (char *)report,
						 buflen, sector);
		if (ret)
			goto out_unlock;

		nz = min_t(u64, virtio64_to_cpu(vblk->vdev, report->nr_zones),
			   nr_zones);
		if (!nz)
			break;

		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
			struct virtio_blk_zone_descriptor *desc =
				&report->zones[i];

			ret = virtblk_parse_zone(vblk, desc, zone_idx, cb, data);
			if (ret)
				goto out_unlock;

			/* The next report starts right after this zone. */
			sector = virtio64_to_cpu(vblk->vdev, desc->z_start) +
				 vblk->zone_sectors;
			zone_idx++;
		}
	}

	if (!zone_idx)
		ret = -EINVAL;
	else
		ret = zone_idx;
out_unlock:
	mutex_unlock(&vblk->vdev_mutex);
	kvfree(report);
	return ret;
}
|
|
|
|
|
2024-02-13 07:34:22 +00:00
|
|
|
static int virtblk_read_zoned_limits(struct virtio_blk *vblk,
|
|
|
|
struct queue_limits *lim)
|
2022-10-16 03:41:27 +00:00
|
|
|
{
|
2024-02-13 07:34:22 +00:00
|
|
|
struct virtio_device *vdev = vblk->vdev;
|
2023-03-30 21:49:52 +00:00
|
|
|
u32 v, wg;
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
|
|
|
|
|
2024-07-04 05:28:15 +00:00
|
|
|
lim->features |= BLK_FEAT_ZONED;
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
|
|
|
zoned.max_open_zones, &v);
|
2024-03-01 19:26:37 +00:00
|
|
|
lim->max_open_zones = v;
|
2023-03-30 21:49:52 +00:00
|
|
|
dev_dbg(&vdev->dev, "max open zones = %u\n", v);
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
|
|
|
zoned.max_active_zones, &v);
|
2024-03-01 19:26:37 +00:00
|
|
|
lim->max_active_zones = v;
|
2023-03-30 21:49:52 +00:00
|
|
|
dev_dbg(&vdev->dev, "max active zones = %u\n", v);
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
2023-03-30 21:49:52 +00:00
|
|
|
zoned.write_granularity, &wg);
|
|
|
|
if (!wg) {
|
2022-10-16 03:41:27 +00:00
|
|
|
dev_warn(&vdev->dev, "zero write granularity reported\n");
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->physical_block_size = wg;
|
|
|
|
lim->io_min = wg;
|
2022-10-16 03:41:27 +00:00
|
|
|
|
2023-03-30 21:49:52 +00:00
|
|
|
dev_dbg(&vdev->dev, "write granularity = %u\n", wg);
|
2022-10-16 03:41:27 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* virtio ZBD specification doesn't require zones to be a power of
|
|
|
|
* two sectors in size, but the code in this driver expects that.
|
|
|
|
*/
|
2023-03-30 21:49:52 +00:00
|
|
|
virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors,
|
|
|
|
&vblk->zone_sectors);
|
2022-10-16 03:41:27 +00:00
|
|
|
if (vblk->zone_sectors == 0 || !is_power_of_2(vblk->zone_sectors)) {
|
|
|
|
dev_err(&vdev->dev,
|
|
|
|
"zoned device with non power of two zone size %u\n",
|
|
|
|
vblk->zone_sectors);
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->chunk_sectors = vblk->zone_sectors;
|
2022-10-16 03:41:27 +00:00
|
|
|
dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors);
|
|
|
|
|
|
|
|
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
|
|
|
|
dev_warn(&vblk->vdev->dev,
|
|
|
|
"ignoring negotiated F_DISCARD for zoned device\n");
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->max_hw_discard_sectors = 0;
|
2022-10-16 03:41:27 +00:00
|
|
|
}
|
|
|
|
|
2023-07-03 02:48:11 +00:00
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
|
|
|
zoned.max_append_sectors, &v);
|
|
|
|
if (!v) {
|
|
|
|
dev_warn(&vdev->dev, "zero max_append_sectors reported\n");
|
|
|
|
return -ENODEV;
|
2022-10-16 03:41:27 +00:00
|
|
|
}
|
2023-07-03 02:48:11 +00:00
|
|
|
if ((v << SECTOR_SHIFT) < wg) {
|
|
|
|
dev_err(&vdev->dev,
|
|
|
|
"write granularity %u exceeds max_append_sectors %u limit\n",
|
|
|
|
wg, v);
|
|
|
|
return -ENODEV;
|
|
|
|
}
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->max_zone_append_sectors = v;
|
2023-07-03 02:48:11 +00:00
|
|
|
dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
|
2022-10-16 03:41:27 +00:00
|
|
|
|
2024-02-13 07:34:22 +00:00
|
|
|
return 0;
|
2022-10-16 03:41:27 +00:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
2024-02-13 07:34:22 +00:00
|
|
|
* Zoned block device support is not configured in this kernel, host-managed
|
|
|
|
* zoned devices can't be supported.
|
2022-10-16 03:41:27 +00:00
|
|
|
*/
|
|
|
|
#define virtblk_report_zones NULL
|
2024-02-13 07:34:22 +00:00
|
|
|
static inline int virtblk_read_zoned_limits(struct virtio_blk *vblk,
|
|
|
|
struct queue_limits *lim)
|
2022-10-16 03:41:27 +00:00
|
|
|
{
|
2024-02-13 07:34:22 +00:00
|
|
|
dev_err(&vblk->vdev->dev,
|
2023-12-17 16:53:55 +00:00
|
|
|
"virtio_blk: zoned devices are not supported");
|
|
|
|
return -EOPNOTSUPP;
|
2022-10-16 03:41:27 +00:00
|
|
|
}
|
|
|
|
#endif /* CONFIG_BLK_DEV_ZONED */
|
|
|
|
|
2010-03-25 05:33:33 +00:00
|
|
|
/* return id (s/n) string for *disk to *id_str
|
|
|
|
*/
|
|
|
|
static int virtblk_get_id(struct gendisk *disk, char *id_str)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = disk->private_data;
|
2016-07-19 09:31:49 +00:00
|
|
|
struct request_queue *q = vblk->disk->queue;
|
2010-03-25 05:33:33 +00:00
|
|
|
struct request *req;
|
2022-10-16 03:41:27 +00:00
|
|
|
struct virtblk_req *vbr;
|
2010-10-09 01:42:13 +00:00
|
|
|
int err;
|
2010-03-25 05:33:33 +00:00
|
|
|
|
2021-10-25 07:05:07 +00:00
|
|
|
req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
|
2016-07-19 09:31:49 +00:00
|
|
|
if (IS_ERR(req))
|
2010-03-25 05:33:33 +00:00
|
|
|
return PTR_ERR(req);
|
2016-07-19 09:31:49 +00:00
|
|
|
|
2022-10-16 03:41:27 +00:00
|
|
|
vbr = blk_mq_rq_to_pdu(req);
|
2023-03-30 21:49:52 +00:00
|
|
|
vbr->in_hdr_len = sizeof(vbr->in_hdr.status);
|
2022-10-16 03:41:27 +00:00
|
|
|
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
|
|
|
|
vbr->out_hdr.sector = 0;
|
|
|
|
|
2016-07-19 09:31:49 +00:00
|
|
|
err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
|
|
|
|
if (err)
|
|
|
|
goto out;
|
|
|
|
|
2021-11-26 12:18:01 +00:00
|
|
|
blk_execute_rq(req, false);
|
2023-03-30 21:49:52 +00:00
|
|
|
err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status));
|
2016-07-19 09:31:49 +00:00
|
|
|
out:
|
2021-10-25 07:05:07 +00:00
|
|
|
blk_mq_free_request(req);
|
2010-10-09 01:42:13 +00:00
|
|
|
return err;
|
2010-03-25 05:33:33 +00:00
|
|
|
}
|
|
|
|
|
2008-01-23 16:56:50 +00:00
|
|
|
/* We provide getgeo only to please some old bootloader/partitioning tools */
|
|
|
|
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
|
|
|
|
{
|
2008-04-16 18:56:37 +00:00
|
|
|
struct virtio_blk *vblk = bd->bd_disk->private_data;
|
2020-04-30 14:04:42 +00:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
mutex_lock(&vblk->vdev_mutex);
|
|
|
|
|
|
|
|
if (!vblk->vdev) {
|
|
|
|
ret = -ENXIO;
|
|
|
|
goto out;
|
|
|
|
}
|
2008-04-16 18:56:37 +00:00
|
|
|
|
|
|
|
/* see if the host passed in geometry config */
|
2013-10-14 07:41:51 +00:00
|
|
|
if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
|
|
|
|
virtio_cread(vblk->vdev, struct virtio_blk_config,
|
|
|
|
geometry.cylinders, &geo->cylinders);
|
|
|
|
virtio_cread(vblk->vdev, struct virtio_blk_config,
|
|
|
|
geometry.heads, &geo->heads);
|
|
|
|
virtio_cread(vblk->vdev, struct virtio_blk_config,
|
|
|
|
geometry.sectors, &geo->sectors);
|
2008-04-16 18:56:37 +00:00
|
|
|
} else {
|
|
|
|
/* some standard values, similar to sd */
|
|
|
|
geo->heads = 1 << 6;
|
|
|
|
geo->sectors = 1 << 5;
|
|
|
|
geo->cylinders = get_capacity(bd->bd_disk) >> 11;
|
|
|
|
}
|
2020-04-30 14:04:42 +00:00
|
|
|
out:
|
|
|
|
mutex_unlock(&vblk->vdev_mutex);
|
|
|
|
return ret;
|
2008-01-23 16:56:50 +00:00
|
|
|
}
|
|
|
|
|
2022-02-15 09:45:14 +00:00
|
|
|
static void virtblk_free_disk(struct gendisk *disk)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = disk->private_data;
|
|
|
|
|
2022-11-30 12:30:03 +00:00
|
|
|
ida_free(&vd_index_ida, vblk->index);
|
2022-02-15 09:45:14 +00:00
|
|
|
mutex_destroy(&vblk->vdev_mutex);
|
|
|
|
kfree(vblk);
|
|
|
|
}
|
|
|
|
|
2009-09-22 00:01:13 +00:00
|
|
|
static const struct block_device_operations virtblk_fops = {
|
2022-02-15 09:45:14 +00:00
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.getgeo = virtblk_getgeo,
|
|
|
|
.free_disk = virtblk_free_disk,
|
2022-10-16 03:41:27 +00:00
|
|
|
.report_zones = virtblk_report_zones,
|
2007-10-22 01:03:38 +00:00
|
|
|
};
|
|
|
|
|
2008-02-01 08:05:00 +00:00
|
|
|
static int index_to_minor(int index)
|
|
|
|
{
|
|
|
|
return index << PART_BITS;
|
|
|
|
}
|
|
|
|
|
2011-10-30 19:29:59 +00:00
|
|
|
static int minor_to_index(int minor)
|
|
|
|
{
|
|
|
|
return minor >> PART_BITS;
|
|
|
|
}
|
|
|
|
|
2018-09-28 06:17:23 +00:00
|
|
|
static ssize_t serial_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
2010-06-24 03:19:57 +00:00
|
|
|
{
|
|
|
|
struct gendisk *disk = dev_to_disk(dev);
|
|
|
|
int err;
|
|
|
|
|
|
|
|
/* sysfs gives us a PAGE_SIZE buffer */
|
|
|
|
BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);
|
|
|
|
|
|
|
|
buf[VIRTIO_BLK_ID_BYTES] = '\0';
|
|
|
|
err = virtblk_get_id(disk, buf);
|
|
|
|
if (!err)
|
|
|
|
return strlen(buf);
|
|
|
|
|
|
|
|
if (err == -EIO) /* Unsupported? Make it empty. */
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
2014-10-23 13:08:44 +00:00
|
|
|
|
2018-09-28 06:17:23 +00:00
|
|
|
static DEVICE_ATTR_RO(serial);
|
2010-06-24 03:19:57 +00:00
|
|
|
|
2018-01-03 16:03:39 +00:00
|
|
|
/* The queue's logical block size must be set before calling this */
|
|
|
|
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
|
2011-02-01 20:43:48 +00:00
|
|
|
{
|
|
|
|
struct virtio_device *vdev = vblk->vdev;
|
|
|
|
struct request_queue *q = vblk->disk->queue;
|
|
|
|
char cap_str_2[10], cap_str_10[10];
|
2017-07-26 14:32:23 +00:00
|
|
|
unsigned long long nblocks;
|
sd, mmc, virtio_blk, string_helpers: fix block size units
The current string_get_size() overflows when the device size goes over
2^64 bytes because the string helper routine computes the suffix from
the size in bytes. However, the entirety of Linux thinks in terms of
blocks, not bytes, so this will artificially induce an overflow on very
large devices. Fix this by making the function string_get_size() take
blocks and the block size instead of bytes. This should allow us to
keep working until the current SCSI standard overflows.
Also fix virtio_blk and mmc (both of which were also artificially
multiplying by the block size to pass a byte side to string_get_size()).
The mathematics of this is pretty simple: we're taking a product of
size in blocks (S) and block size (B) and trying to re-express this in
exponential form: S*B = R*N^E (where N, the exponent is either 1000 or
1024) and R < N. Mathematically, S = RS*N^ES and B=RB*N^EB, so if RS*RB
< N it's easy to see that S*B = RS*RB*N^(ES+EB). However, if RS*BS > N,
we can see that this can be re-expressed as RS*BS = R*N (where R =
RS*BS/N < N) so the whole exponent becomes R*N^(ES+EB+1)
[jejb: fix incorrect 32 bit do_div spotted by kbuild test robot <fengguang.wu@intel.com>]
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: James Bottomley <JBottomley@Odin.com>
2015-03-06 02:47:01 +00:00
|
|
|
u64 capacity;
|
2011-02-01 20:43:48 +00:00
|
|
|
|
|
|
|
/* Host must always specify the capacity. */
|
2013-10-14 07:41:51 +00:00
|
|
|
virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
|
2011-02-01 20:43:48 +00:00
|
|
|
|
2017-07-26 14:32:23 +00:00
|
|
|
nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);
|
|
|
|
|
|
|
|
string_get_size(nblocks, queue_logical_block_size(q),
|
sd, mmc, virtio_blk, string_helpers: fix block size units
The current string_get_size() overflows when the device size goes over
2^64 bytes because the string helper routine computes the suffix from
the size in bytes. However, the entirety of Linux thinks in terms of
blocks, not bytes, so this will artificially induce an overflow on very
large devices. Fix this by making the function string_get_size() take
blocks and the block size instead of bytes. This should allow us to
keep working until the current SCSI standard overflows.
Also fix virtio_blk and mmc (both of which were also artificially
multiplying by the block size to pass a byte side to string_get_size()).
The mathematics of this is pretty simple: we're taking a product of
size in blocks (S) and block size (B) and trying to re-express this in
exponential form: S*B = R*N^E (where N, the exponent is either 1000 or
1024) and R < N. Mathematically, S = RS*N^ES and B=RB*N^EB, so if RS*RB
< N it's easy to see that S*B = RS*RB*N^(ES+EB). However, if RS*BS > N,
we can see that this can be re-expressed as RS*BS = R*N (where R =
RS*BS/N < N) so the whole exponent becomes R*N^(ES+EB+1)
[jejb: fix incorrect 32 bit do_div spotted by kbuild test robot <fengguang.wu@intel.com>]
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: James Bottomley <JBottomley@Odin.com>
2015-03-06 02:47:01 +00:00
|
|
|
STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
|
2017-07-26 14:32:23 +00:00
|
|
|
string_get_size(nblocks, queue_logical_block_size(q),
|
sd, mmc, virtio_blk, string_helpers: fix block size units
The current string_get_size() overflows when the device size goes over
2^64 bytes because the string helper routine computes the suffix from
the size in bytes. However, the entirety of Linux thinks in terms of
blocks, not bytes, so this will artificially induce an overflow on very
large devices. Fix this by making the function string_get_size() take
blocks and the block size instead of bytes. This should allow us to
keep working until the current SCSI standard overflows.
Also fix virtio_blk and mmc (both of which were also artificially
multiplying by the block size to pass a byte side to string_get_size()).
The mathematics of this is pretty simple: we're taking a product of
size in blocks (S) and block size (B) and trying to re-express this in
exponential form: S*B = R*N^E (where N, the exponent is either 1000 or
1024) and R < N. Mathematically, S = RS*N^ES and B=RB*N^EB, so if RS*RB
< N it's easy to see that S*B = RS*RB*N^(ES+EB). However, if RS*BS > N,
we can see that this can be re-expressed as RS*BS = R*N (where R =
RS*BS/N < N) so the whole exponent becomes R*N^(ES+EB+1)
[jejb: fix incorrect 32 bit do_div spotted by kbuild test robot <fengguang.wu@intel.com>]
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: James Bottomley <JBottomley@Odin.com>
2015-03-06 02:47:01 +00:00
|
|
|
STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
|
2011-02-01 20:43:48 +00:00
|
|
|
|
|
|
|
dev_notice(&vdev->dev,
|
2018-01-03 16:03:39 +00:00
|
|
|
"[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
|
|
|
|
vblk->disk->disk_name,
|
|
|
|
resize ? "new size: " : "",
|
2017-07-26 14:32:23 +00:00
|
|
|
nblocks,
|
|
|
|
queue_logical_block_size(q),
|
|
|
|
cap_str_10,
|
|
|
|
cap_str_2);
|
2011-02-01 20:43:48 +00:00
|
|
|
|
2020-11-16 14:56:56 +00:00
|
|
|
set_capacity_and_notify(vblk->disk, capacity);
|
2018-01-03 16:03:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void virtblk_config_changed_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk =
|
|
|
|
container_of(work, struct virtio_blk, config_work);
|
|
|
|
|
|
|
|
virtblk_update_capacity(vblk, true);
|
2011-02-01 20:43:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void virtblk_config_changed(struct virtio_device *vdev)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = vdev->priv;
|
|
|
|
|
|
|
|
queue_work(virtblk_wq, &vblk->config_work);
|
|
|
|
}
|
|
|
|
|
2011-12-22 11:28:29 +00:00
|
|
|
static int init_vq(struct virtio_blk *vblk)
|
|
|
|
{
|
2016-09-13 11:43:50 +00:00
|
|
|
int err;
|
virtio_blk: fix snprintf truncation compiler warning
Commit 4e0400525691 ("virtio-blk: support polling I/O") triggers the
following gcc 13 W=1 warnings:
drivers/block/virtio_blk.c: In function ‘init_vq’:
drivers/block/virtio_blk.c:1077:68: warning: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 7 [-Wformat-truncation=]
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~
drivers/block/virtio_blk.c:1077:58: note: directive argument in the range [-2147483648, 65534]
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~~~~~~~~~~~~
drivers/block/virtio_blk.c:1077:17: note: ‘snprintf’ output between 11 and 21 bytes into a destination of size 16
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This is a false positive because the lower bound -2147483648 is
incorrect. The true range of i is [0, num_vqs - 1] where 0 < num_vqs <
65536.
The code mixes int, unsigned short, and unsigned int types in addition
to using "%d" for an unsigned value. Use unsigned short and "%u"
consistently to solve the compiler warning.
Cc: Suwan Kim <suwan.kim027@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202312041509.DIyvEt9h-lkp@intel.com/
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20231204140743.1487843-1-stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-04 14:07:43 +00:00
|
|
|
unsigned short i;
|
2024-07-08 07:48:05 +00:00
|
|
|
struct virtqueue_info *vqs_info;
|
2014-06-26 09:41:48 +00:00
|
|
|
struct virtqueue **vqs;
|
|
|
|
unsigned short num_vqs;
|
virtio_blk: fix snprintf truncation compiler warning
Commit 4e0400525691 ("virtio-blk: support polling I/O") triggers the
following gcc 13 W=1 warnings:
drivers/block/virtio_blk.c: In function ‘init_vq’:
drivers/block/virtio_blk.c:1077:68: warning: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 7 [-Wformat-truncation=]
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~
drivers/block/virtio_blk.c:1077:58: note: directive argument in the range [-2147483648, 65534]
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~~~~~~~~~~~~
drivers/block/virtio_blk.c:1077:17: note: ‘snprintf’ output between 11 and 21 bytes into a destination of size 16
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This is a false positive because the lower bound -2147483648 is
incorrect. The true range of i is [0, num_vqs - 1] where 0 < num_vqs <
65536.
The code mixes int, unsigned short, and unsigned int types in addition
to using "%d" for an unsigned value. Use unsigned short and "%u"
consistently to solve the compiler warning.
Cc: Suwan Kim <suwan.kim027@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202312041509.DIyvEt9h-lkp@intel.com/
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20231204140743.1487843-1-stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-04 14:07:43 +00:00
|
|
|
unsigned short num_poll_vqs;
|
2014-06-26 09:41:48 +00:00
|
|
|
struct virtio_device *vdev = vblk->vdev;
|
2017-02-05 17:15:25 +00:00
|
|
|
struct irq_affinity desc = { 0, };
|
2014-06-26 09:41:48 +00:00
|
|
|
|
|
|
|
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
|
|
|
|
struct virtio_blk_config, num_queues,
|
|
|
|
&num_vqs);
|
|
|
|
if (err)
|
|
|
|
num_vqs = 1;
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
|
2021-10-19 07:01:43 +00:00
|
|
|
if (!err && !num_vqs) {
|
2021-10-25 10:22:40 +00:00
|
|
|
dev_err(&vdev->dev, "MQ advertised but zero queues reported\n");
|
2021-10-19 07:01:43 +00:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2014-06-26 09:41:48 +00:00
|
|
|
|
2021-09-02 20:46:22 +00:00
|
|
|
num_vqs = min_t(unsigned int,
|
|
|
|
min_not_zero(num_request_queues, nr_cpu_ids),
|
|
|
|
num_vqs);
|
2019-03-27 10:36:34 +00:00
|
|
|
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1);
|
|
|
|
|
|
|
|
vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs;
|
|
|
|
vblk->io_queues[HCTX_TYPE_READ] = 0;
|
|
|
|
vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs;
|
|
|
|
|
|
|
|
dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n",
|
|
|
|
vblk->io_queues[HCTX_TYPE_DEFAULT],
|
|
|
|
vblk->io_queues[HCTX_TYPE_READ],
|
|
|
|
vblk->io_queues[HCTX_TYPE_POLL]);
|
|
|
|
|
2016-09-13 09:32:22 +00:00
|
|
|
vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
|
2016-08-09 08:39:20 +00:00
|
|
|
if (!vblk->vqs)
|
|
|
|
return -ENOMEM;
|
2014-06-26 09:41:48 +00:00
|
|
|
|
2024-07-08 07:48:05 +00:00
|
|
|
vqs_info = kcalloc(num_vqs, sizeof(*vqs_info), GFP_KERNEL);
|
2016-09-13 09:32:22 +00:00
|
|
|
vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
|
2024-07-08 07:48:05 +00:00
|
|
|
if (!vqs_info || !vqs) {
|
2016-08-09 08:39:20 +00:00
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2011-12-22 11:28:29 +00:00
|
|
|
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
for (i = 0; i < num_vqs - num_poll_vqs; i++) {
|
2024-07-08 07:48:05 +00:00
|
|
|
vqs_info[i].callback = virtblk_done;
|
virtio_blk: fix snprintf truncation compiler warning
Commit 4e0400525691 ("virtio-blk: support polling I/O") triggers the
following gcc 13 W=1 warnings:
drivers/block/virtio_blk.c: In function ‘init_vq’:
drivers/block/virtio_blk.c:1077:68: warning: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 7 [-Wformat-truncation=]
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~
drivers/block/virtio_blk.c:1077:58: note: directive argument in the range [-2147483648, 65534]
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~~~~~~~~~~~~
drivers/block/virtio_blk.c:1077:17: note: ‘snprintf’ output between 11 and 21 bytes into a destination of size 16
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This is a false positive because the lower bound -2147483648 is
incorrect. The true range of i is [0, num_vqs - 1] where 0 < num_vqs <
65536.
The code mixes int, unsigned short, and unsigned int types in addition
to using "%d" for an unsigned value. Use unsigned short and "%u"
consistently to solve the compiler warning.
Cc: Suwan Kim <suwan.kim027@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202312041509.DIyvEt9h-lkp@intel.com/
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20231204140743.1487843-1-stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-04 14:07:43 +00:00
|
|
|
snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%u", i);
|
2024-07-08 07:48:05 +00:00
|
|
|
vqs_info[i].name = vblk->vqs[i].name;
|
2014-06-26 09:41:48 +00:00
|
|
|
}
|
|
|
|
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
for (; i < num_vqs; i++) {
|
virtio_blk: fix snprintf truncation compiler warning
Commit 4e0400525691 ("virtio-blk: support polling I/O") triggers the
following gcc 13 W=1 warnings:
drivers/block/virtio_blk.c: In function ‘init_vq’:
drivers/block/virtio_blk.c:1077:68: warning: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 7 [-Wformat-truncation=]
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~
drivers/block/virtio_blk.c:1077:58: note: directive argument in the range [-2147483648, 65534]
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~~~~~~~~~~~~
drivers/block/virtio_blk.c:1077:17: note: ‘snprintf’ output between 11 and 21 bytes into a destination of size 16
1077 | snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This is a false positive because the lower bound -2147483648 is
incorrect. The true range of i is [0, num_vqs - 1] where 0 < num_vqs <
65536.
The code mixes int, unsigned short, and unsigned int types in addition
to using "%d" for an unsigned value. Use unsigned short and "%u"
consistently to solve the compiler warning.
Cc: Suwan Kim <suwan.kim027@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202312041509.DIyvEt9h-lkp@intel.com/
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20231204140743.1487843-1-stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-12-04 14:07:43 +00:00
|
|
|
snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%u", i);
|
2024-07-08 07:48:05 +00:00
|
|
|
vqs_info[i].name = vblk->vqs[i].name;
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
}
|
|
|
|
|
2014-06-26 09:41:48 +00:00
|
|
|
/* Discover virtqueues and write information to configuration. */
|
2024-07-08 07:48:14 +00:00
|
|
|
err = virtio_find_vqs(vdev, num_vqs, vqs, vqs_info, &desc);
|
2014-06-26 09:41:48 +00:00
|
|
|
if (err)
|
2016-08-09 08:39:20 +00:00
|
|
|
goto out;
|
2011-12-22 11:28:29 +00:00
|
|
|
|
2014-06-26 09:41:48 +00:00
|
|
|
for (i = 0; i < num_vqs; i++) {
|
|
|
|
spin_lock_init(&vblk->vqs[i].lock);
|
|
|
|
vblk->vqs[i].vq = vqs[i];
|
|
|
|
}
|
|
|
|
vblk->num_vqs = num_vqs;
|
|
|
|
|
2016-08-09 08:39:20 +00:00
|
|
|
out:
|
2014-06-26 09:41:48 +00:00
|
|
|
kfree(vqs);
|
2024-07-08 07:48:05 +00:00
|
|
|
kfree(vqs_info);
|
2014-06-26 09:41:48 +00:00
|
|
|
if (err)
|
|
|
|
kfree(vblk->vqs);
|
2011-12-22 11:28:29 +00:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2012-04-10 07:28:05 +00:00
|
|
|
/*
 * Legacy naming scheme used for virtio devices.  We are stuck with it for
 * virtio blk but don't ever use it for any new driver.
 *
 * Writes @prefix followed by a letter suffix for @index into @buf:
 * 0 -> "a", 25 -> "z", 26 -> "aa", 701 -> "zz", 702 -> "aaa", ...
 *
 * @prefix: NUL-terminated name prefix
 * @index:  non-negative disk index
 * @buf:    output buffer
 * @buflen: size of @buf in bytes, including room for the trailing NUL
 *
 * Returns 0 on success, or -EINVAL when @index is negative or @buf cannot
 * hold the prefix, the suffix and the terminating NUL.
 */
static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
{
	const int base = 'z' - 'a' + 1;
	const size_t prefix_len = strlen(prefix);
	char *begin;
	char *end;
	char *p;

	/*
	 * A negative index would yield characters below 'a'.  A buffer that
	 * cannot even hold the prefix plus a NUL would let the write pointer
	 * start below 'begin' and never meet it, running past the buffer.
	 */
	if (index < 0 || buflen <= 0 || (size_t)buflen <= prefix_len)
		return -EINVAL;

	begin = buf + prefix_len;
	end = buf + buflen;

	/* Build the suffix right-to-left at the very end of the buffer. */
	p = end - 1;
	*p = '\0';
	do {
		if (p == begin)
			return -EINVAL;
		*--p = 'a' + (index % base);
		index = (index / base) - 1;
	} while (index >= 0);

	/* Slide the suffix (with its NUL) up against the prefix slot. */
	memmove(begin, p, end - p);
	memcpy(buf, prefix, prefix_len);

	return 0;
}
|
|
|
|
|
2012-07-03 13:19:37 +00:00
|
|
|
static int virtblk_get_cache_mode(struct virtio_device *vdev)
|
|
|
|
{
|
|
|
|
u8 writeback;
|
|
|
|
int err;
|
|
|
|
|
2013-10-14 07:41:51 +00:00
|
|
|
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
|
|
|
|
struct virtio_blk_config, wce,
|
|
|
|
&writeback);
|
2016-02-24 15:07:27 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If WCE is not configurable and flush is not available,
|
|
|
|
* assume no writeback cache is in use.
|
|
|
|
*/
|
2012-07-03 13:19:37 +00:00
|
|
|
if (err)
|
2016-02-24 15:07:27 +00:00
|
|
|
writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);
|
2012-07-03 13:19:37 +00:00
|
|
|
|
|
|
|
return writeback;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* sysfs strings for the cache mode, indexed by virtblk_get_cache_mode(). */
static const char *const virtblk_cache_types[] = {
	[0] = "write through",
	[1] = "write back",
};
|
|
|
|
|
|
|
|
static ssize_t
|
2018-09-28 06:17:23 +00:00
|
|
|
cache_type_store(struct device *dev, struct device_attribute *attr,
|
|
|
|
const char *buf, size_t count)
|
2012-07-03 13:19:37 +00:00
|
|
|
{
|
|
|
|
struct gendisk *disk = dev_to_disk(dev);
|
|
|
|
struct virtio_blk *vblk = disk->private_data;
|
|
|
|
struct virtio_device *vdev = vblk->vdev;
|
2024-06-17 06:04:40 +00:00
|
|
|
struct queue_limits lim;
|
2012-07-03 13:19:37 +00:00
|
|
|
int i;
|
|
|
|
|
|
|
|
BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
|
2017-06-09 12:07:42 +00:00
|
|
|
i = sysfs_match_string(virtblk_cache_types, buf);
|
2012-07-03 13:19:37 +00:00
|
|
|
if (i < 0)
|
2017-06-09 12:07:42 +00:00
|
|
|
return i;
|
2012-07-03 13:19:37 +00:00
|
|
|
|
2013-10-14 07:41:51 +00:00
|
|
|
virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
|
2024-06-17 06:04:40 +00:00
|
|
|
|
|
|
|
lim = queue_limits_start_update(disk->queue);
|
|
|
|
if (virtblk_get_cache_mode(vdev))
|
|
|
|
lim.features |= BLK_FEAT_WRITE_CACHE;
|
|
|
|
else
|
|
|
|
lim.features &= ~BLK_FEAT_WRITE_CACHE;
|
|
|
|
blk_mq_freeze_queue(disk->queue);
|
|
|
|
i = queue_limits_commit_update(disk->queue, &lim);
|
|
|
|
blk_mq_unfreeze_queue(disk->queue);
|
|
|
|
if (i)
|
|
|
|
return i;
|
2012-07-03 13:19:37 +00:00
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
2018-09-28 06:17:23 +00:00
|
|
|
cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
|
2012-07-03 13:19:37 +00:00
|
|
|
{
|
|
|
|
struct gendisk *disk = dev_to_disk(dev);
|
|
|
|
struct virtio_blk *vblk = disk->private_data;
|
|
|
|
u8 writeback = virtblk_get_cache_mode(vblk->vdev);
|
|
|
|
|
|
|
|
BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
|
2021-10-21 06:51:11 +00:00
|
|
|
return sysfs_emit(buf, "%s\n", virtblk_cache_types[writeback]);
|
2012-07-03 13:19:37 +00:00
|
|
|
}
|
|
|
|
|
2018-09-28 06:17:23 +00:00
|
|
|
/*
 * Defines dev_attr_cache_type, the read/write "cache_type" device attribute
 * served by cache_type_show() and cache_type_store().
 */
static DEVICE_ATTR_RW(cache_type);
|
|
|
|
|
|
|
|
static struct attribute *virtblk_attrs[] = {
|
|
|
|
&dev_attr_serial.attr,
|
|
|
|
&dev_attr_cache_type.attr,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
|
|
|
static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
|
|
|
|
struct attribute *a, int n)
|
|
|
|
{
|
2020-08-21 01:19:15 +00:00
|
|
|
struct device *dev = kobj_to_dev(kobj);
|
2018-09-28 06:17:23 +00:00
|
|
|
struct gendisk *disk = dev_to_disk(dev);
|
|
|
|
struct virtio_blk *vblk = disk->private_data;
|
|
|
|
struct virtio_device *vdev = vblk->vdev;
|
|
|
|
|
|
|
|
if (a == &dev_attr_cache_type.attr &&
|
|
|
|
!virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
|
|
|
|
return S_IRUGO;
|
|
|
|
|
|
|
|
return a->mode;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct attribute_group virtblk_attr_group = {
|
|
|
|
.attrs = virtblk_attrs,
|
|
|
|
.is_visible = virtblk_attrs_are_visible,
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct attribute_group *virtblk_attr_groups[] = {
|
|
|
|
&virtblk_attr_group,
|
|
|
|
NULL,
|
|
|
|
};
|
2012-07-03 13:19:37 +00:00
|
|
|
|
2022-08-15 17:00:43 +00:00
|
|
|
static void virtblk_map_queues(struct blk_mq_tag_set *set)
|
2017-02-05 17:15:25 +00:00
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = set->driver_data;
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
int i, qoff;
|
|
|
|
|
|
|
|
for (i = 0, qoff = 0; i < set->nr_maps; i++) {
|
|
|
|
struct blk_mq_queue_map *map = &set->map[i];
|
|
|
|
|
|
|
|
map->nr_queues = vblk->io_queues[i];
|
|
|
|
map->queue_offset = qoff;
|
|
|
|
qoff += map->nr_queues;
|
|
|
|
|
|
|
|
if (map->nr_queues == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Regular queues have interrupts and hence CPU affinity is
|
|
|
|
* defined by the core virtio code, but polling queues have
|
|
|
|
* no interrupts so we let the block layer assign CPU affinity.
|
|
|
|
*/
|
|
|
|
if (i == HCTX_TYPE_POLL)
|
|
|
|
blk_mq_map_queues(&set->map[i]);
|
|
|
|
else
|
|
|
|
blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch()call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
static void virtblk_complete_batch(struct io_comp_batch *iob)
|
|
|
|
{
|
|
|
|
struct request *req;
|
|
|
|
|
|
|
|
rq_list_for_each(&iob->req_list, req) {
|
|
|
|
virtblk_unmap_data(req, blk_mq_rq_to_pdu(req));
|
|
|
|
virtblk_cleanup_cmd(req);
|
|
|
|
}
|
|
|
|
blk_mq_end_request_batch(iob);
|
|
|
|
}
|
|
|
|
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver does not have a poll function or a poll queue,
so it has been operating in interrupt-driven mode even when the
polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that considers the MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = hctx->queue->queuedata;
|
2022-08-10 16:09:48 +00:00
|
|
|
struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx);
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch()call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
struct virtblk_req *vbr;
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
unsigned long flags;
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch()call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
unsigned int len;
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
int found = 0;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&vq->lock, flags);
|
Revert "virtio-blk: support completion batching for the IRQ path"
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch()call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c,virtblk_done() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
blk_mq_complete_request(req);
}
kernel 6.3.3 - this fails
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
If I do, kernel 6.3.3 - this is OK
virtio_blk.c,virtblk_handle_req() [irq handler]
if (likely(!blk_should_fake_timeout(req->q))) {
if (!blk_mq_complete_request_remote(req)) {
virtblk_request_done(req); //force this here...
if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) {
virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed
}
}
}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027@gmail.com>
Tested-by: edliaw@google.com
Reported-by: "Roberts, Martin" <martin.roberts@intel.com>
Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-06-08 21:42:53 +00:00
|
|
|
|
|
|
|
while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
|
|
|
|
struct request *req = blk_mq_rq_from_pdu(vbr);
|
|
|
|
|
|
|
|
found++;
|
|
|
|
if (!blk_mq_complete_request_remote(req) &&
|
|
|
|
!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr),
|
|
|
|
virtblk_complete_batch))
|
|
|
|
virtblk_request_done(req);
|
|
|
|
}
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't not have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that condsiders MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
|
|
|
|
if (found)
|
|
|
|
blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
|
|
|
|
|
|
|
|
spin_unlock_irqrestore(&vq->lock, flags);
|
|
|
|
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
|
2017-03-30 20:39:16 +00:00
|
|
|
static const struct blk_mq_ops virtio_mq_ops = {
|
2013-11-01 16:52:52 +00:00
|
|
|
.queue_rq = virtio_queue_rq,
|
virtio-blk: support mq_ops->queue_rqs()
This patch supports mq_ops->queue_rqs() hook. It has an advantage of
batch submission to virtio-blk driver. It also helps polling I/O because
polling uses batched completion of block layer. Batch submission in
queue_rqs() can boost polling performance.
In queue_rqs(), it iterates plug->mq_list, collects requests that
belong to same HW queue until it encounters a request from other
HW queue or sees the end of the list.
Then, virtio-blk adds requests into virtqueue and kicks virtqueue
to submit requests.
If there is an error, it inserts error request to requeue_list and
passes it to ordinary block layer path.
For verification, I did fio test.
(io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N)
I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times.
It shows about 2% improvement.
| numjobs=2 | numjobs=4
-----------------------------------------------------------
fio without queue_rqs() | 291K IOPS | 238K IOPS
-----------------------------------------------------------
fio with queue_rqs() | 295K IOPS | 243K IOPS
For polling I/O performance, I also did fio test as below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4)
I set 4 vcpu and 2 poll queues for VM.
It shows about 2% improvement in polling I/O.
| IOPS | avg latency
-----------------------------------------------------------
fio poll without queue_rqs() | 424K | 613.05 usec
-----------------------------------------------------------
fio poll with queue_rqs() | 435K | 601.01 usec
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:07 +00:00
|
|
|
.queue_rqs = virtio_queue_rqs,
|
2018-11-26 18:00:12 +00:00
|
|
|
.commit_rqs = virtio_commit_rqs,
|
2014-02-10 11:24:39 +00:00
|
|
|
.complete = virtblk_request_done,
|
2017-02-05 17:15:25 +00:00
|
|
|
.map_queues = virtblk_map_queues,
|
virtio-blk: support polling I/O
This patch supports polling I/O via virtio-blk driver. Polling
feature is enabled by module parameter "poll_queues" and it sets
dedicated polling queues for virtio-blk. This patch improves the
polling I/O throughput and latency.
The virtio-blk driver doesn't have a poll function and a poll
queue and it has been operating in interrupt driven method even if
the polling function is called in the upper layer.
virtio-blk polling is implemented upon 'batched completion' of block
layer. virtblk_poll() queues completed request to io_comp_batch->req_list
and later, virtblk_complete_batch() calls unmap function and ends
the requests in batch.
virtio-blk reads the number of poll queues from module parameter
"poll_queues". If VM sets queue parameter as below,
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter])
It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
Regarding HW-SW queue mapping, the default queue mapping uses the
existing method that considers MSI irq vector. But the poll queue
doesn't have an irq, so it uses the regular blk-mq cpu mapping.
For verifying the improvement, I did Fio polling I/O performance test
with io_uring engine with the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
queues for VM.
As a result, IOPS and average latency improved about 10%.
Test result:
- Fio io_uring poll without virtio-blk poll support
-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
-- numjobs=4 : IOPS = 383K, avg latency = 682.06us
- Fio io_uring poll with virtio-blk poll support
-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
-- numjobs=4 : IOPS = 424K, avg latency = 613.05us
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
2022-04-06 15:32:06 +00:00
|
|
|
.poll = virtblk_poll,
|
2013-11-01 16:52:52 +00:00
|
|
|
};
|
|
|
|
|
2014-04-15 20:14:00 +00:00
|
|
|
static unsigned int virtblk_queue_depth;
|
|
|
|
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
|
2013-11-01 16:52:52 +00:00
|
|
|
|
2024-02-13 07:34:22 +00:00
|
|
|
static int virtblk_read_limits(struct virtio_blk *vblk,
|
|
|
|
struct queue_limits *lim)
|
2007-10-22 01:03:38 +00:00
|
|
|
{
|
2024-02-13 07:34:21 +00:00
|
|
|
struct virtio_device *vdev = vblk->vdev;
|
2024-07-08 09:16:47 +00:00
|
|
|
u32 v, max_size, sg_elems, opt_io_size;
|
2022-09-21 08:27:29 +00:00
|
|
|
u32 max_discard_segs = 0;
|
|
|
|
u32 discard_granularity = 0;
|
2010-02-24 20:22:25 +00:00
|
|
|
u16 min_io_size;
|
|
|
|
u8 physical_block_exp, alignment_offset;
|
2023-09-04 06:10:45 +00:00
|
|
|
size_t max_dma_size;
|
2024-02-13 07:34:21 +00:00
|
|
|
int err;
|
2008-01-31 14:53:53 +00:00
|
|
|
|
2008-12-30 15:26:05 +00:00
|
|
|
/* We need to know how many segments before we allocate. */
|
2013-10-14 07:41:51 +00:00
|
|
|
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
|
|
|
|
struct virtio_blk_config, seg_max,
|
|
|
|
&sg_elems);
|
2010-05-25 12:17:54 +00:00
|
|
|
|
|
|
|
/* We need at least one SG element, whatever they say. */
|
|
|
|
if (err || !sg_elems)
|
2008-12-30 15:26:05 +00:00
|
|
|
sg_elems = 1;
|
|
|
|
|
2021-05-24 15:40:20 +00:00
|
|
|
/* Prevent integer overflows and honor max vq size */
|
|
|
|
sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2);
|
|
|
|
|
2008-12-30 15:26:05 +00:00
|
|
|
/* We can handle whatever the host told us to handle. */
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->max_segments = sg_elems;
|
2008-12-30 15:26:05 +00:00
|
|
|
|
2008-12-30 15:26:04 +00:00
|
|
|
/* No real sector limit. */
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->max_hw_sectors = UINT_MAX;
|
2008-12-30 15:26:04 +00:00
|
|
|
|
2023-09-04 06:10:45 +00:00
|
|
|
max_dma_size = virtio_max_dma_size(vdev);
|
|
|
|
max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
|
2019-02-07 11:59:17 +00:00
|
|
|
|
2008-02-05 04:49:56 +00:00
|
|
|
/* Host can optionally specify maximum segment size and number of
|
|
|
|
* segments. */
|
2013-10-14 07:41:51 +00:00
|
|
|
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
|
|
|
|
struct virtio_blk_config, size_max, &v);
|
2007-10-22 01:03:38 +00:00
|
|
|
if (!err)
|
2019-02-07 11:59:17 +00:00
|
|
|
max_size = min(max_size, v);
|
|
|
|
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->max_segment_size = max_size;
|
2007-10-22 01:03:38 +00:00
|
|
|
|
2008-05-29 09:08:26 +00:00
|
|
|
/* Host can optionally specify the block size of the device */
|
2024-07-08 09:16:50 +00:00
|
|
|
virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
|
2013-10-14 07:41:51 +00:00
|
|
|
struct virtio_blk_config, blk_size,
|
2024-07-08 09:16:47 +00:00
|
|
|
&lim->logical_block_size);
|
2010-02-24 20:22:25 +00:00
|
|
|
|
|
|
|
/* Use topology information if available */
|
2013-10-14 07:41:51 +00:00
|
|
|
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
|
|
|
|
struct virtio_blk_config, physical_block_exp,
|
|
|
|
&physical_block_exp);
|
2010-02-24 20:22:25 +00:00
|
|
|
if (!err && physical_block_exp)
|
2024-07-08 09:16:47 +00:00
|
|
|
lim->physical_block_size =
|
|
|
|
lim->logical_block_size * (1 << physical_block_exp);
|
2010-02-24 20:22:25 +00:00
|
|
|
|
2013-10-14 07:41:51 +00:00
|
|
|
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
|
|
|
|
struct virtio_blk_config, alignment_offset,
|
|
|
|
&alignment_offset);
|
2010-02-24 20:22:25 +00:00
|
|
|
if (!err && alignment_offset)
|
2024-07-08 09:16:47 +00:00
|
|
|
lim->alignment_offset =
|
|
|
|
lim->logical_block_size * alignment_offset;
|
2010-02-24 20:22:25 +00:00
|
|
|
|
2013-10-14 07:41:51 +00:00
|
|
|
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
|
|
|
|
struct virtio_blk_config, min_io_size,
|
|
|
|
&min_io_size);
|
2010-02-24 20:22:25 +00:00
|
|
|
if (!err && min_io_size)
|
2024-07-08 09:16:47 +00:00
|
|
|
lim->io_min = lim->logical_block_size * min_io_size;
|
2010-02-24 20:22:25 +00:00
|
|
|
|
2013-10-14 07:41:51 +00:00
|
|
|
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
|
|
|
|
struct virtio_blk_config, opt_io_size,
|
|
|
|
&opt_io_size);
|
2010-02-24 20:22:25 +00:00
|
|
|
if (!err && opt_io_size)
|
2024-07-08 09:16:47 +00:00
|
|
|
lim->io_opt = lim->logical_block_size * opt_io_size;
|
2010-02-24 20:22:25 +00:00
|
|
|
|
2018-11-01 22:40:35 +00:00
|
|
|
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
2022-09-21 08:27:29 +00:00
|
|
|
discard_sector_alignment, &discard_granularity);
|
2018-11-01 22:40:35 +00:00
|
|
|
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
|
|
|
max_discard_sectors, &v);
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->max_hw_discard_sectors = v ? v : UINT_MAX;
|
2018-11-01 22:40:35 +00:00
|
|
|
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
|
2022-09-21 08:27:29 +00:00
|
|
|
&max_discard_segs);
|
2018-11-01 22:40:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
|
|
|
max_write_zeroes_sectors, &v);
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->max_write_zeroes_sectors = v ? v : UINT_MAX;
|
2018-11-01 22:40:35 +00:00
|
|
|
}
|
|
|
|
|
2022-09-21 08:27:29 +00:00
|
|
|
/* The discard and secure erase limits are combined since the Linux
|
|
|
|
* block layer uses the same limit for both commands.
|
|
|
|
*
|
|
|
|
* If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features
|
|
|
|
* are negotiated, we will use the minimum between the limits.
|
|
|
|
*
|
|
|
|
* discard sector alignment is set to the minimum between discard_sector_alignment
|
|
|
|
* and secure_erase_sector_alignment.
|
|
|
|
*
|
|
|
|
* max discard sectors is set to the minimum between max_discard_seg and
|
|
|
|
* max_secure_erase_seg.
|
|
|
|
*/
|
|
|
|
if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
|
|
|
|
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
|
|
|
secure_erase_sector_alignment, &v);
|
|
|
|
|
|
|
|
/* secure_erase_sector_alignment should not be zero, the device should set a
|
|
|
|
* valid number of sectors.
|
|
|
|
*/
|
|
|
|
if (!v) {
|
|
|
|
dev_err(&vdev->dev,
|
|
|
|
"virtio_blk: secure_erase_sector_alignment can't be 0\n");
|
2024-02-13 07:34:21 +00:00
|
|
|
return -EINVAL;
|
2022-09-21 08:27:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
discard_granularity = min_not_zero(discard_granularity, v);
|
|
|
|
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
|
|
|
max_secure_erase_sectors, &v);
|
|
|
|
|
|
|
|
/* max_secure_erase_sectors should not be zero, the device should set a
|
|
|
|
* valid number of sectors.
|
|
|
|
*/
|
|
|
|
if (!v) {
|
|
|
|
dev_err(&vdev->dev,
|
|
|
|
"virtio_blk: max_secure_erase_sectors can't be 0\n");
|
2024-02-13 07:34:21 +00:00
|
|
|
return -EINVAL;
|
2022-09-21 08:27:29 +00:00
|
|
|
}
|
|
|
|
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->max_secure_erase_sectors = v;
|
2022-09-21 08:27:29 +00:00
|
|
|
|
|
|
|
virtio_cread(vdev, struct virtio_blk_config,
|
|
|
|
max_secure_erase_seg, &v);
|
|
|
|
|
|
|
|
/* max_secure_erase_seg should not be zero, the device should set a
|
|
|
|
* valid number of segments
|
|
|
|
*/
|
|
|
|
if (!v) {
|
|
|
|
dev_err(&vdev->dev,
|
|
|
|
"virtio_blk: max_secure_erase_seg can't be 0\n");
|
2024-02-13 07:34:21 +00:00
|
|
|
return -EINVAL;
|
2022-09-21 08:27:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
max_discard_segs = min_not_zero(max_discard_segs, v);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) ||
|
|
|
|
virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
|
|
|
|
/* max_discard_seg and discard_granularity will be 0 only
|
|
|
|
* if max_discard_seg and discard_sector_alignment fields in the virtio
|
|
|
|
* config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated.
|
|
|
|
* In this case, we use default values.
|
|
|
|
*/
|
|
|
|
if (!max_discard_segs)
|
|
|
|
max_discard_segs = sg_elems;
|
|
|
|
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->max_discard_segments =
|
|
|
|
min(max_discard_segs, MAX_DISCARD_SEGMENTS);
|
2022-09-21 08:27:29 +00:00
|
|
|
|
|
|
|
if (discard_granularity)
|
2024-02-13 07:34:22 +00:00
|
|
|
lim->discard_granularity =
|
|
|
|
discard_granularity << SECTOR_SHIFT;
|
2022-09-21 08:27:29 +00:00
|
|
|
else
|
2024-07-08 09:16:47 +00:00
|
|
|
lim->discard_granularity = lim->logical_block_size;
|
2022-09-21 08:27:29 +00:00
|
|
|
}
|
|
|
|
|
2023-03-30 21:49:53 +00:00
|
|
|
if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) {
|
2023-12-17 16:53:55 +00:00
|
|
|
u8 model;
|
|
|
|
|
2024-02-13 07:34:22 +00:00
|
|
|
virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model);
|
2023-12-17 16:53:55 +00:00
|
|
|
switch (model) {
|
|
|
|
case VIRTIO_BLK_Z_NONE:
|
|
|
|
case VIRTIO_BLK_Z_HA:
|
2024-02-13 07:34:22 +00:00
|
|
|
/* treat host-aware devices as non-zoned */
|
|
|
|
return 0;
|
2023-12-17 16:53:55 +00:00
|
|
|
case VIRTIO_BLK_Z_HM:
|
2024-02-13 07:34:22 +00:00
|
|
|
err = virtblk_read_zoned_limits(vblk, lim);
|
2023-12-17 16:53:55 +00:00
|
|
|
if (err)
|
2024-02-13 07:34:22 +00:00
|
|
|
return err;
|
2023-12-17 16:53:55 +00:00
|
|
|
break;
|
|
|
|
default:
|
2024-02-13 07:34:22 +00:00
|
|
|
dev_err(&vdev->dev, "unsupported zone model %d\n", model);
|
|
|
|
return -EINVAL;
|
2023-12-17 16:53:55 +00:00
|
|
|
}
|
2022-10-16 03:41:27 +00:00
|
|
|
}
|
|
|
|
|
2024-02-13 07:34:21 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int virtblk_probe(struct virtio_device *vdev)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk;
|
2024-06-17 06:04:41 +00:00
|
|
|
struct queue_limits lim = {
|
|
|
|
.features = BLK_FEAT_ROTATIONAL,
|
2024-07-08 09:16:47 +00:00
|
|
|
.logical_block_size = SECTOR_SIZE,
|
2024-06-17 06:04:41 +00:00
|
|
|
};
|
2024-02-13 07:34:21 +00:00
|
|
|
int err, index;
|
|
|
|
unsigned int queue_depth;
|
|
|
|
|
|
|
|
if (!vdev->config->get) {
|
|
|
|
dev_err(&vdev->dev, "%s failure: config access disabled\n",
|
|
|
|
__func__);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = ida_alloc_range(&vd_index_ida, 0,
|
|
|
|
minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
|
|
|
|
if (err < 0)
|
|
|
|
goto out;
|
|
|
|
index = err;
|
|
|
|
|
|
|
|
vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
|
|
|
|
if (!vblk) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out_free_index;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_init(&vblk->vdev_mutex);
|
|
|
|
|
|
|
|
vblk->vdev = vdev;
|
|
|
|
|
|
|
|
INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
|
|
|
|
|
|
|
|
err = init_vq(vblk);
|
|
|
|
if (err)
|
|
|
|
goto out_free_vblk;
|
|
|
|
|
|
|
|
/* Default queue sizing is to fill the ring. */
|
|
|
|
if (!virtblk_queue_depth) {
|
|
|
|
queue_depth = vblk->vqs[0].vq->num_free;
|
|
|
|
/* ... but without indirect descs, we use 2 descs per req */
|
|
|
|
if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
|
|
|
|
queue_depth /= 2;
|
|
|
|
} else {
|
|
|
|
queue_depth = virtblk_queue_depth;
|
|
|
|
}
|
|
|
|
|
|
|
|
memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
|
|
|
|
vblk->tag_set.ops = &virtio_mq_ops;
|
|
|
|
vblk->tag_set.queue_depth = queue_depth;
|
|
|
|
vblk->tag_set.numa_node = NUMA_NO_NODE;
|
|
|
|
vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
|
|
|
vblk->tag_set.cmd_size =
|
|
|
|
sizeof(struct virtblk_req) +
|
|
|
|
sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
|
|
|
|
vblk->tag_set.driver_data = vblk;
|
|
|
|
vblk->tag_set.nr_hw_queues = vblk->num_vqs;
|
|
|
|
vblk->tag_set.nr_maps = 1;
|
|
|
|
if (vblk->io_queues[HCTX_TYPE_POLL])
|
|
|
|
vblk->tag_set.nr_maps = 3;
|
|
|
|
|
|
|
|
err = blk_mq_alloc_tag_set(&vblk->tag_set);
|
|
|
|
if (err)
|
|
|
|
goto out_free_vq;
|
|
|
|
|
2024-02-13 07:34:22 +00:00
|
|
|
err = virtblk_read_limits(vblk, &lim);
|
|
|
|
if (err)
|
|
|
|
goto out_free_tags;
|
|
|
|
|
2024-06-17 06:04:40 +00:00
|
|
|
if (virtblk_get_cache_mode(vdev))
|
|
|
|
lim.features |= BLK_FEAT_WRITE_CACHE;
|
|
|
|
|
2024-02-13 07:34:22 +00:00
|
|
|
vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, &lim, vblk);
|
2024-02-13 07:34:21 +00:00
|
|
|
if (IS_ERR(vblk->disk)) {
|
|
|
|
err = PTR_ERR(vblk->disk);
|
|
|
|
goto out_free_tags;
|
|
|
|
}
|
|
|
|
|
|
|
|
virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
|
|
|
|
|
|
|
|
vblk->disk->major = major;
|
|
|
|
vblk->disk->first_minor = index_to_minor(index);
|
|
|
|
vblk->disk->minors = 1 << PART_BITS;
|
|
|
|
vblk->disk->private_data = vblk;
|
|
|
|
vblk->disk->fops = &virtblk_fops;
|
|
|
|
vblk->index = index;
|
|
|
|
|
|
|
|
/* If disk is read-only in the host, the guest should obey */
|
|
|
|
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
|
|
|
|
set_disk_ro(vblk->disk, 1);
|
|
|
|
|
2018-01-03 16:03:39 +00:00
|
|
|
virtblk_update_capacity(vblk, false);
|
2014-10-14 23:52:30 +00:00
|
|
|
virtio_device_ready(vdev);
|
|
|
|
|
2023-03-30 21:49:53 +00:00
|
|
|
/*
|
|
|
|
* All steps that follow use the VQs therefore they need to be
|
|
|
|
* placed after the virtio_device_ready() call above.
|
|
|
|
*/
|
2024-06-17 06:04:49 +00:00
|
|
|
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
|
|
|
|
(lim.features & BLK_FEAT_ZONED)) {
|
2024-04-08 01:41:20 +00:00
|
|
|
err = blk_revalidate_disk_zones(vblk->disk);
|
2024-02-13 07:34:22 +00:00
|
|
|
if (err)
|
2022-10-16 03:41:27 +00:00
|
|
|
goto out_cleanup_disk;
|
|
|
|
}
|
|
|
|
|
2021-08-18 14:45:41 +00:00
|
|
|
err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
|
|
|
|
if (err)
|
|
|
|
goto out_cleanup_disk;
|
|
|
|
|
2007-10-22 01:03:38 +00:00
|
|
|
return 0;
|
|
|
|
|
2021-08-18 14:45:41 +00:00
|
|
|
out_cleanup_disk:
|
2022-06-19 06:05:52 +00:00
|
|
|
put_disk(vblk->disk);
|
2014-04-15 20:14:00 +00:00
|
|
|
out_free_tags:
|
|
|
|
blk_mq_free_tag_set(&vblk->tag_set);
|
2007-10-22 01:03:38 +00:00
|
|
|
out_free_vq:
|
2009-06-13 04:16:36 +00:00
|
|
|
vdev->config->del_vqs(vdev);
|
2020-06-15 04:14:59 +00:00
|
|
|
kfree(vblk->vqs);
|
2007-10-22 01:03:38 +00:00
|
|
|
out_free_vblk:
|
|
|
|
kfree(vblk);
|
2011-10-30 19:29:59 +00:00
|
|
|
out_free_index:
|
2022-11-30 12:30:03 +00:00
|
|
|
ida_free(&vd_index_ida, index);
|
2007-10-22 01:03:38 +00:00
|
|
|
out:
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2012-12-21 23:13:49 +00:00
|
|
|
static void virtblk_remove(struct virtio_device *vdev)
|
2007-10-22 01:03:38 +00:00
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = vdev->priv;
|
|
|
|
|
2014-10-14 23:52:26 +00:00
|
|
|
/* Make sure no work handler is accessing the device. */
|
|
|
|
flush_work(&vblk->config_work);
|
2011-02-01 20:43:48 +00:00
|
|
|
|
2012-05-25 02:34:47 +00:00
|
|
|
del_gendisk(vblk->disk);
|
2014-04-15 20:14:00 +00:00
|
|
|
blk_mq_free_tag_set(&vblk->tag_set);
|
|
|
|
|
2020-04-30 14:04:42 +00:00
|
|
|
mutex_lock(&vblk->vdev_mutex);
|
|
|
|
|
2008-02-05 04:50:03 +00:00
|
|
|
/* Stop all the virtqueues. */
|
2021-10-13 10:55:44 +00:00
|
|
|
virtio_reset_device(vdev);
|
2008-02-05 04:50:03 +00:00
|
|
|
|
2020-04-30 14:04:42 +00:00
|
|
|
/* Virtqueues are stopped, nothing can use vblk->vdev anymore. */
|
|
|
|
vblk->vdev = NULL;
|
|
|
|
|
2009-06-13 04:16:36 +00:00
|
|
|
vdev->config->del_vqs(vdev);
|
2014-06-26 09:41:48 +00:00
|
|
|
kfree(vblk->vqs);
|
2013-01-02 05:07:17 +00:00
|
|
|
|
2020-04-30 14:04:42 +00:00
|
|
|
mutex_unlock(&vblk->vdev_mutex);
|
|
|
|
|
2022-02-15 09:45:14 +00:00
|
|
|
put_disk(vblk->disk);
|
2007-10-22 01:03:38 +00:00
|
|
|
}
|
|
|
|
|
2013-09-16 23:55:23 +00:00
|
|
|
#ifdef CONFIG_PM_SLEEP
|
2011-12-22 11:28:30 +00:00
|
|
|
static int virtblk_freeze(struct virtio_device *vdev)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = vdev->priv;
|
|
|
|
|
2024-01-29 08:52:50 +00:00
|
|
|
/* Ensure no requests in virtqueues before deleting vqs. */
|
|
|
|
blk_mq_freeze_queue(vblk->disk->queue);
|
|
|
|
|
2011-12-22 11:28:30 +00:00
|
|
|
/* Ensure we don't receive any more interrupts */
|
2021-10-13 10:55:44 +00:00
|
|
|
virtio_reset_device(vdev);
|
2011-12-22 11:28:30 +00:00
|
|
|
|
2014-10-14 23:52:26 +00:00
|
|
|
/* Make sure no work handler is accessing the device. */
|
2011-12-22 11:28:30 +00:00
|
|
|
flush_work(&vblk->config_work);
|
|
|
|
|
|
|
|
vdev->config->del_vqs(vdev);
|
2021-05-17 08:43:32 +00:00
|
|
|
kfree(vblk->vqs);
|
|
|
|
|
2011-12-22 11:28:30 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int virtblk_restore(struct virtio_device *vdev)
|
|
|
|
{
|
|
|
|
struct virtio_blk *vblk = vdev->priv;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = init_vq(vdev->priv);
|
2014-10-14 23:52:32 +00:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
virtio_device_ready(vdev);
|
2013-11-01 16:52:52 +00:00
|
|
|
|
2024-01-29 08:52:50 +00:00
|
|
|
blk_mq_unfreeze_queue(vblk->disk->queue);
|
2014-10-14 23:52:32 +00:00
|
|
|
return 0;
|
2011-12-22 11:28:30 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2010-01-10 12:40:02 +00:00
|
|
|
static const struct virtio_device_id id_table[] = {
|
2007-10-22 01:03:38 +00:00
|
|
|
{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
|
|
|
|
{ 0 },
|
|
|
|
};
|
|
|
|
|
2014-10-07 14:39:49 +00:00
|
|
|
static unsigned int features_legacy[] = {
|
2010-09-03 09:56:18 +00:00
|
|
|
VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
|
2017-01-28 08:32:53 +00:00
|
|
|
VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
|
2016-02-24 15:07:27 +00:00
|
|
|
VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
|
2018-11-01 22:40:35 +00:00
|
|
|
VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
|
2022-09-21 08:27:29 +00:00
|
|
|
VIRTIO_BLK_F_SECURE_ERASE,
|
2014-10-07 14:39:49 +00:00
|
|
|
}
|
|
|
|
;
|
|
|
|
static unsigned int features[] = {
|
|
|
|
VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
|
|
|
|
VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
|
2016-02-24 15:07:27 +00:00
|
|
|
VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
|
2018-11-01 22:40:35 +00:00
|
|
|
VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
|
2023-03-30 21:49:53 +00:00
|
|
|
VIRTIO_BLK_F_SECURE_ERASE, VIRTIO_BLK_F_ZONED,
|
2008-05-03 02:50:50 +00:00
|
|
|
};
|
|
|
|
|
2012-12-21 23:13:49 +00:00
|
|
|
static struct virtio_driver virtio_blk = {
|
2014-10-07 14:39:49 +00:00
|
|
|
.feature_table = features,
|
|
|
|
.feature_table_size = ARRAY_SIZE(features),
|
|
|
|
.feature_table_legacy = features_legacy,
|
|
|
|
.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
|
|
|
|
.driver.name = KBUILD_MODNAME,
|
|
|
|
.id_table = id_table,
|
|
|
|
.probe = virtblk_probe,
|
|
|
|
.remove = virtblk_remove,
|
|
|
|
.config_changed = virtblk_config_changed,
|
2013-09-16 23:55:23 +00:00
|
|
|
#ifdef CONFIG_PM_SLEEP
|
2014-10-07 14:39:49 +00:00
|
|
|
.freeze = virtblk_freeze,
|
|
|
|
.restore = virtblk_restore,
|
2011-12-22 11:28:30 +00:00
|
|
|
#endif
|
2007-10-22 01:03:38 +00:00
|
|
|
};
|
|
|
|
|
2022-03-16 19:20:02 +00:00
|
|
|
/*
 * Module init: creates the driver workqueue, registers a dynamically
 * assigned block major and then the virtio driver.  Each failure undoes the
 * steps that already succeeded.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init virtio_blk_init(void)
{
	int error;

	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
	if (!virtblk_wq)
		return -ENOMEM;

	major = register_blkdev(0, "virtblk");
	if (major < 0) {
		error = major;
		goto out_destroy_workqueue;
	}

	error = register_virtio_driver(&virtio_blk);
	if (!error)
		return 0;

	/* Driver registration failed: give the block major back. */
	unregister_blkdev(major, "virtblk");
out_destroy_workqueue:
	destroy_workqueue(virtblk_wq);
	return error;
}
|
|
|
|
|
2022-03-16 19:20:02 +00:00
|
|
|
/*
 * Module exit: undoes virtio_blk_init() in reverse order - driver, block
 * major, workqueue.
 */
static void __exit virtio_blk_fini(void)
{
	unregister_virtio_driver(&virtio_blk);
	unregister_blkdev(major, "virtblk");
	destroy_workqueue(virtblk_wq);
}
|
2022-03-16 19:20:02 +00:00
|
|
|
module_init(virtio_blk_init);
|
|
|
|
module_exit(virtio_blk_fini);
|
2007-10-22 01:03:38 +00:00
|
|
|
|
|
|
|
MODULE_DEVICE_TABLE(virtio, id_table);
|
|
|
|
MODULE_DESCRIPTION("Virtio block driver");
|
|
|
|
MODULE_LICENSE("GPL");
|