forked from Minki/linux
9b15d109a6
This patch improves discard bio split for address and size alignment in __blkdev_issue_discard(). The aligned discard bio may help the underlying device controller to perform better discard and internal garbage collection, and avoid unnecessary internal fragmentation. The current discard bio split algorithm in __blkdev_issue_discard() may leave non-discarded fragments on the device even when the discard bio LBA and size are both aligned to the device's discard granularity size. Here are the example steps to reproduce the above problem. - On a VMWare ESXi 6.5 update3 installation, create a 51GB virtual disk with thin mode and give it to a Linux virtual machine. - Inside the Linux virtual machine, if the 51GB virtual disk shows up as /dev/sdb, fill data into the first 50GB by, # dd if=/dev/zero of=/dev/sdb bs=4096 count=13107200 - Discard the 50GB range from offset 0 on /dev/sdb, # blkdiscard /dev/sdb -o 0 -l 53687091200 - Observe the underlying mapping status of the device # sg_get_lba_status /dev/sdb -m 1048 --lba=0 descriptor LBA: 0x0000000000000000 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000000000800 blocks: 16773120 deallocated descriptor LBA: 0x0000000000fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000001000000 blocks:8386560
deallocated descriptor LBA: 0x00000000017ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000001800000 blocks:8386560
deallocated descriptor LBA: 0x0000000001fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000002000000 blocks:8386560
deallocated descriptor LBA: 0x00000000027ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000002800000 blocks:8386560
deallocated descriptor LBA: 0x0000000002fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000003000000 blocks:8386560
deallocated descriptor LBA: 0x00000000037ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000003800000 blocks:8386560
deallocated descriptor LBA: 0x0000000003fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000004000000 blocks:8386560
deallocated descriptor LBA: 0x00000000047ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000004800000 blocks:8386560
deallocated descriptor LBA: 0x0000000004fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000005000000 blocks:8386560
deallocated descriptor LBA: 0x00000000057ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000005800000 blocks:8386560
deallocated descriptor LBA: 0x0000000005fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000006000000 blocks: 6291456 deallocated descriptor LBA: 0x0000000006600000 blocks: 0 deallocated Although the discard bio starts at LBA 0 and has 50<<30 bytes size which are perfectly aligned to the discard granularity, from the above list there are many 1MB (2048 sectors) internal fragments that exist unexpectedly. The problem is in __blkdev_issue_discard(), an improper algorithm causes an improper bio size which is not aligned. 25 int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, 26 sector_t nr_sects, gfp_t gfp_mask, int flags, 27 struct bio **biop) 28 { 29 struct request_queue *q = bdev_get_queue(bdev); [snipped] 56 57 while (nr_sects) { 58 sector_t req_sects = min_t(sector_t, nr_sects, 59 bio_allowed_max_sectors(q)); 60 61 WARN_ON_ONCE((req_sects << 9) > UINT_MAX); 62 63 bio = blk_next_bio(bio, 0, gfp_mask); 64 bio->bi_iter.bi_sector = sector; 65 bio_set_dev(bio, bdev); 66 bio_set_op_attrs(bio, op, 0); 67 68 bio->bi_iter.bi_size = req_sects << 9; 69 sector += req_sects; 70 nr_sects -= req_sects; [snipped] 79 } 80 81 *biop = bio; 82 return 0; 83 } 84 EXPORT_SYMBOL(__blkdev_issue_discard); At line 58-59, to discard a 50GB range, req_sects is set as the return value of bio_allowed_max_sectors(q), which is 8388607 sectors. In the above case, the discard granularity is 2048 sectors; although the start LBA and discard length are aligned to the discard granularity, req_sects never has a chance to be aligned to the discard granularity. This is why there are some still-mapped 2048-sector fragments in every 4 or 8 GB range. If req_sects at line 58 is set to a value aligned to discard_granularity and close to UINT_MAX, then all consequent split bios inside the device driver are (almost) aligned to discard_granularity of the device queue. The 2048-sector still-mapped fragments will disappear. 
This patch introduces bio_aligned_discard_max_sectors() to return the value which is aligned to q->limits.discard_granularity and closest to UINT_MAX. Then this patch replaces bio_allowed_max_sectors() with this new routine to decide a more proper split bio length. But we still need to handle the situation when the discard start LBA is not aligned to q->limits.discard_granularity, otherwise even if the length is aligned, the current code may still leave a 2048-sector fragment around every 4GB range. Therefore, to calculate req_sects, firstly the start LBA of the discard range is checked (including partition offset); if it is not aligned to the discard granularity, the first split location should make sure the following bio has bi_sector aligned to the discard granularity. Then there won't be a still-mapped fragment in the middle of the discard range. The above is how this patch improves discard bio alignment in __blkdev_issue_discard(). Now with this patch, after a discard with the same command line mentioned previously, sg_get_lba_status returns, descriptor LBA: 0x0000000000000000 blocks: 106954752 deallocated descriptor LBA: 0x0000000006600000 blocks: 0 deallocated We can see there is no 2048-sector segment anymore, everything is clean. Reported-and-tested-by: Acshai Manoj <acshai.manoj@microfocus.com> Signed-off-by: Coly Li <colyli@suse.de> Reviewed-by: Hannes Reinecke <hare@suse.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> Reviewed-by: Xiao Ni <xni@redhat.com> Cc: Bart Van Assche <bvanassche@acm.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Enzo Matsumiya <ematsumiya@suse.com> Cc: Jens Axboe <axboe@kernel.dk> Signed-off-by: Jens Axboe <axboe@kernel.dk>
433 lines
11 KiB
C
433 lines
11 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Functions related to generic helpers functions
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/module.h>
|
|
#include <linux/bio.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/scatterlist.h>
|
|
|
|
#include "blk.h"
|
|
|
|
/*
 * Allocate a new bio via bio_alloc(@gfp, @nr_pages) and return it.
 *
 * If the caller passes an existing @bio, that bio is first chained to the
 * newly allocated one and then submitted; with a NULL @bio only the
 * allocation happens.
 */
struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
{
	struct bio *next = bio_alloc(gfp, nr_pages);

	/* Nothing to hand off yet: this is the first bio of the chain. */
	if (!bio)
		return next;

	bio_chain(bio, next);
	submit_bio(bio);

	return next;
}
|
|
|
|
/**
 * __blkdev_issue_discard - build and chain discard (or secure erase) bios
 * @bdev:	blockdev to issue discard for
 * @sector:	start sector
 * @nr_sects:	number of sectors to discard
 * @gfp_mask:	memory allocation flags (for bio_alloc)
 * @flags:	BLKDEV_DISCARD_* flags to control behaviour
 * @biop:	pointer to anchor bio
 *
 * Description:
 *  Split the range into bios whose boundaries fall on the queue's
 *  discard granularity. Every bio except the last is submitted through
 *  blk_next_bio(); the last one is stored in @biop and is not submitted
 *  by this function.
 *
 * Return: 0 on success, -ENXIO if the device has no queue, -EPERM if the
 * device is read-only, -EOPNOTSUPP if the requested operation is not
 * supported by the queue or the queue reports a zero discard granularity,
 * -EINVAL if the range is empty or not aligned to the logical block size.
 */
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
		sector_t nr_sects, gfp_t gfp_mask, int flags,
		struct bio **biop)
{
	struct request_queue *q = bdev_get_queue(bdev);
	struct bio *bio = *biop;
	unsigned int op;
	sector_t bs_mask, part_offset = 0;

	if (!q)
		return -ENXIO;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (flags & BLKDEV_DISCARD_SECURE) {
		if (!blk_queue_secure_erase(q))
			return -EOPNOTSUPP;
		op = REQ_OP_SECURE_ERASE;
	} else {
		if (!blk_queue_discard(q))
			return -EOPNOTSUPP;
		op = REQ_OP_DISCARD;
	}

	/*
	 * The split logic below rounds to q->limits.discard_granularity.
	 * If a driver left it at zero the alignment math degenerates: the
	 * computed split size can become 0 (the loop would then never make
	 * progress) or exceed what bi_size can hold. Refuse the request
	 * instead of looping or issuing a truncated discard.
	 */
	if (WARN_ON_ONCE(!q->limits.discard_granularity))
		return -EOPNOTSUPP;

	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
	if ((sector | nr_sects) & bs_mask)
		return -EINVAL;

	if (!nr_sects)
		return -EINVAL;

	/* In case the discard request is in a partition */
	if (bdev->bd_partno)
		part_offset = bdev->bd_part->start_sect;

	while (nr_sects) {
		sector_t granularity_aligned_lba, req_sects;
		sector_t sector_mapped = sector + part_offset;

		granularity_aligned_lba = round_up(sector_mapped,
				q->limits.discard_granularity >> SECTOR_SHIFT);

		/*
		 * Check whether the discard bio starts at a discard_granularity
		 * aligned LBA,
		 * - If no: set (granularity_aligned_lba - sector_mapped) to
		 *   bi_size of the first split bio, then the second bio will
		 *   start at a discard_granularity aligned LBA on the device.
		 * - If yes: use bio_aligned_discard_max_sectors() as the max
		 *   possible bi_size of the first split bio. Then when this bio
		 *   is split in the device driver, the split ones are very
		 *   likely to be aligned to discard_granularity of the
		 *   device's queue.
		 */
		if (granularity_aligned_lba == sector_mapped)
			req_sects = min_t(sector_t, nr_sects,
					  bio_aligned_discard_max_sectors(q));
		else
			req_sects = min_t(sector_t, nr_sects,
					  granularity_aligned_lba - sector_mapped);

		/* bi_size is 32 bits wide; the byte count must fit in it. */
		WARN_ON_ONCE((req_sects << 9) > UINT_MAX);

		bio = blk_next_bio(bio, 0, gfp_mask);
		bio->bi_iter.bi_sector = sector;
		bio_set_dev(bio, bdev);
		bio_set_op_attrs(bio, op, 0);

		bio->bi_iter.bi_size = req_sects << 9;
		sector += req_sects;
		nr_sects -= req_sects;

		/*
		 * We can loop for a long time in here, if someone does
		 * full device discards (like mkfs). Be nice and allow
		 * us to schedule out to avoid softlocking if preempt
		 * is disabled.
		 */
		cond_resched();
	}

	*biop = bio;
	return 0;
}
EXPORT_SYMBOL(__blkdev_issue_discard);
|
|
|
|
/**
|
|
* blkdev_issue_discard - queue a discard
|
|
* @bdev: blockdev to issue discard for
|
|
* @sector: start sector
|
|
* @nr_sects: number of sectors to discard
|
|
* @gfp_mask: memory allocation flags (for bio_alloc)
|
|
* @flags: BLKDEV_DISCARD_* flags to control behaviour
|
|
*
|
|
* Description:
|
|
* Issue a discard request for the sectors in question.
|
|
*/
|
|
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
|
|
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
|
|
{
|
|
struct bio *bio = NULL;
|
|
struct blk_plug plug;
|
|
int ret;
|
|
|
|
blk_start_plug(&plug);
|
|
ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
|
|
&bio);
|
|
if (!ret && bio) {
|
|
ret = submit_bio_wait(bio);
|
|
if (ret == -EOPNOTSUPP)
|
|
ret = 0;
|
|
bio_put(bio);
|
|
}
|
|
blk_finish_plug(&plug);
|
|
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(blkdev_issue_discard);
|
|
|
|
/**
|
|
* __blkdev_issue_write_same - generate number of bios with same page
|
|
* @bdev: target blockdev
|
|
* @sector: start sector
|
|
* @nr_sects: number of sectors to write
|
|
* @gfp_mask: memory allocation flags (for bio_alloc)
|
|
* @page: page containing data to write
|
|
* @biop: pointer to anchor bio
|
|
*
|
|
* Description:
|
|
* Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page.
|
|
*/
|
|
static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
|
|
sector_t nr_sects, gfp_t gfp_mask, struct page *page,
|
|
struct bio **biop)
|
|
{
|
|
struct request_queue *q = bdev_get_queue(bdev);
|
|
unsigned int max_write_same_sectors;
|
|
struct bio *bio = *biop;
|
|
sector_t bs_mask;
|
|
|
|
if (!q)
|
|
return -ENXIO;
|
|
|
|
if (bdev_read_only(bdev))
|
|
return -EPERM;
|
|
|
|
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
|
|
if ((sector | nr_sects) & bs_mask)
|
|
return -EINVAL;
|
|
|
|
if (!bdev_write_same(bdev))
|
|
return -EOPNOTSUPP;
|
|
|
|
/* Ensure that max_write_same_sectors doesn't overflow bi_size */
|
|
max_write_same_sectors = bio_allowed_max_sectors(q);
|
|
|
|
while (nr_sects) {
|
|
bio = blk_next_bio(bio, 1, gfp_mask);
|
|
bio->bi_iter.bi_sector = sector;
|
|
bio_set_dev(bio, bdev);
|
|
bio->bi_vcnt = 1;
|
|
bio->bi_io_vec->bv_page = page;
|
|
bio->bi_io_vec->bv_offset = 0;
|
|
bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
|
|
bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0);
|
|
|
|
if (nr_sects > max_write_same_sectors) {
|
|
bio->bi_iter.bi_size = max_write_same_sectors << 9;
|
|
nr_sects -= max_write_same_sectors;
|
|
sector += max_write_same_sectors;
|
|
} else {
|
|
bio->bi_iter.bi_size = nr_sects << 9;
|
|
nr_sects = 0;
|
|
}
|
|
cond_resched();
|
|
}
|
|
|
|
*biop = bio;
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* blkdev_issue_write_same - queue a write same operation
|
|
* @bdev: target blockdev
|
|
* @sector: start sector
|
|
* @nr_sects: number of sectors to write
|
|
* @gfp_mask: memory allocation flags (for bio_alloc)
|
|
* @page: page containing data
|
|
*
|
|
* Description:
|
|
* Issue a write same request for the sectors in question.
|
|
*/
|
|
int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
|
|
sector_t nr_sects, gfp_t gfp_mask,
|
|
struct page *page)
|
|
{
|
|
struct bio *bio = NULL;
|
|
struct blk_plug plug;
|
|
int ret;
|
|
|
|
blk_start_plug(&plug);
|
|
ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page,
|
|
&bio);
|
|
if (ret == 0 && bio) {
|
|
ret = submit_bio_wait(bio);
|
|
bio_put(bio);
|
|
}
|
|
blk_finish_plug(&plug);
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(blkdev_issue_write_same);
|
|
|
|
/*
 * Build a chain of REQ_OP_WRITE_ZEROES bios covering @nr_sects sectors
 * starting at @sector.
 *
 * Every bio except the last is submitted via blk_next_bio(); the last one is
 * returned through @biop unsubmitted. REQ_NOUNMAP is set on each bio when
 * @flags contains BLKDEV_ZERO_NOUNMAP.
 *
 * Returns 0 on success, -ENXIO if the device has no queue, -EPERM if the
 * device is read-only, and -EOPNOTSUPP if the device reports no write-zeroes
 * capability.
 */
static int __blkdev_issue_write_zeroes(struct block_device *bdev,
		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
		struct bio **biop, unsigned flags)
{
	struct bio *bio = *biop;
	unsigned int max_write_zeroes_sectors;
	struct request_queue *q = bdev_get_queue(bdev);

	if (!q)
		return -ENXIO;

	if (bdev_read_only(bdev))
		return -EPERM;

	max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);

	if (max_write_zeroes_sectors == 0)
		return -EOPNOTSUPP;

	/*
	 * Ensure that max_write_zeroes_sectors doesn't overflow bi_size:
	 * the value is shifted left by 9 below in 32-bit arithmetic, so cap
	 * it at the same per-bio limit the WRITE SAME path uses.
	 */
	max_write_zeroes_sectors = min_t(unsigned int,
					 max_write_zeroes_sectors,
					 bio_allowed_max_sectors(q));

	while (nr_sects) {
		bio = blk_next_bio(bio, 0, gfp_mask);
		bio->bi_iter.bi_sector = sector;
		bio_set_dev(bio, bdev);
		bio->bi_opf = REQ_OP_WRITE_ZEROES;
		if (flags & BLKDEV_ZERO_NOUNMAP)
			bio->bi_opf |= REQ_NOUNMAP;

		if (nr_sects > max_write_zeroes_sectors) {
			bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
			nr_sects -= max_write_zeroes_sectors;
			sector += max_write_zeroes_sectors;
		} else {
			/* Final bio: covers whatever is left. */
			bio->bi_iter.bi_size = nr_sects << 9;
			nr_sects = 0;
		}
		cond_resched();
	}

	*biop = bio;
	return 0;
}
|
|
|
|
/*
|
|
* Convert a number of 512B sectors to a number of pages.
|
|
* The result is limited to a number of pages that can fit into a BIO.
|
|
* Also make sure that the result is always at least 1 (page) for the cases
|
|
* where nr_sects is lower than the number of sectors in a page.
|
|
*/
|
|
static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
|
|
{
|
|
sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512);
|
|
|
|
return min(pages, (sector_t)BIO_MAX_PAGES);
|
|
}
|
|
|
|
/*
 * Zero a sector range by writing ZERO_PAGE(0) repeatedly with ordinary
 * REQ_OP_WRITE bios.
 *
 * Each bio is filled with as many zero pages as bio_add_page() accepts; when
 * it accepts less than requested a new bio is started. Every bio except the
 * last is submitted via blk_next_bio(); the last one is returned through
 * @biop unsubmitted.
 *
 * Returns 0 on success, -ENXIO if the device has no queue, -EPERM if the
 * device is read-only.
 */
static int __blkdev_issue_zero_pages(struct block_device *bdev,
		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
		struct bio **biop)
{
	struct request_queue *q = bdev_get_queue(bdev);
	struct bio *bio = *biop;
	unsigned int want;
	int added = 0;

	if (!q)
		return -ENXIO;

	if (bdev_read_only(bdev))
		return -EPERM;

	while (nr_sects != 0) {
		bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
				   gfp_mask);
		bio->bi_iter.bi_sector = sector;
		bio_set_dev(bio, bdev);
		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

		/*
		 * nr_sects is known to be non-zero here, so at least one add
		 * is always attempted. Keep adding until the bio takes less
		 * than was asked for or the range is exhausted.
		 */
		do {
			want = min((sector_t) PAGE_SIZE, nr_sects << 9);
			added = bio_add_page(bio, ZERO_PAGE(0), want, 0);
			nr_sects -= added >> 9;
			sector += added >> 9;
		} while (added >= want && nr_sects != 0);

		cond_resched();
	}

	*biop = bio;
	return 0;
}
|
|
|
|
/**
|
|
* __blkdev_issue_zeroout - generate number of zero filed write bios
|
|
* @bdev: blockdev to issue
|
|
* @sector: start sector
|
|
* @nr_sects: number of sectors to write
|
|
* @gfp_mask: memory allocation flags (for bio_alloc)
|
|
* @biop: pointer to anchor bio
|
|
* @flags: controls detailed behavior
|
|
*
|
|
* Description:
|
|
* Zero-fill a block range, either using hardware offload or by explicitly
|
|
* writing zeroes to the device.
|
|
*
|
|
* If a device is using logical block provisioning, the underlying space will
|
|
* not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
|
|
*
|
|
* If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return
|
|
* -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
|
|
*/
|
|
int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
|
|
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
|
|
unsigned flags)
|
|
{
|
|
int ret;
|
|
sector_t bs_mask;
|
|
|
|
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
|
|
if ((sector | nr_sects) & bs_mask)
|
|
return -EINVAL;
|
|
|
|
ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
|
|
biop, flags);
|
|
if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
|
|
return ret;
|
|
|
|
return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask,
|
|
biop);
|
|
}
|
|
EXPORT_SYMBOL(__blkdev_issue_zeroout);
|
|
|
|
/**
|
|
* blkdev_issue_zeroout - zero-fill a block range
|
|
* @bdev: blockdev to write
|
|
* @sector: start sector
|
|
* @nr_sects: number of sectors to write
|
|
* @gfp_mask: memory allocation flags (for bio_alloc)
|
|
* @flags: controls detailed behavior
|
|
*
|
|
* Description:
|
|
* Zero-fill a block range, either using hardware offload or by explicitly
|
|
* writing zeroes to the device. See __blkdev_issue_zeroout() for the
|
|
* valid values for %flags.
|
|
*/
|
|
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
|
|
sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
|
|
{
|
|
int ret = 0;
|
|
sector_t bs_mask;
|
|
struct bio *bio;
|
|
struct blk_plug plug;
|
|
bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev);
|
|
|
|
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
|
|
if ((sector | nr_sects) & bs_mask)
|
|
return -EINVAL;
|
|
|
|
retry:
|
|
bio = NULL;
|
|
blk_start_plug(&plug);
|
|
if (try_write_zeroes) {
|
|
ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects,
|
|
gfp_mask, &bio, flags);
|
|
} else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
|
|
ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects,
|
|
gfp_mask, &bio);
|
|
} else {
|
|
/* No zeroing offload support */
|
|
ret = -EOPNOTSUPP;
|
|
}
|
|
if (ret == 0 && bio) {
|
|
ret = submit_bio_wait(bio);
|
|
bio_put(bio);
|
|
}
|
|
blk_finish_plug(&plug);
|
|
if (ret && try_write_zeroes) {
|
|
if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
|
|
try_write_zeroes = false;
|
|
goto retry;
|
|
}
|
|
if (!bdev_write_zeroes_sectors(bdev)) {
|
|
/*
|
|
* Zeroing offload support was indicated, but the
|
|
* device reported ILLEGAL REQUEST (for some devices
|
|
* there is no non-destructive way to verify whether
|
|
* WRITE ZEROES is actually supported).
|
|
*/
|
|
ret = -EOPNOTSUPP;
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(blkdev_issue_zeroout);
|