vfs-6.9.rw_hint

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZdcKwQAKCRCRxhvAZXjc
 oldXAP4uzKixPvJeJmmuLs8Yl2X4g4SnxXFoLwMjCOxGSH1DWQD+Oj0nGs81lIKm
 iLCZwk09JzfVEat/6KVmkjiqLLTwNgw=
 =TmTQ
 -----END PGP SIGNATURE-----
gpgsig -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZeYHOAAKCRCRxhvAZXjc
 opwvAP0fqxfEAS04/MNdYSf0dA5GMr8v+8RBablWtkVuOMMbRQD/RMFJKXK02afq
 B4YUemRHtYETdbV69+yzninHy8y4gQQ=
 =ThqF
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.9.rw_hint' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs

Pull write hint fix from Christian Brauner:

UFS devices are widely used in mobile applications, e.g. in smartphones.
UFS vendors need data lifetime information to achieve good performance.
Providing data lifetime information to UFS devices can result in up to
40% lower write amplification. Hence this patch series that restores the
bi_write_hint member in struct bio. After this patch series has been
merged, patches that implement data lifetime support in the SCSI disk
(sd) driver will be sent to the Linux kernel SCSI maintainer.

The following changes are included in this patch series:

- Improvements for the F_GET_RW_HINT and F_SET_RW_HINT fcntls.
- Move enum rw_hint into a new header file.
- Support F_SET_RW_HINT for block devices to make it easy to test data
  lifetime support.
- Restore the bio.bi_write_hint member and restore support in the VFS
  layer and also in the block layer for data lifetime information.

The shell script that has been used to test the patch series combined
with the SCSI patches is available at the end of this cover letter.

* tag 'vfs-6.9.rw_hint' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs:
  block, fs: Restore the per-bio/request data lifetime fields
  fs: Propagate write hints to the struct block_device inode
  fs: Move enum rw_hint into a new header file
  fs: Split fcntl_rw_hint()
  fs: Verify write lifetime constants at compile time
  fs: Fix rw_hint validation

Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner 2024-03-04 18:35:21 +01:00
commit 86835c39e0
18 changed files with 102 additions and 42 deletions

View File

@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_opf = opf;
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@ -813,6 +814,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
bio_set_flag(bio, BIO_CLONED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
if (bio->bi_bdev) {

View File

@ -172,6 +172,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;

View File

@ -810,6 +810,10 @@ static struct request *attempt_merge(struct request_queue *q,
if (rq_data_dir(req) != rq_data_dir(next))
return NULL;
/* Don't merge requests with different write hints. */
if (req->write_hint != next->write_hint)
return NULL;
if (req->ioprio != next->ioprio)
return NULL;
@ -937,6 +941,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
/* Don't merge requests with different write hints. */
if (rq->write_hint != bio->bi_write_hint)
return false;
if (rq->ioprio != bio_prio(bio))
return false;

View File

@ -2585,6 +2585,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
rq->cmd_flags |= REQ_FAILFAST_MASK;
rq->__sector = bio->bi_iter.bi_sector;
rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
@ -3185,6 +3186,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
rq->ioprio = rq_src->ioprio;
rq->write_hint = rq_src->write_hint;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;

View File

@ -169,6 +169,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;

View File

@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
bio.bi_ioprio = iocb->ki_ioprio;
ret = bio_iov_iter_get_pages(&bio, iter);
@ -203,6 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
@ -321,6 +323,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->flags = 0;
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;

View File

@ -55,7 +55,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
struct writeback_control *wbc);
enum rw_hint hint, struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@ -1944,7 +1945,8 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
}
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
enum rw_hint write_hint,
struct writeback_control *wbc)
{
const enum req_op op = opf & REQ_OP_MASK;
@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_write_hint = write_hint;
__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
submit_bh_wbc(opf, bh, NULL);
submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
}
EXPORT_SYMBOL(submit_bh);

View File

@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_end_io = dio_bio_end_io;
if (dio->is_pinned)
bio_set_flag(bio, BIO_PAGE_PINNED);
bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint;
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}

View File

@ -24,6 +24,7 @@
#include <linux/blkdev.h>
#include <linux/quotaops.h>
#include <linux/part_stat.h>
#include <linux/rw_hint.h>
#include <crypto/hash.h>
#include <linux/fscrypt.h>

View File

@ -27,6 +27,7 @@
#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/rw_hint.h>
#include <linux/poll.h>
#include <asm/siginfo.h>
@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
}
#endif
static bool rw_hint_valid(enum rw_hint hint)
static bool rw_hint_valid(u64 hint)
{
BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);
switch (hint) {
case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint)
}
}
static long fcntl_rw_hint(struct file *file, unsigned int cmd,
unsigned long arg)
static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
enum rw_hint hint;
u64 h;
u64 hint = READ_ONCE(inode->i_write_hint);
switch (cmd) {
case F_GET_RW_HINT:
h = inode->i_write_hint;
if (copy_to_user(argp, &h, sizeof(*argp)))
return -EFAULT;
return 0;
case F_SET_RW_HINT:
if (copy_from_user(&h, argp, sizeof(h)))
return -EFAULT;
hint = (enum rw_hint) h;
if (!rw_hint_valid(hint))
return -EINVAL;
if (copy_to_user(argp, &hint, sizeof(*argp)))
return -EFAULT;
return 0;
}
inode_lock(inode);
inode->i_write_hint = hint;
inode_unlock(inode);
return 0;
default:
static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
u64 hint;
if (copy_from_user(&hint, argp, sizeof(hint)))
return -EFAULT;
if (!rw_hint_valid(hint))
return -EINVAL;
}
WRITE_ONCE(inode->i_write_hint, hint);
/*
* file->f_mapping->host may differ from inode. As an example,
* blkdev_open() modifies file->f_mapping.
*/
if (file->f_mapping->host != inode)
WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);
return 0;
}
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
err = fcntl_get_rw_hint(filp, cmd, arg);
break;
case F_SET_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
err = fcntl_set_rw_hint(filp, cmd, arg);
break;
default:
break;

View File

@ -20,6 +20,7 @@
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <linux/rw_hint.h>
#include <trace/events/writeback.h>
#include "internal.h"

View File

@ -1690,6 +1690,7 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_end_io = iomap_writepage_end_bio;
wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
ioend = iomap_ioend_from_bio(bio);
INIT_LIST_HEAD(&ioend->io_list);

View File

@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_write_hint = inode->i_write_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;

View File

@ -605,6 +605,7 @@ alloc_new:
GFP_NOFS);
bio->bi_iter.bi_sector = first_block << (blkbits - 9);
wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
}
/*

View File

@ -8,6 +8,7 @@
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>
struct blk_mq_tags;
struct blk_flush_queue;
@ -135,6 +136,7 @@ struct request {
struct blk_crypto_keyslot *crypt_keyslot;
#endif
enum rw_hint write_hint;
unsigned short ioprio;
enum mq_rq_state state;

View File

@ -10,6 +10,7 @@
#include <linux/bvec.h>
#include <linux/device.h>
#include <linux/ktime.h>
#include <linux/rw_hint.h>
struct bio_set;
struct bio;
@ -269,6 +270,7 @@ struct bio {
*/
unsigned short bi_flags; /* BIO_* below */
unsigned short bi_ioprio;
enum rw_hint bi_write_hint;
blk_status_t bi_status;
atomic_t __bi_remaining;

View File

@ -43,6 +43,7 @@
#include <linux/cred.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/rw_hint.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@ -309,19 +310,6 @@ struct address_space;
struct writeback_control;
struct readahead_control;
/*
* Write life time hint values.
* Stored in struct inode as u8.
*/
enum rw_hint {
WRITE_LIFE_NOT_SET = 0,
WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
};
/* Match RWF_* bits to IOCB bits */
#define IOCB_HIPRI (__force int) RWF_HIPRI
#define IOCB_DSYNC (__force int) RWF_DSYNC
@ -677,7 +665,7 @@ struct inode {
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
u8 i_blkbits;
u8 i_write_hint;
enum rw_hint i_write_hint;
blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED

24
include/linux/rw_hint.h Normal file
View File

@ -0,0 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RW_HINT_H
#define _LINUX_RW_HINT_H
#include <linux/build_bug.h>
#include <linux/compiler_attributes.h>
#include <uapi/linux/fcntl.h>
/*
 * Block storage write lifetime hint values.
 *
 * Each WRITE_LIFE_* constant is defined as the RWH_WRITE_LIFE_* constant
 * with the same suffix, so the two sets of names are numerically
 * interchangeable.
 *
 * The enum is declared __packed; the static_assert that follows this
 * definition checks that sizeof(enum rw_hint) == 1 (that check is
 * compiled out when __CHECKER__ is defined).
 */
enum rw_hint {
WRITE_LIFE_NOT_SET = RWH_WRITE_LIFE_NOT_SET,
WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
} __packed;
/* Sparse ignores __packed annotations on enums, hence the #ifndef below. */
#ifndef __CHECKER__
static_assert(sizeof(enum rw_hint) == 1);
#endif
#endif /* _LINUX_RW_HINT_H */