Merge tag 'fs-atomic_2024-11-05' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into vfs.untorn.writes

Snapshot of untorn@fs-atomic#ritesh.list_ext4-do-not-fallback-to-buffered-io-for-dio-atomic-write at Tue Nov  5 16:20:51 PST 2024

Link: https://lore.kernel.org/r/20241106-zerkleinern-verzweifeln-7ec8173c56ad@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner 2024-11-06 10:48:07 +01:00
commit 267bf1dd0d
No known key found for this signature in database
GPG Key ID: 91C61BC06578DCA2
17 changed files with 254 additions and 27 deletions

View File

@ -513,6 +513,21 @@ IOMAP_WRITE`` with any combination of the following enhancements:
if the mapping is unwritten and the filesystem cannot handle zeroing
the unaligned regions without exposing stale contents.
* ``IOMAP_ATOMIC``: This write is being issued with torn-write
protection.
Only a single bio can be created for the write, and the write must
not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
set.
The file range to write must be aligned to satisfy the requirements
of both the filesystem and the underlying block device's atomic
commit capabilities.
If filesystem metadata updates are required (e.g. unwritten extent
conversion or copy on write), all updates for the entire file range
must be committed atomically as well.
Only one space mapping is allowed per untorn write.
Untorn writes must be aligned to, and must not be longer than, a
single file block.
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
calling this function.

View File

@ -35,13 +35,10 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
return opf;
}
static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
struct iov_iter *iter, bool is_atomic)
static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
struct iov_iter *iter)
{
if (is_atomic && !generic_atomic_write_valid(iter, pos))
return true;
return pos & (bdev_logical_block_size(bdev) - 1) ||
return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
!bdev_iter_is_aligned(bdev, iter);
}
@ -368,13 +365,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))
if (blkdev_dio_invalid(bdev, iocb, iter))
return -EINVAL;
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
@ -383,7 +379,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return __blkdev_direct_IO_simple(iocb, iter, bdev,
nr_pages);
return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
} else if (is_atomic) {
} else if (iocb->ki_flags & IOCB_ATOMIC) {
return -EINVAL;
}
return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
@ -625,7 +621,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (!bdev)
return -ENXIO;
if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT)
if (bdev_can_atomic_write(bdev))
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
@ -700,6 +696,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
return -EOPNOTSUPP;
if (iocb->ki_flags & IOCB_ATOMIC) {
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
}
size -= iocb->ki_pos;
if (iov_iter_count(from) > size) {
shorted = iov_iter_count(from) - size;

View File

@ -1729,6 +1729,10 @@ struct ext4_sb_info {
*/
struct work_struct s_sb_upd_work;
/* Atomic write unit values in bytes */
unsigned int s_awu_min;
unsigned int s_awu_max;
/* Ext4 fast commit sub transaction ID */
atomic_t s_fc_subtid;
@ -3855,6 +3859,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
return buffer_uptodate(bh);
}
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
{
return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
}
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block);

View File

@ -599,6 +599,13 @@ out:
ssize_t err;
loff_t endbyte;
/*
* There is no support for atomic writes on buffered-io yet,
* we should never fallback to buffered-io for DIO atomic
* writes.
*/
WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);
offset = iocb->ki_pos;
err = ext4_buffered_write_iter(iocb, from);
if (err < 0)
@ -692,6 +699,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
if (iocb->ki_flags & IOCB_ATOMIC) {
size_t len = iov_iter_count(from);
int ret;
if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
len > EXT4_SB(inode->i_sb)->s_awu_max)
return -EINVAL;
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
}
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
else
@ -884,6 +905,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
if (ext4_inode_can_atomic_write(inode))
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return dquot_file_open(inode, filp);
}

View File

@ -3444,17 +3444,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
return ret;
}
static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
{
/* must be a directio to fall back to buffered */
if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) !=
(IOMAP_WRITE | IOMAP_DIRECT))
return false;
/* atomic writes are all-or-nothing */
if (flags & IOMAP_ATOMIC)
return false;
/* can only try again if we wrote nothing */
return written == 0;
}
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
/*
* Check to see whether an error occurred while writing out the data to
* the allocated blocks. If so, return the magic error code so that we
* fallback to buffered I/O and attempt to complete the remainder of
* the I/O. Any blocks that may have been allocated in preparation for
* the direct I/O will be reused during buffered I/O.
* the allocated blocks. If so, return the magic error code for
* non-atomic write so that we fallback to buffered I/O and attempt to
* complete the remainder of the I/O.
* For non-atomic writes, any blocks that may have been
* allocated in preparation for the direct I/O will be reused during
* buffered I/O. For atomic write, we never fallback to buffered-io.
*/
if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
if (ext4_want_directio_fallback(flags, written))
return -ENOTBLK;
return 0;
@ -5578,6 +5595,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
}
}
if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned int awu_min = 0, awu_max = 0;
if (ext4_inode_can_atomic_write(inode)) {
awu_min = sbi->s_awu_min;
awu_max = sbi->s_awu_max;
}
generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
}
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;

View File

@ -4425,6 +4425,36 @@ static int ext4_handle_clustersize(struct super_block *sb)
return 0;
}
/*
* ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
* @sb: super block
* TODO: Later add support for bigalloc
*/
static void ext4_atomic_write_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct block_device *bdev = sb->s_bdev;
if (!bdev_can_atomic_write(bdev))
return;
if (!ext4_has_feature_extents(sb))
return;
sbi->s_awu_min = max(sb->s_blocksize,
bdev_atomic_write_unit_min_bytes(bdev));
sbi->s_awu_max = min(sb->s_blocksize,
bdev_atomic_write_unit_max_bytes(bdev));
if (sbi->s_awu_min && sbi->s_awu_max &&
sbi->s_awu_min <= sbi->s_awu_max) {
ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
sbi->s_awu_min, sbi->s_awu_max);
} else {
sbi->s_awu_min = 0;
sbi->s_awu_max = 0;
}
}
static void ext4_fast_commit_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@ -5336,6 +5366,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
spin_lock_init(&sbi->s_bdev_wb_lock);
ext4_atomic_write_init(sb);
ext4_fast_commit_init(sb);
sb->s_root = NULL;

View File

@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua)
const struct iomap *iomap, bool use_fua, bool atomic)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
if (atomic)
opflags |= REQ_ATOMIC;
return opflags;
}
@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
const loff_t length = iomap_length(iter);
bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
if (atomic && length != fs_block_size)
return -EINVAL;
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
@ -382,7 +388,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* can set up the page vector appropriately for a ZONE_APPEND
* operation.
*/
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@ -415,6 +421,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
if (WARN_ON_ONCE(atomic && n != length)) {
/*
* This bio should have covered the complete length,
* which it doesn't, so error. We may need to zero out
* the tail (complete FS block), similar to when
* bio_iov_iter_get_pages() returns an error, above.
*/
ret = -EINVAL;
bio_put(bio);
goto zero_tail;
}
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
@ -598,6 +615,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
if (iocb->ki_flags & IOCB_ATOMIC)
iomi.flags |= IOMAP_ATOMIC;
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@ -659,7 +679,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret != -EAGAIN) {
trace_iomap_dio_invalidate_fail(inode, iomi.pos,
iomi.len);
ret = -ENOTBLK;
if (iocb->ki_flags & IOCB_ATOMIC) {
/*
* folio invalidation failed, maybe
* this is transient, unlock and see if
* the caller tries again.
*/
ret = -EAGAIN;
} else {
/* fall back to buffered write */
ret = -ENOTBLK;
}
}
goto out_free_dio;
}

View File

@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
{ IOMAP_NOWAIT, "NOWAIT" }
{ IOMAP_NOWAIT, "NOWAIT" }, \
{ IOMAP_ATOMIC, "ATOMIC" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \

View File

@ -1830,18 +1830,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
return 0;
}
bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
{
size_t len = iov_iter_count(iter);
if (!iter_is_ubuf(iter))
return false;
return -EINVAL;
if (!is_power_of_2(len))
return false;
return -EINVAL;
if (!IS_ALIGNED(pos, len))
return false;
if (!IS_ALIGNED(iocb->ki_pos, len))
return -EINVAL;
return true;
if (!(iocb->ki_flags & IOCB_DIRECT))
return -EOPNOTSUPP;
return 0;
}
EXPORT_SYMBOL_GPL(generic_atomic_write_valid);

View File

@ -2115,6 +2115,13 @@ xfs_alloc_buftarg(
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
if (bdev_can_atomic_write(btp->bt_bdev)) {
btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
btp->bt_bdev);
btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
btp->bt_bdev);
}
/*
* When allocating the buftargs we have not yet read the super block and
* thus don't know the file system sector size yet.

View File

@ -124,6 +124,10 @@ struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
/* Atomic write unit values */
unsigned int bt_bdev_awu_min;
unsigned int bt_bdev_awu_max;
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
};

View File

@ -822,6 +822,20 @@ xfs_file_write_iter(
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
if (iocb->ki_flags & IOCB_ATOMIC) {
/*
* Currently only atomic writing of a single FS block is
* supported. It would be possible to atomic write smaller than
* a FS block, but there is no requirement to support this.
* Note that iomap also does not support this yet.
*/
if (ocount != ip->i_mount->m_sb.sb_blocksize)
return -EINVAL;
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
}
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@ -1209,6 +1223,8 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
if (xfs_inode_can_atomicwrite(XFS_I(inode)))
file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}

View File

@ -327,6 +327,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
static inline bool
xfs_inode_can_atomicwrite(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
return false;
if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
return false;
return true;
}
/*
* In-core inode flags.
*/

View File

@ -570,6 +570,20 @@ xfs_stat_blksize(
return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
}
static void
xfs_get_atomic_write_attr(
struct xfs_inode *ip,
unsigned int *unit_min,
unsigned int *unit_max)
{
if (!xfs_inode_can_atomicwrite(ip)) {
*unit_min = *unit_max = 0;
return;
}
*unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
}
STATIC int
xfs_vn_getattr(
struct mnt_idmap *idmap,
@ -643,6 +657,14 @@ xfs_vn_getattr(
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
}
if (request_mask & STATX_WRITE_ATOMIC) {
unsigned int unit_min, unit_max;
xfs_get_atomic_write_attr(ip, &unit_min,
&unit_max);
generic_fill_statx_atomic_writes(stat,
unit_min, unit_max);
}
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);

View File

@ -1674,6 +1674,22 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev)
return true;
}
static inline unsigned int
bdev_atomic_write_unit_min_bytes(struct block_device *bdev)
{
if (!bdev_can_atomic_write(bdev))
return 0;
return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev));
}
static inline unsigned int
bdev_atomic_write_unit_max_bytes(struct block_device *bdev)
{
if (!bdev_can_atomic_write(bdev))
return 0;
return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev));
}
#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
#endif /* _LINUX_BLKDEV_H */

View File

@ -3721,6 +3721,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path)
return !c;
}
bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos);
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
#endif /* _LINUX_FS_H */

View File

@ -178,6 +178,7 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
#define IOMAP_ATOMIC (1 << 9)
struct iomap_ops {
/*