From 9a8dbdadae509e5717ff6e5aa572ca0974d2101d Mon Sep 17 00:00:00 2001 From: John Garry Date: Sat, 19 Oct 2024 12:51:06 +0000 Subject: [PATCH 01/13] block/fs: Pass an iocb to generic_atomic_write_valid() Darrick and Hannes both thought it better that generic_atomic_write_valid() should be passed a struct iocb, and not just the member of that struct which is referenced; see [0] and [1]. I think that makes a more generic and clean API, so make that change. [0] https://lore.kernel.org/linux-block/680ce641-729b-4150-b875-531a98657682@suse.de/ [1] https://lore.kernel.org/linux-xfs/20240620212401.GA3058325@frogsfrogsfrogs/ Fixes: c34fc6f26ab8 ("fs: Initial atomic write support") Suggested-by: Darrick J. Wong Suggested-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241019125113.369994-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/fops.c | 8 ++++---- fs/read_write.c | 4 ++-- include/linux/fs.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/fops.c b/block/fops.c index e696ae53bf1e..968b47b615c4 100644 --- a/block/fops.c +++ b/block/fops.c @@ -35,13 +35,13 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) return opf; } -static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos, +static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, struct iov_iter *iter, bool is_atomic) { - if (is_atomic && !generic_atomic_write_valid(iter, pos)) + if (is_atomic && !generic_atomic_write_valid(iocb, iter)) return true; - return pos & (bdev_logical_block_size(bdev) - 1) || + return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } @@ -374,7 +374,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (!iov_iter_count(iter)) return 0; - if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic)) + if (blkdev_dio_invalid(bdev, iocb, iter, is_atomic)) return -EINVAL; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); diff --git a/fs/read_write.c b/fs/read_write.c index 64dc24afdb3a..2c3263530828 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1830,7 +1830,7 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } -bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) +bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); @@ -1840,7 +1840,7 @@ bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) if (!is_power_of_2(len)) return false; - if (!IS_ALIGNED(pos, len)) + if (!IS_ALIGNED(iocb->ki_pos, len)) return false; return true; diff --git a/include/linux/fs.h b/include/linux/fs.h index e3c603d01337..fbfa032d1d90 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3721,6 +3721,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) return !c; } -bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos); +bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */ From c3be7ebbbce5201e151f17e28a6c807602f369c9 Mon Sep 17 00:00:00 2001 From: John Garry Date: Sat, 19 Oct 2024 12:51:07 +0000 Subject: [PATCH 02/13] fs/block: Check for IOCB_DIRECT in generic_atomic_write_valid() Currently FMODE_CAN_ATOMIC_WRITE is set if the bdev can atomic write and the file is open for direct IO. This does not work if the file is not opened for direct IO, yet fcntl(O_DIRECT) is used on the fd later. Change to check for direct IO on a per-IO basis in generic_atomic_write_valid(). Since we want to report -EOPNOTSUPP for non-direct IO for an atomic write, change to return an error code. Relocate the block fops atomic write checks to the common write path, as to catch non-direct IO. Fixes: c34fc6f26ab8 ("fs: Initial atomic write support") Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241019125113.369994-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/fops.c | 18 ++++++++++-------- fs/read_write.c | 13 ++++++++----- include/linux/fs.h | 2 +- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/block/fops.c b/block/fops.c index 968b47b615c4..2d01c9007681 100644 --- a/block/fops.c +++ b/block/fops.c @@ -36,11 +36,8 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) } static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, - struct iov_iter *iter, bool is_atomic) + struct iov_iter *iter) { - if (is_atomic && !generic_atomic_write_valid(iocb, iter)) - return true; - return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } @@ -368,13 +365,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); - bool is_atomic = iocb->ki_flags & IOCB_ATOMIC; unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; - if (blkdev_dio_invalid(bdev, iocb, iter, is_atomic)) + if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); @@ -383,7 +379,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); - } else if (is_atomic) { + } else if (iocb->ki_flags & IOCB_ATOMIC) { return -EINVAL; } return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); @@ -625,7 +621,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (!bdev) return -ENXIO; - if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT) + if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); @@ -700,6 +696,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; + if (iocb->ki_flags & IOCB_ATOMIC) { + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + size -= iocb->ki_pos; if (iov_iter_count(from) > size) { shorted = iov_iter_count(from) - size; diff --git a/fs/read_write.c b/fs/read_write.c index 2c3263530828..befec0b5c537 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1830,18 +1830,21 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } -bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) - return false; + return -EINVAL; if (!is_power_of_2(len)) - return false; + return -EINVAL; if (!IS_ALIGNED(iocb->ki_pos, len)) - return false; + return -EINVAL; - return true; + if (!(iocb->ki_flags & IOCB_DIRECT)) + return -EOPNOTSUPP; + + return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index fbfa032d1d90..ba47fb283730 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3721,6 +3721,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) return !c; } -bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */ From 1eadb157947163ca72ba8963b915fdc099ce6cca Mon Sep 17 00:00:00 2001 From: John Garry Date: Sat, 19 Oct 2024 12:51:08 +0000 Subject: [PATCH 03/13] block: Add bdev atomic write limits helpers Add helpers to get atomic write limits for a bdev, so that we don't access request_queue helpers outside the block layer. We check if the bdev can actually atomic write in these helpers, so we can avoid users missing using this check. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241019125113.369994-4-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50c3b959da28..c2cc3c146d74 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1674,6 +1674,22 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev) return true; } +static inline unsigned int +bdev_atomic_write_unit_min_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev)); +} + +static inline unsigned int +bdev_atomic_write_unit_max_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ From a570bad16b9f5252db2f342622bd71febb39a19c Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 4 Nov 2024 16:14:02 -0800 Subject: [PATCH 04/13] fs: Export generic_atomic_write_valid() The XFS code will need this. Reviewed-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Signed-off-by: Darrick J. Wong --- fs/read_write.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/read_write.c b/fs/read_write.c index befec0b5c537..3e5dad12a5b4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1848,3 +1848,4 @@ int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) return 0; } +EXPORT_SYMBOL_GPL(generic_atomic_write_valid); From 9e0933c21c128d6d8ac4d8aae0babaf9a43100b8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 4 Nov 2024 16:14:02 -0800 Subject: [PATCH 05/13] fs: iomap: Atomic write support Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC flag set. Initially FSes (XFS) should only support writing a single FS block atomically. As with any atomic write, we should produce a single bio which covers the complete write length. Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Reviewed-by: Ritesh Harjani (IBM) [djwong: clarify a couple of things in the docs] Signed-off-by: Darrick J. Wong --- .../filesystems/iomap/operations.rst | 15 ++++++++ fs/iomap/direct-io.c | 38 +++++++++++++++++-- fs/iomap/trace.h | 3 +- include/linux/iomap.h | 1 + 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 8e6c721d2330..ee790f843cfa 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -513,6 +513,21 @@ IOMAP_WRITE`` with any combination of the following enhancements: if the mapping is unwritten and the filesystem cannot handle zeroing the unaligned regions without exposing stale contents. + * ``IOMAP_ATOMIC``: This write is being issued with torn-write + protection. + Only a single bio can be created for the write, and the write must + not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be + set. + The file range to write must be aligned to satisfy the requirements + of both the filesystem and the underlying block device's atomic + commit capabilities. + If filesystem metadata updates are required (e.g. unwritten extent + conversion or copy on write), all updates for the entire file range + must be committed atomically as well. + Only one space mapping is allowed per untorn write. + Untorn writes must be aligned to, and must not be longer than, a + single file block. + Callers commonly hold ``i_rwsem`` in shared or exclusive mode before calling this function. diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f637aa0706a3..ed4764e3b8f0 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua) + const struct iomap *iomap, bool use_fua, bool atomic) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + if (atomic) + opflags |= REQ_ATOMIC; return opflags; } @@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; - loff_t length = iomap_length(iter); + const loff_t length = iomap_length(iter); + bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; + if (atomic && length != fs_block_size) + return -EINVAL; + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; @@ -382,7 +388,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * can set up the page vector appropriately for a ZONE_APPEND * operation. */ - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -415,6 +421,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; + if (WARN_ON_ONCE(atomic && n != length)) { + /* + * This bio should have covered the complete length, + * which it doesn't, so error. We may need to zero out + * the tail (complete FS block), similar to when + * bio_iov_iter_get_pages() returns an error, above. + */ + ret = -EINVAL; + bio_put(bio); + goto zero_tail; + } if (dio->flags & IOMAP_DIO_WRITE) { task_io_account_write(n); } else { @@ -598,6 +615,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -659,7 +679,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret != -EAGAIN) { trace_iomap_dio_invalidate_fail(inode, iomi.pos, iomi.len); - ret = -ENOTBLK; + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * folio invalidation failed, maybe + * this is transient, unlock and see if + * the caller tries again. + */ + ret = -EAGAIN; + } else { + /* fall back to buffered write */ + ret = -ENOTBLK; + } } goto out_free_dio; } diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 0a991c4ce87d..4118a42cdab0 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ - { IOMAP_NOWAIT, "NOWAIT" } + { IOMAP_NOWAIT, "NOWAIT" }, \ + { IOMAP_ATOMIC, "ATOMIC" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4ad12a3c8bae..c7644bdcfca3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -178,6 +178,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ +#define IOMAP_ATOMIC (1 << 9) struct iomap_ops { /* From 6432c6e723fffd93e5cb65117ff48a3aa734e259 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 4 Nov 2024 16:14:03 -0800 Subject: [PATCH 06/13] xfs: Support atomic write for statx Support providing info on atomic write unit min and max for an inode. For simplicity, currently we limit the min at the FS block size. As for max, we limit also at FS block size, as there is no current method to guarantee extent alignment or granularity for regular files. Reviewed-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 7 +++++++ fs/xfs/xfs_buf.h | 4 ++++ fs/xfs/xfs_inode.h | 15 +++++++++++++++ fs/xfs/xfs_iops.c | 22 ++++++++++++++++++++++ 4 files changed, 48 insertions(+) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa4dbda7b536..e8196f5778e2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2115,6 +2115,13 @@ xfs_alloc_buftarg( btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); + if (bdev_can_atomic_write(btp->bt_bdev)) { + btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( + btp->bt_bdev); + btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( + btp->bt_bdev); + } + /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 209a389f2abc..3d56bc7a35cc 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -124,6 +124,10 @@ struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; + /* Atomic write unit values */ + unsigned int bt_bdev_awu_min; + unsigned int bt_bdev_awu_max; + /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; }; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 97ed912306fd..73009a25a119 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -327,6 +327,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) +static inline bool +xfs_inode_can_atomicwrite( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min) + return false; + if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max) + return false; + + return true; +} + /* * In-core inode flags. */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ee79cf161312..5cd804812efd 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -570,6 +570,20 @@ xfs_stat_blksize( return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); } +static void +xfs_get_atomic_write_attr( + struct xfs_inode *ip, + unsigned int *unit_min, + unsigned int *unit_max) +{ + if (!xfs_inode_can_atomicwrite(ip)) { + *unit_min = *unit_max = 0; + return; + } + + *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize; +} + STATIC int xfs_vn_getattr( struct mnt_idmap *idmap, @@ -643,6 +657,14 @@ xfs_vn_getattr( stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); } + if (request_mask & STATX_WRITE_ATOMIC) { + unsigned int unit_min, unit_max; + + xfs_get_atomic_write_attr(ip, &unit_min, + &unit_max); + generic_fill_statx_atomic_writes(stat, + unit_min, unit_max); + } fallthrough; default: stat->blksize = xfs_stat_blksize(ip); From f096207d327692a066954435dafb4bedbc031d9e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 4 Nov 2024 16:14:04 -0800 Subject: [PATCH 07/13] xfs: Validate atomic writes Validate that an atomic write adheres to length/offset rules. Currently we can only write a single FS block. For an IOCB with IOCB_ATOMIC set to get as far as xfs_file_write_iter(), FMODE_CAN_ATOMIC_WRITE will need to be set for the file; for this, ATOMICWRITES flags would also need to be set for the inode. Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 412b1d71b52b..44be10490342 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -822,6 +822,20 @@ xfs_file_write_iter( if (IS_DAX(inode)) return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * Currently only atomic writing of a single FS block is + * supported. It would be possible to atomic write smaller than + * a FS block, but there is no requirement to support this. + * Note that iomap also does not support this yet. + */ + if (ocount != ip->i_mount->m_sb.sb_blocksize) + return -EINVAL; + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered From 3af5298ce94bb672845adadf347874ba5bcfad2c Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 4 Nov 2024 16:14:04 -0800 Subject: [PATCH 08/13] xfs: Support setting FMODE_CAN_ATOMIC_WRITE Set FMODE_CAN_ATOMIC_WRITE flag if we can atomic write for that inode. Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: John Garry Tested-by: Ojaswin Mujoo #On ppc64 --- fs/xfs/xfs_file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 44be10490342..c94019b12537 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1223,6 +1223,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; + if (xfs_inode_can_atomicwrite(XFS_I(inode))) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } From 6dfc1c1d597f8b6ebffe25f51f013494994f9b84 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 4 Nov 2024 16:22:57 -0800 Subject: [PATCH 09/13] ext4: Add statx support for atomic writes This patch adds base support for atomic writes via statx getattr. On bs < ps systems, we can create FS with say bs of 16k. That means both atomic write min and max unit can be set to 16k for supporting atomic writes. Co-developed-by: Ojaswin Mujoo Signed-off-by: Ojaswin Mujoo Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 10 ++++++++++ fs/ext4/inode.c | 12 ++++++++++++ fs/ext4/super.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 44b0d418143c..494d443e9fc9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1729,6 +1729,10 @@ struct ext4_sb_info { */ struct work_struct s_sb_upd_work; + /* Atomic write unit values in bytes */ + unsigned int s_awu_min; + unsigned int s_awu_max; + /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; @@ -3855,6 +3859,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } +static inline bool ext4_inode_can_atomic_write(struct inode *inode) +{ + + return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; +} + extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 54bdd4884fe6..3e827cfa762e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5578,6 +5578,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, } } + if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int awu_min = 0, awu_max = 0; + + if (ext4_inode_can_atomic_write(inode)) { + awu_min = sbi->s_awu_min; + awu_max = sbi->s_awu_max; + } + + generic_fill_statx_atomic_writes(stat, awu_min, awu_max); + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16a4ce704460..ebe1660bd840 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4425,6 +4425,36 @@ static int ext4_handle_clustersize(struct super_block *sb) return 0; } +/* + * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * @sb: super block + * TODO: Later add support for bigalloc + */ +static void ext4_atomic_write_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *bdev = sb->s_bdev; + + if (!bdev_can_atomic_write(bdev)) + return; + + if (!ext4_has_feature_extents(sb)) + return; + + sbi->s_awu_min = max(sb->s_blocksize, + bdev_atomic_write_unit_min_bytes(bdev)); + sbi->s_awu_max = min(sb->s_blocksize, + bdev_atomic_write_unit_max_bytes(bdev)); + if (sbi->s_awu_min && sbi->s_awu_max && + sbi->s_awu_min <= sbi->s_awu_max) { + ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", + sbi->s_awu_min, sbi->s_awu_max); + } else { + sbi->s_awu_min = 0; + sbi->s_awu_max = 0; + } +} + static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -5336,6 +5366,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); + ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; From 43c696f9d094061e958e31be7f1dae66bc25d389 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 4 Nov 2024 16:22:58 -0800 Subject: [PATCH 10/13] ext4: Check for atomic writes support in write iter Let's validate the given constraints for atomic write request. Otherwise it will fail with -EINVAL. Currently atomic write is only supported on DIO, so for buffered-io it will return -EOPNOTSUPP. Reviewed-by: John Garry Reviewed-by: Darrick J. Wong Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara --- fs/ext4/file.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f14aed14b9cf..a7b9b9751a3f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -692,6 +692,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + + if (iocb->ki_flags & IOCB_ATOMIC) { + size_t len = iov_iter_count(from); + int ret; + + if (len < EXT4_SB(inode->i_sb)->s_awu_min || + len > EXT4_SB(inode->i_sb)->s_awu_max) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else From b7987a7d69a4a17b7f334f5c100d6729ebcc2ccb Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 4 Nov 2024 16:22:59 -0800 Subject: [PATCH 11/13] ext4: Support setting FMODE_CAN_ATOMIC_WRITE FS needs to add the fmode capability in order to support atomic writes during file open (refer kiocb_set_rw_flags()). Set this capability on a regular file if ext4 can do atomic write. Reviewed-by: John Garry Reviewed-by: Darrick J. Wong Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara --- fs/ext4/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index a7b9b9751a3f..96d936f5584b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -898,6 +898,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } + if (ext4_inode_can_atomic_write(inode)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } From 299537e9dfac2ecd08e7dae87a6437b92612568a Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 4 Nov 2024 16:22:59 -0800 Subject: [PATCH 12/13] ext4: Do not fallback to buffered-io for DIO atomic write atomic writes is currently only supported for single fsblock and only for direct-io. We should not return -ENOTBLK for atomic writes since we want the atomic write request to either complete fully or fail otherwise. Hence, we should never fallback to buffered-io in case of DIO atomic write requests. Let's also catch if this ever happens by adding some WARN_ON_ONCE before buffered-io handling for direct-io atomic writes. More details of the discussion [1]. While at it let's add an inline helper ext4_want_directio_fallback() which simplifies the logic checks and inherently fixes condition on when to return -ENOTBLK which otherwise was always returning true for any write or directio in ext4_iomap_end(). It was ok since ext4 only supports direct-io via iomap. [1]: https://lore.kernel.org/linux-xfs/cover.1729825985.git.ritesh.list@gmail.com/T/#m9dbecc11bed713ed0d7a486432c56b105b555f04 Suggested-by: Darrick J. Wong # inline helper Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara --- fs/ext4/file.c | 7 +++++++ fs/ext4/inode.c | 27 ++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 96d936f5584b..a7de03e47db0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -599,6 +599,13 @@ out: ssize_t err; loff_t endbyte; + /* + * There is no support for atomic writes on buffered-io yet, + * we should never fallback to buffered-io for DIO atomic + * writes. + */ + WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); + offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e827cfa762e..5b9eeb74ce47 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3444,17 +3444,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } +static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) +{ + /* must be a directio to fall back to buffered */ + if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != + (IOMAP_WRITE | IOMAP_DIRECT)) + return false; + + /* atomic writes are all-or-nothing */ + if (flags & IOMAP_ATOMIC) + return false; + + /* can only try again if we wrote nothing */ + return written == 0; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code so that we - * fallback to buffered I/O and attempt to complete the remainder of - * the I/O. Any blocks that may have been allocated in preparation for - * the direct I/O will be reused during buffered I/O. + * the allocated blocks. If so, return the magic error code for + * non-atomic write so that we fallback to buffered I/O and attempt to + * complete the remainder of the I/O. + * For non-atomic writes, any blocks that may have been + * allocated in preparation for the direct I/O will be reused during + * buffered I/O. For atomic write, we never fallback to buffered-io. */ - if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; From 54079430c5dbf041363ab39a0c254cd9e4f6aed5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Nov 2024 13:13:40 +0100 Subject: [PATCH 13/13] iomap: drop an obsolete comment in iomap_dio_bio_iter No more zone append special casing in iomap for quite a while. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241111121340.1390540-1-hch@lst.de Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index ed4764e3b8f0..b521eb15759e 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -383,11 +383,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - /* - * Set the operation flags early so that bio_iov_iter_get_pages - * can set up the page vector appropriately for a ZONE_APPEND - * operation. - */ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);