diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index b93115ab8748..ef082e5a4e0c 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -513,6 +513,21 @@ IOMAP_WRITE`` with any combination of the following enhancements: if the mapping is unwritten and the filesystem cannot handle zeroing the unaligned regions without exposing stale contents. + * ``IOMAP_ATOMIC``: This write is being issued with torn-write + protection. + Only a single bio can be created for the write, and the write must + not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be + set. + The file range to write must be aligned to satisfy the requirements + of both the filesystem and the underlying block device's atomic + commit capabilities. + If filesystem metadata updates are required (e.g. unwritten extent + conversion or copy on write), all updates for the entire file range + must be committed atomically as well. + Only one space mapping is allowed per untorn write. + Untorn writes must be aligned to, and must not be longer than, a + single file block. + Callers commonly hold ``i_rwsem`` in shared or exclusive mode before calling this function. diff --git a/block/fops.c b/block/fops.c index e696ae53bf1e..2d01c9007681 100644 --- a/block/fops.c +++ b/block/fops.c @@ -35,13 +35,10 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) return opf; } -static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos, - struct iov_iter *iter, bool is_atomic) +static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, + struct iov_iter *iter) { - if (is_atomic && !generic_atomic_write_valid(iter, pos)) - return true; - - return pos & (bdev_logical_block_size(bdev) - 1) || + return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } @@ -368,13 +365,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); - bool is_atomic = iocb->ki_flags & IOCB_ATOMIC; unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; - if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic)) + if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); @@ -383,7 +379,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); - } else if (is_atomic) { + } else if (iocb->ki_flags & IOCB_ATOMIC) { return -EINVAL; } return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); @@ -625,7 +621,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (!bdev) return -ENXIO; - if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT) + if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); @@ -700,6 +696,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; + if (iocb->ki_flags & IOCB_ATOMIC) { + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + size -= iocb->ki_pos; if (iov_iter_count(from) > size) { shorted = iov_iter_count(from) - size; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 44b0d418143c..494d443e9fc9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1729,6 +1729,10 @@ struct ext4_sb_info { */ struct work_struct s_sb_upd_work; + /* Atomic write unit values in bytes */ + unsigned int s_awu_min; + unsigned int s_awu_max; + /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; @@ -3855,6 +3859,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } +static inline bool ext4_inode_can_atomic_write(struct inode *inode) +{ + + return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; +} + extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f14aed14b9cf..a7de03e47db0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -599,6 +599,13 @@ out: ssize_t err; loff_t endbyte; + /* + * There is no support for atomic writes on buffered-io yet, + * we should never fallback to buffered-io for DIO atomic + * writes. + */ + WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); + offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) @@ -692,6 +699,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + + if (iocb->ki_flags & IOCB_ATOMIC) { + size_t len = iov_iter_count(from); + int ret; + + if (len < EXT4_SB(inode->i_sb)->s_awu_min || + len > EXT4_SB(inode->i_sb)->s_awu_max) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else @@ -884,6 +905,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } + if (ext4_inode_can_atomic_write(inode)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 54bdd4884fe6..5b9eeb74ce47 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3444,17 +3444,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } +static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) +{ + /* must be a directio to fall back to buffered */ + if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != + (IOMAP_WRITE | IOMAP_DIRECT)) + return false; + + /* atomic writes are all-or-nothing */ + if (flags & IOMAP_ATOMIC) + return false; + + /* can only try again if we wrote nothing */ + return written == 0; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code so that we - * fallback to buffered I/O and attempt to complete the remainder of - * the I/O. Any blocks that may have been allocated in preparation for - * the direct I/O will be reused during buffered I/O. + * the allocated blocks. If so, return the magic error code for + * non-atomic write so that we fallback to buffered I/O and attempt to + * complete the remainder of the I/O. + * For non-atomic writes, any blocks that may have been + * allocated in preparation for the direct I/O will be reused during + * buffered I/O. For atomic write, we never fallback to buffered-io. */ - if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; @@ -5578,6 +5595,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, } } + if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int awu_min = 0, awu_max = 0; + + if (ext4_inode_can_atomic_write(inode)) { + awu_min = sbi->s_awu_min; + awu_max = sbi->s_awu_max; + } + + generic_fill_statx_atomic_writes(stat, awu_min, awu_max); + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b77acba4a719..7ea7178750f2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4425,6 +4425,36 @@ static int ext4_handle_clustersize(struct super_block *sb) return 0; } +/* + * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * @sb: super block + * TODO: Later add support for bigalloc + */ +static void ext4_atomic_write_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *bdev = sb->s_bdev; + + if (!bdev_can_atomic_write(bdev)) + return; + + if (!ext4_has_feature_extents(sb)) + return; + + sbi->s_awu_min = max(sb->s_blocksize, + bdev_atomic_write_unit_min_bytes(bdev)); + sbi->s_awu_max = min(sb->s_blocksize, + bdev_atomic_write_unit_max_bytes(bdev)); + if (sbi->s_awu_min && sbi->s_awu_max && + sbi->s_awu_min <= sbi->s_awu_max) { + ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", + sbi->s_awu_min, sbi->s_awu_max); + } else { + sbi->s_awu_min = 0; + sbi->s_awu_max = 0; + } +} + static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -5336,6 +5366,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); + ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f637aa0706a3..b521eb15759e 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua) + const struct iomap *iomap, bool use_fua, bool atomic) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + if (atomic) + opflags |= REQ_ATOMIC; return opflags; } @@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; - loff_t length = iomap_length(iter); + const loff_t length = iomap_length(iter); + bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; + if (atomic && length != fs_block_size) + return -EINVAL; + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; @@ -377,12 +383,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - /* - * Set the operation flags early so that bio_iov_iter_get_pages - * can set up the page vector appropriately for a ZONE_APPEND - * operation. - */ - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -415,6 +416,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; + if (WARN_ON_ONCE(atomic && n != length)) { + /* + * This bio should have covered the complete length, + * which it doesn't, so error. We may need to zero out + * the tail (complete FS block), similar to when + * bio_iov_iter_get_pages() returns an error, above. + */ + ret = -EINVAL; + bio_put(bio); + goto zero_tail; + } if (dio->flags & IOMAP_DIO_WRITE) { task_io_account_write(n); } else { @@ -598,6 +610,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -659,7 +674,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret != -EAGAIN) { trace_iomap_dio_invalidate_fail(inode, iomi.pos, iomi.len); - ret = -ENOTBLK; + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * folio invalidation failed, maybe + * this is transient, unlock and see if + * the caller tries again. + */ + ret = -EAGAIN; + } else { + /* fall back to buffered write */ + ret = -ENOTBLK; + } } goto out_free_dio; } diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 0a991c4ce87d..4118a42cdab0 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ - { IOMAP_NOWAIT, "NOWAIT" } + { IOMAP_NOWAIT, "NOWAIT" }, \ + { IOMAP_ATOMIC, "ATOMIC" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/fs/read_write.c b/fs/read_write.c index 64dc24afdb3a..3e5dad12a5b4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1830,18 +1830,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } -bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) - return false; + return -EINVAL; if (!is_power_of_2(len)) - return false; + return -EINVAL; - if (!IS_ALIGNED(pos, len)) - return false; + if (!IS_ALIGNED(iocb->ki_pos, len)) + return -EINVAL; - return true; + if (!(iocb->ki_flags & IOCB_DIRECT)) + return -EOPNOTSUPP; + + return 0; } +EXPORT_SYMBOL_GPL(generic_atomic_write_valid); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa4dbda7b536..e8196f5778e2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2115,6 +2115,13 @@ xfs_alloc_buftarg( btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); + if (bdev_can_atomic_write(btp->bt_bdev)) { + btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( + btp->bt_bdev); + btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( + btp->bt_bdev); + } + /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 209a389f2abc..3d56bc7a35cc 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -124,6 +124,10 @@ struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; + /* Atomic write unit values */ + unsigned int bt_bdev_awu_min; + unsigned int bt_bdev_awu_max; + /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b19916b11fd5..ca47cae5a40a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -852,6 +852,20 @@ xfs_file_write_iter( if (IS_DAX(inode)) return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * Currently only atomic writing of a single FS block is + * supported. It would be possible to atomic write smaller than + * a FS block, but there is no requirement to support this. + * Note that iomap also does not support this yet. + */ + if (ocount != ip->i_mount->m_sb.sb_blocksize) + return -EINVAL; + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered @@ -1239,6 +1253,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; + if (xfs_inode_can_atomicwrite(XFS_I(inode))) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 03944b6c5fba..a2a6b5fd2545 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -332,6 +332,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) +static inline bool +xfs_inode_can_atomicwrite( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min) + return false; + if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max) + return false; + + return true; +} + /* * In-core inode flags. */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b5d0c5c157e7..4084d26f0d78 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -570,6 +570,20 @@ xfs_stat_blksize( return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); } +static void +xfs_get_atomic_write_attr( + struct xfs_inode *ip, + unsigned int *unit_min, + unsigned int *unit_max) +{ + if (!xfs_inode_can_atomicwrite(ip)) { + *unit_min = *unit_max = 0; + return; + } + + *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize; +} + STATIC int xfs_vn_getattr( struct mnt_idmap *idmap, @@ -639,6 +653,14 @@ xfs_vn_getattr( stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); } + if (request_mask & STATX_WRITE_ATOMIC) { + unsigned int unit_min, unit_max; + + xfs_get_atomic_write_attr(ip, &unit_min, + &unit_max); + generic_fill_statx_atomic_writes(stat, + unit_min, unit_max); + } fallthrough; default: stat->blksize = xfs_stat_blksize(ip); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50c3b959da28..c2cc3c146d74 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1674,6 +1674,22 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev) return true; } +static inline unsigned int +bdev_atomic_write_unit_min_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev)); +} + +static inline unsigned int +bdev_atomic_write_unit_max_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index eae7ce884030..db4c0d45107a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3798,6 +3798,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) return !c; } -bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos); +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f61407e3b121..27048ec10e1c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -178,6 +178,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ +#define IOMAP_ATOMIC (1 << 9) struct iomap_ops { /*