vfs-6.13.untorn.writes

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZzcopwAKCRCRxhvAZXjc
 oitWAQD68PGFI6/ES9x+qGsDFEZBH08icuO+a9dyaZXyNRosDgD/ex2zHj6F7IzS
 Ghgb9jiqWQ8l2+PDYfisxa/0jiqCbAk=
 =DmXf
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.13.untorn.writes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs untorn write support from Christian Brauner:
 "An atomic write is a write issed with torn-write protection. This
  means for a power failure or any hardware failure all or none of the
  data from the write will be stored, never a mix of old and new data.

  This work is already supported for block devices. If a block device is
  opened with O_DIRECT and the block device supports atomic write, then
  FMODE_CAN_ATOMIC_WRITE is added to the file of the opened block
  device.

  This contains the work to expand atomic write support to filesystems,
  specifically ext4 and XFS. Currently, only support for writing exactly
  one filesystem block atomically is added.

  Since it's now possible to have filesystem block size > page size for
  XFS, it's possible to write 4K+ blocks atomically on x86"

* tag 'vfs-6.13.untorn.writes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  iomap: drop an obsolete comment in iomap_dio_bio_iter
  ext4: Do not fallback to buffered-io for DIO atomic write
  ext4: Support setting FMODE_CAN_ATOMIC_WRITE
  ext4: Check for atomic writes support in write iter
  ext4: Add statx support for atomic writes
  xfs: Support setting FMODE_CAN_ATOMIC_WRITE
  xfs: Validate atomic writes
  xfs: Support atomic write for statx
  fs: iomap: Atomic write support
  fs: Export generic_atomic_write_valid()
  block: Add bdev atomic write limits helpers
  fs/block: Check for IOCB_DIRECT in generic_atomic_write_valid()
  block/fs: Pass an iocb to generic_atomic_write_valid()
This commit is contained in:
Linus Torvalds 2024-11-18 11:30:09 -08:00
commit 241c7ed4d4
17 changed files with 254 additions and 32 deletions

View File

@ -513,6 +513,21 @@ IOMAP_WRITE`` with any combination of the following enhancements:
if the mapping is unwritten and the filesystem cannot handle zeroing
the unaligned regions without exposing stale contents.
* ``IOMAP_ATOMIC``: This write is being issued with torn-write
protection.
Only a single bio can be created for the write, and the write must
not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
set.
The file range to write must be aligned to satisfy the requirements
of both the filesystem and the underlying block device's atomic
commit capabilities.
If filesystem metadata updates are required (e.g. unwritten extent
conversion or copy on write), all updates for the entire file range
must be committed atomically as well.
Only one space mapping is allowed per untorn write.
Untorn writes must be aligned to, and must not be longer than, a
single file block.
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
calling this function.

View File

@ -35,13 +35,10 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
return opf;
}
static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
struct iov_iter *iter, bool is_atomic)
static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
struct iov_iter *iter)
{
if (is_atomic && !generic_atomic_write_valid(iter, pos))
return true;
return pos & (bdev_logical_block_size(bdev) - 1) ||
return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
!bdev_iter_is_aligned(bdev, iter);
}
@ -368,13 +365,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))
if (blkdev_dio_invalid(bdev, iocb, iter))
return -EINVAL;
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
@ -383,7 +379,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return __blkdev_direct_IO_simple(iocb, iter, bdev,
nr_pages);
return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
} else if (is_atomic) {
} else if (iocb->ki_flags & IOCB_ATOMIC) {
return -EINVAL;
}
return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
@ -625,7 +621,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (!bdev)
return -ENXIO;
if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT)
if (bdev_can_atomic_write(bdev))
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
@ -700,6 +696,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
return -EOPNOTSUPP;
if (iocb->ki_flags & IOCB_ATOMIC) {
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
}
size -= iocb->ki_pos;
if (iov_iter_count(from) > size) {
shorted = iov_iter_count(from) - size;

View File

@ -1729,6 +1729,10 @@ struct ext4_sb_info {
*/
struct work_struct s_sb_upd_work;
/* Atomic write unit values in bytes */
unsigned int s_awu_min;
unsigned int s_awu_max;
/* Ext4 fast commit sub transaction ID */
atomic_t s_fc_subtid;
@ -3855,6 +3859,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
return buffer_uptodate(bh);
}
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
{
return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
}
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block);

View File

@ -599,6 +599,13 @@ out:
ssize_t err;
loff_t endbyte;
/*
* There is no support for atomic writes on buffered-io yet,
* we should never fallback to buffered-io for DIO atomic
* writes.
*/
WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);
offset = iocb->ki_pos;
err = ext4_buffered_write_iter(iocb, from);
if (err < 0)
@ -692,6 +699,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
if (iocb->ki_flags & IOCB_ATOMIC) {
size_t len = iov_iter_count(from);
int ret;
if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
len > EXT4_SB(inode->i_sb)->s_awu_max)
return -EINVAL;
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
}
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
else
@ -884,6 +905,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
if (ext4_inode_can_atomic_write(inode))
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return dquot_file_open(inode, filp);
}

View File

@ -3444,17 +3444,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
return ret;
}
static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
{
/* must be a directio to fall back to buffered */
if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) !=
(IOMAP_WRITE | IOMAP_DIRECT))
return false;
/* atomic writes are all-or-nothing */
if (flags & IOMAP_ATOMIC)
return false;
/* can only try again if we wrote nothing */
return written == 0;
}
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
/*
* Check to see whether an error occurred while writing out the data to
* the allocated blocks. If so, return the magic error code so that we
* fallback to buffered I/O and attempt to complete the remainder of
* the I/O. Any blocks that may have been allocated in preparation for
* the direct I/O will be reused during buffered I/O.
* the allocated blocks. If so, return the magic error code for
* non-atomic write so that we fallback to buffered I/O and attempt to
* complete the remainder of the I/O.
* For non-atomic writes, any blocks that may have been
* allocated in preparation for the direct I/O will be reused during
* buffered I/O. For atomic write, we never fallback to buffered-io.
*/
if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
if (ext4_want_directio_fallback(flags, written))
return -ENOTBLK;
return 0;
@ -5578,6 +5595,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
}
}
if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned int awu_min = 0, awu_max = 0;
if (ext4_inode_can_atomic_write(inode)) {
awu_min = sbi->s_awu_min;
awu_max = sbi->s_awu_max;
}
generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
}
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;

View File

@ -4425,6 +4425,36 @@ static int ext4_handle_clustersize(struct super_block *sb)
return 0;
}
/*
* ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
* @sb: super block
* TODO: Later add support for bigalloc
*/
static void ext4_atomic_write_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct block_device *bdev = sb->s_bdev;
if (!bdev_can_atomic_write(bdev))
return;
if (!ext4_has_feature_extents(sb))
return;
sbi->s_awu_min = max(sb->s_blocksize,
bdev_atomic_write_unit_min_bytes(bdev));
sbi->s_awu_max = min(sb->s_blocksize,
bdev_atomic_write_unit_max_bytes(bdev));
if (sbi->s_awu_min && sbi->s_awu_max &&
sbi->s_awu_min <= sbi->s_awu_max) {
ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
sbi->s_awu_min, sbi->s_awu_max);
} else {
sbi->s_awu_min = 0;
sbi->s_awu_max = 0;
}
}
static void ext4_fast_commit_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@ -5336,6 +5366,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
spin_lock_init(&sbi->s_bdev_wb_lock);
ext4_atomic_write_init(sb);
ext4_fast_commit_init(sb);
sb->s_root = NULL;

View File

@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua)
const struct iomap *iomap, bool use_fua, bool atomic)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
if (atomic)
opflags |= REQ_ATOMIC;
return opflags;
}
@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
const loff_t length = iomap_length(iter);
bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
if (atomic && length != fs_block_size)
return -EINVAL;
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
@ -377,12 +383,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
/*
* Set the operation flags early so that bio_iov_iter_get_pages
* can set up the page vector appropriately for a ZONE_APPEND
* operation.
*/
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@ -415,6 +416,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
if (WARN_ON_ONCE(atomic && n != length)) {
/*
* This bio should have covered the complete length,
* which it doesn't, so error. We may need to zero out
* the tail (complete FS block), similar to when
* bio_iov_iter_get_pages() returns an error, above.
*/
ret = -EINVAL;
bio_put(bio);
goto zero_tail;
}
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
@ -598,6 +610,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
if (iocb->ki_flags & IOCB_ATOMIC)
iomi.flags |= IOMAP_ATOMIC;
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@ -659,7 +674,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret != -EAGAIN) {
trace_iomap_dio_invalidate_fail(inode, iomi.pos,
iomi.len);
ret = -ENOTBLK;
if (iocb->ki_flags & IOCB_ATOMIC) {
/*
* folio invalidation failed, maybe
* this is transient, unlock and see if
* the caller tries again.
*/
ret = -EAGAIN;
} else {
/* fall back to buffered write */
ret = -ENOTBLK;
}
}
goto out_free_dio;
}

View File

@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
{ IOMAP_NOWAIT, "NOWAIT" }
{ IOMAP_NOWAIT, "NOWAIT" }, \
{ IOMAP_ATOMIC, "ATOMIC" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \

View File

@ -1830,18 +1830,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
return 0;
}
bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
{
size_t len = iov_iter_count(iter);
if (!iter_is_ubuf(iter))
return false;
return -EINVAL;
if (!is_power_of_2(len))
return false;
return -EINVAL;
if (!IS_ALIGNED(pos, len))
return false;
if (!IS_ALIGNED(iocb->ki_pos, len))
return -EINVAL;
return true;
if (!(iocb->ki_flags & IOCB_DIRECT))
return -EOPNOTSUPP;
return 0;
}
EXPORT_SYMBOL_GPL(generic_atomic_write_valid);

View File

@ -2115,6 +2115,13 @@ xfs_alloc_buftarg(
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
if (bdev_can_atomic_write(btp->bt_bdev)) {
btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
btp->bt_bdev);
btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
btp->bt_bdev);
}
/*
* When allocating the buftargs we have not yet read the super block and
* thus don't know the file system sector size yet.

View File

@ -124,6 +124,10 @@ struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
/* Atomic write unit values */
unsigned int bt_bdev_awu_min;
unsigned int bt_bdev_awu_max;
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
};

View File

@ -852,6 +852,20 @@ xfs_file_write_iter(
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
if (iocb->ki_flags & IOCB_ATOMIC) {
/*
* Currently only atomic writing of a single FS block is
* supported. It would be possible to atomic write smaller than
* a FS block, but there is no requirement to support this.
* Note that iomap also does not support this yet.
*/
if (ocount != ip->i_mount->m_sb.sb_blocksize)
return -EINVAL;
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
}
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@ -1239,6 +1253,8 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
if (xfs_inode_can_atomicwrite(XFS_I(inode)))
file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}

View File

@ -332,6 +332,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
static inline bool
xfs_inode_can_atomicwrite(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
return false;
if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
return false;
return true;
}
/*
* In-core inode flags.
*/

View File

@ -570,6 +570,20 @@ xfs_stat_blksize(
return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
}
static void
xfs_get_atomic_write_attr(
struct xfs_inode *ip,
unsigned int *unit_min,
unsigned int *unit_max)
{
if (!xfs_inode_can_atomicwrite(ip)) {
*unit_min = *unit_max = 0;
return;
}
*unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
}
STATIC int
xfs_vn_getattr(
struct mnt_idmap *idmap,
@ -639,6 +653,14 @@ xfs_vn_getattr(
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
}
if (request_mask & STATX_WRITE_ATOMIC) {
unsigned int unit_min, unit_max;
xfs_get_atomic_write_attr(ip, &unit_min,
&unit_max);
generic_fill_statx_atomic_writes(stat,
unit_min, unit_max);
}
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);

View File

@ -1674,6 +1674,22 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev)
return true;
}
static inline unsigned int
bdev_atomic_write_unit_min_bytes(struct block_device *bdev)
{
if (!bdev_can_atomic_write(bdev))
return 0;
return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev));
}
static inline unsigned int
bdev_atomic_write_unit_max_bytes(struct block_device *bdev)
{
if (!bdev_can_atomic_write(bdev))
return 0;
return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev));
}
#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
#endif /* _LINUX_BLKDEV_H */

View File

@ -3798,6 +3798,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path)
return !c;
}
bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos);
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
#endif /* _LINUX_FS_H */

View File

@ -178,6 +178,7 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
#define IOMAP_ATOMIC (1 << 9)
struct iomap_ops {
/*