fs: iomap: Atomic write support

Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC
flag set.

Initially FSes (XFS) should only support writing a single FS block
atomically.

As with any atomic write, we should produce a single bio which covers the
complete write length.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
[djwong: clarify a couple of things in the docs]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
This commit is contained in:
John Garry 2024-11-04 16:14:02 -08:00 committed by Darrick J. Wong
parent a570bad16b
commit 9e0933c21c
4 changed files with 52 additions and 5 deletions

View File

@ -513,6 +513,21 @@ IOMAP_WRITE`` with any combination of the following enhancements:
if the mapping is unwritten and the filesystem cannot handle zeroing
the unaligned regions without exposing stale contents.
* ``IOMAP_ATOMIC``: This write is being issued with torn-write
protection.
Only a single bio can be created for the write, and the write must
not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
set.
The file range to write must be aligned to satisfy the requirements
of both the filesystem and the underlying block device's atomic
commit capabilities.
If filesystem metadata updates are required (e.g. unwritten extent
conversion or copy on write), all updates for the entire file range
must be committed atomically as well.
Only one space mapping is allowed per untorn write.
Untorn writes must be aligned to, and must not be longer than, a
single file block.
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
calling this function.

View File

@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua)
const struct iomap *iomap, bool use_fua, bool atomic)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
if (atomic)
opflags |= REQ_ATOMIC;
return opflags;
}
@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
const loff_t length = iomap_length(iter);
bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
if (atomic && length != fs_block_size)
return -EINVAL;
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
@ -382,7 +388,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* can set up the page vector appropriately for a ZONE_APPEND
* operation.
*/
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@ -415,6 +421,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
if (WARN_ON_ONCE(atomic && n != length)) {
/*
* This bio should have covered the complete length,
* which it doesn't, so error. We may need to zero out
* the tail (complete FS block), similar to when
* bio_iov_iter_get_pages() returns an error, above.
*/
ret = -EINVAL;
bio_put(bio);
goto zero_tail;
}
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
@ -598,6 +615,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
if (iocb->ki_flags & IOCB_ATOMIC)
iomi.flags |= IOMAP_ATOMIC;
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@ -659,7 +679,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret != -EAGAIN) {
trace_iomap_dio_invalidate_fail(inode, iomi.pos,
iomi.len);
ret = -ENOTBLK;
if (iocb->ki_flags & IOCB_ATOMIC) {
/*
* folio invalidation failed, maybe
* this is transient, unlock and see if
* the caller tries again.
*/
ret = -EAGAIN;
} else {
/* fall back to buffered write */
ret = -ENOTBLK;
}
}
goto out_free_dio;
}

View File

@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
{ IOMAP_NOWAIT, "NOWAIT" }
{ IOMAP_NOWAIT, "NOWAIT" }, \
{ IOMAP_ATOMIC, "ATOMIC" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \

View File

@ -178,6 +178,7 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
#define IOMAP_ATOMIC (1 << 9)
struct iomap_ops {
/*