2022-06-13 13:27:03 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/file.h>
|
|
|
|
#include <linux/blk-mq.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/fsnotify.h>
|
|
|
|
#include <linux/poll.h>
|
|
|
|
#include <linux/nospec.h>
|
|
|
|
#include <linux/compat.h>
|
2023-12-01 00:57:35 +00:00
|
|
|
#include <linux/io_uring/cmd.h>
|
2024-02-12 23:42:36 +00:00
|
|
|
#include <linux/indirect_call_wrapper.h>
|
2022-06-13 13:27:03 +00:00
|
|
|
|
|
|
|
#include <uapi/linux/io_uring.h>
|
|
|
|
|
|
|
|
#include "io_uring.h"
|
|
|
|
#include "opdef.h"
|
|
|
|
#include "kbuf.h"
|
2024-03-20 21:19:44 +00:00
|
|
|
#include "alloc_cache.h"
|
2022-06-13 13:27:03 +00:00
|
|
|
#include "rsrc.h"
|
io_uring/rw: ensure poll based multishot read retries appropriately
io_read_mshot() always relies on poll triggering retries, and this works
fine as long as we do a retry per size of the buffer being read. The
buffer size is given by the size of the buffer(s) in the given buffer
group ID.
But if we're reading less than what is available, then we don't always
get to read everything that is available. For example, if the buffers
available are 32 bytes and we have 64 bytes to read, then we'll
correctly read the first 32 bytes and then wait for another poll trigger
before we attempt the next read. This next poll trigger may never
happen, in which case we just sit forever and never make progress, or it
may trigger at some point in the future, and now we're just delivering
the available data much later than we should have.
io_read_mshot() could do retries itself, but that is wasteful as we'll
be going through all of __io_read() again, and most likely in vain.
Rather than do that, bump our poll reference count and have
io_poll_check_events() do one more loop and check with vfs_poll() if we
have more data to read. If we do, io_read_mshot() will get invoked again
directly and we'll read the next chunk.
io_poll_multishot_retry() must only get called from inside
io_poll_issue(), which is our multishot retry handler, as we know we
already "own" the request at this point.
Cc: stable@vger.kernel.org
Link: https://github.com/axboe/liburing/issues/1041
Fixes: fc68fcda0491 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-01-27 20:44:58 +00:00
|
|
|
#include "poll.h"
|
2022-06-13 13:27:03 +00:00
|
|
|
#include "rw.h"
|
|
|
|
|
|
|
|
struct io_rw {
|
|
|
|
/* NOTE: kiocb has the file as the first member, so don't do it here */
|
|
|
|
struct kiocb kiocb;
|
|
|
|
u64 addr;
|
|
|
|
u32 len;
|
|
|
|
rwf_t flags;
|
|
|
|
};
|
|
|
|
|
2024-10-06 16:40:36 +00:00
|
|
|
/*
 * Decide whether IO on @req's file can be attempted without blocking for the
 * poll events in @mask.
 *
 * Returns true if the request is flagged REQ_F_SUPPORT_NOWAIT, or if the file
 * is pollable and vfs_poll() reports at least one event from @mask as ready.
 */
static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask)
{
	struct poll_table_struct pt = { ._key = mask };

	/* NOWAIT capable files need no further checks. */
	if (req->flags & REQ_F_SUPPORT_NOWAIT)
		return true;

	/* Neither NOWAIT capable nor pollable: nothing more we can try. */
	if (!io_file_can_poll(req))
		return false;

	/* Pollable file: check the current readiness for the wanted events. */
	return vfs_poll(req->file, &pt) & mask;
}
|
|
|
|
|
2022-09-07 16:51:52 +00:00
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
static int io_iov_compat_buffer_select_prep(struct io_rw *rw)
|
|
|
|
{
|
|
|
|
struct compat_iovec __user *uiov;
|
|
|
|
compat_ssize_t clen;
|
|
|
|
|
|
|
|
uiov = u64_to_user_ptr(rw->addr);
|
|
|
|
if (!access_ok(uiov, sizeof(*uiov)))
|
|
|
|
return -EFAULT;
|
|
|
|
if (__get_user(clen, &uiov->iov_len))
|
|
|
|
return -EFAULT;
|
|
|
|
if (clen < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
rw->len = clen;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static int io_iov_buffer_select_prep(struct io_kiocb *req)
|
|
|
|
{
|
|
|
|
struct iovec __user *uiov;
|
|
|
|
struct iovec iov;
|
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
|
|
|
|
|
|
|
if (rw->len != 1)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
if (req->ctx->compat)
|
|
|
|
return io_iov_compat_buffer_select_prep(rw);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
uiov = u64_to_user_ptr(rw->addr);
|
|
|
|
if (copy_from_user(&iov, uiov, sizeof(*uiov)))
|
|
|
|
return -EFAULT;
|
|
|
|
rw->len = iov.iov_len;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-03-18 22:13:01 +00:00
|
|
|
static int __io_import_iovec(int ddir, struct io_kiocb *req,
|
|
|
|
struct io_async_rw *io,
|
|
|
|
unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
const struct io_issue_def *def = &io_issue_defs[req->opcode];
|
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
2024-03-18 22:31:44 +00:00
|
|
|
struct iovec *iov;
|
2024-03-18 22:13:01 +00:00
|
|
|
void __user *buf;
|
2024-03-18 22:31:44 +00:00
|
|
|
int nr_segs, ret;
|
2024-03-18 22:13:01 +00:00
|
|
|
size_t sqe_len;
|
|
|
|
|
|
|
|
buf = u64_to_user_ptr(rw->addr);
|
|
|
|
sqe_len = rw->len;
|
|
|
|
|
|
|
|
if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) {
|
|
|
|
if (io_do_buffer_select(req)) {
|
|
|
|
buf = io_buffer_select(req, &sqe_len, issue_flags);
|
|
|
|
if (!buf)
|
|
|
|
return -ENOBUFS;
|
|
|
|
rw->addr = (unsigned long) buf;
|
|
|
|
rw->len = sqe_len;
|
|
|
|
}
|
|
|
|
|
2024-03-18 22:25:58 +00:00
|
|
|
return import_ubuf(ddir, buf, sqe_len, &io->iter);
|
2024-03-18 22:13:01 +00:00
|
|
|
}
|
|
|
|
|
2024-03-18 22:31:44 +00:00
|
|
|
if (io->free_iovec) {
|
|
|
|
nr_segs = io->free_iov_nr;
|
|
|
|
iov = io->free_iovec;
|
|
|
|
} else {
|
|
|
|
iov = &io->fast_iov;
|
|
|
|
nr_segs = 1;
|
|
|
|
}
|
|
|
|
ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter,
|
|
|
|
req->ctx->compat);
|
|
|
|
if (unlikely(ret < 0))
|
|
|
|
return ret;
|
|
|
|
if (iov) {
|
|
|
|
req->flags |= REQ_F_NEED_CLEANUP;
|
|
|
|
io->free_iov_nr = io->iter.nr_segs;
|
|
|
|
kfree(io->free_iovec);
|
|
|
|
io->free_iovec = iov;
|
|
|
|
}
|
|
|
|
return 0;
|
2024-03-18 22:13:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int io_import_iovec(int rw, struct io_kiocb *req,
|
|
|
|
struct io_async_rw *io,
|
|
|
|
unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = __io_import_iovec(rw, req, io, issue_flags);
|
|
|
|
if (unlikely(ret < 0))
|
|
|
|
return ret;
|
|
|
|
|
2024-03-18 22:25:58 +00:00
|
|
|
iov_iter_save_state(&io->iter, &io->iter_state);
|
2024-03-18 22:13:01 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void io_rw_iovec_free(struct io_async_rw *rw)
|
|
|
|
{
|
|
|
|
if (rw->free_iovec) {
|
|
|
|
kfree(rw->free_iovec);
|
2024-03-18 22:31:44 +00:00
|
|
|
rw->free_iov_nr = 0;
|
2024-03-18 22:13:01 +00:00
|
|
|
rw->free_iovec = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
struct io_async_rw *rw = req->async_data;
|
2024-03-18 22:31:44 +00:00
|
|
|
struct iovec *iov;
|
2024-03-18 22:13:01 +00:00
|
|
|
|
|
|
|
if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
|
|
|
|
io_rw_iovec_free(rw);
|
|
|
|
return;
|
|
|
|
}
|
2024-03-18 22:31:44 +00:00
|
|
|
iov = rw->free_iovec;
|
2024-03-20 21:19:44 +00:00
|
|
|
if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
|
2024-03-18 22:31:44 +00:00
|
|
|
if (iov)
|
|
|
|
kasan_mempool_poison_object(iov);
|
2024-03-18 22:13:01 +00:00
|
|
|
req->async_data = NULL;
|
|
|
|
req->flags &= ~REQ_F_ASYNC_DATA;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Disable quick recycling for anything that's gone through io-wq.
|
|
|
|
* In theory, this should be fine to cleanup. However, some read or
|
|
|
|
* write iter handling touches the iovec AFTER having called into the
|
|
|
|
* handler, eg to reexpand or revert. This means we can have:
|
|
|
|
*
|
|
|
|
* task io-wq
|
|
|
|
* issue
|
|
|
|
* punt to io-wq
|
|
|
|
* issue
|
|
|
|
* blkdev_write_iter()
|
|
|
|
* ->ki_complete()
|
|
|
|
* io_complete_rw()
|
|
|
|
* queue tw complete
|
|
|
|
* run tw
|
|
|
|
* req_rw_cleanup
|
|
|
|
* iov_iter_count() <- look at iov_iter again
|
|
|
|
*
|
|
|
|
* which can lead to a UAF. This is only possible for io-wq offload
|
|
|
|
* as the cleanup can run in parallel. As io-wq is not the fast path,
|
|
|
|
* just leave cleanup to the end.
|
|
|
|
*
|
|
|
|
* This is really a bug in the core code that does this, any issue
|
|
|
|
* path should assume that a successful (or -EIOCBQUEUED) return can
|
|
|
|
* mean that the underlying data can be gone at any time. But that
|
|
|
|
* should be fixed seperately, and then this check could be killed.
|
|
|
|
*/
|
|
|
|
if (!(req->flags & REQ_F_REFCOUNT)) {
|
|
|
|
req->flags &= ~REQ_F_NEED_CLEANUP;
|
|
|
|
io_rw_recycle(req, issue_flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int io_rw_alloc_async(struct io_kiocb *req)
|
|
|
|
{
|
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
struct io_async_rw *rw;
|
|
|
|
|
2024-03-20 21:19:44 +00:00
|
|
|
rw = io_alloc_cache_get(&ctx->rw_cache);
|
|
|
|
if (rw) {
|
2024-03-18 22:31:44 +00:00
|
|
|
if (rw->free_iovec) {
|
|
|
|
kasan_mempool_unpoison_object(rw->free_iovec,
|
|
|
|
rw->free_iov_nr * sizeof(struct iovec));
|
|
|
|
req->flags |= REQ_F_NEED_CLEANUP;
|
|
|
|
}
|
2024-03-18 22:13:01 +00:00
|
|
|
req->flags |= REQ_F_ASYNC_DATA;
|
|
|
|
req->async_data = rw;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!io_alloc_async_data(req)) {
|
|
|
|
rw = req->async_data;
|
|
|
|
rw->free_iovec = NULL;
|
2024-03-18 22:31:44 +00:00
|
|
|
rw->free_iov_nr = 0;
|
|
|
|
done:
|
2024-03-18 22:13:01 +00:00
|
|
|
rw->bytes_done = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Allocate the async rw data for @req and, when requested, import the user
 * buffer(s) at prep time.
 *
 * @req:       request being prepared
 * @ddir:      data direction passed on to the import (ITER_DEST/ITER_SOURCE)
 * @do_import: import now; ignored if a provided buffer still has to be
 *             selected, in which case the import is left for issue time
 *
 * Returns 0 on success, -ENOMEM if the async data cannot be allocated, or
 * the negative error from the import.
 */
static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
{
	if (io_rw_alloc_async(req))
		return -ENOMEM;

	if (!do_import || io_do_buffer_select(req))
		return 0;

	/*
	 * io_import_iovec() already snapshots the iterator state on success
	 * and returns either 0 or a negative error, so there is no need to
	 * save the state a second time here.
	 */
	return io_import_iovec(ddir, req, req->async_data, 0);
}
|
|
|
|
|
|
|
|
/*
 * Common prep for all read/write opcodes: copy the relevant SQE fields into
 * the request's io_rw state, validate the IO priority, then set up the async
 * data (and optionally import the buffers) via io_prep_rw_setup().
 *
 * Returns 0 on success, or a negative error from ioprio_check_cap() or
 * io_prep_rw_setup().
 */
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
		      int ddir, bool do_import)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;
	unsigned ioprio;

	kiocb->ki_pos = READ_ONCE(sqe->off);
	/* used for fixed read/write too - just read unconditionally */
	req->buf_index = READ_ONCE(sqe->buf_index);

	ioprio = READ_ONCE(sqe->ioprio);
	if (!ioprio) {
		/* nothing requested, inherit the submitter's priority */
		kiocb->ki_ioprio = get_current_ioprio();
	} else {
		int ret = ioprio_check_cap(ioprio);

		if (ret)
			return ret;
		kiocb->ki_ioprio = ioprio;
	}
	kiocb->dio_complete = NULL;

	rw->addr = READ_ONCE(sqe->addr);
	rw->len = READ_ONCE(sqe->len);
	rw->flags = READ_ONCE(sqe->rw_flags);

	return io_prep_rw_setup(req, ddir, do_import);
}
|
|
|
|
|
2024-03-18 22:13:01 +00:00
|
|
|
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
2023-11-06 14:41:17 +00:00
|
|
|
{
|
2024-03-18 22:13:01 +00:00
|
|
|
return io_prep_rw(req, sqe, ITER_DEST, true);
|
2023-11-06 14:41:17 +00:00
|
|
|
}
|
|
|
|
|
2024-03-18 22:13:01 +00:00
|
|
|
int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
2023-11-06 14:41:17 +00:00
|
|
|
{
|
2024-03-18 22:13:01 +00:00
|
|
|
return io_prep_rw(req, sqe, ITER_SOURCE, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe,
|
|
|
|
int ddir)
|
|
|
|
{
|
|
|
|
const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT);
|
2023-11-06 14:41:17 +00:00
|
|
|
int ret;
|
2022-09-07 16:51:52 +00:00
|
|
|
|
2024-03-18 22:13:01 +00:00
|
|
|
ret = io_prep_rw(req, sqe, ddir, do_import);
|
2023-11-06 14:41:17 +00:00
|
|
|
if (unlikely(ret))
|
|
|
|
return ret;
|
2024-03-18 22:13:01 +00:00
|
|
|
if (do_import)
|
|
|
|
return 0;
|
2023-11-06 14:41:17 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Have to do this validation here, as this is in io_read() rw->len
|
|
|
|
* might have chanaged due to buffer selection
|
2022-09-07 16:51:52 +00:00
|
|
|
*/
|
2024-03-18 22:13:01 +00:00
|
|
|
return io_iov_buffer_select_prep(req);
|
|
|
|
}
|
2022-09-07 16:51:52 +00:00
|
|
|
|
2024-03-18 22:13:01 +00:00
|
|
|
int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|
|
|
{
|
|
|
|
return io_prep_rwv(req, sqe, ITER_DEST);
|
|
|
|
}
|
|
|
|
|
|
|
|
int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|
|
|
{
|
|
|
|
return io_prep_rwv(req, sqe, ITER_SOURCE);
|
2022-06-13 13:27:03 +00:00
|
|
|
}
|
|
|
|
|
2024-03-18 22:13:01 +00:00
|
|
|
static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe,
|
|
|
|
int ddir)
|
2023-11-06 14:43:16 +00:00
|
|
|
{
|
2024-03-18 22:13:01 +00:00
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
2023-11-06 14:43:16 +00:00
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
struct io_rsrc_node *node;
|
2024-03-18 22:13:01 +00:00
|
|
|
struct io_async_rw *io;
|
2023-11-06 14:43:16 +00:00
|
|
|
int ret;
|
|
|
|
|
2024-03-18 22:13:01 +00:00
|
|
|
ret = io_prep_rw(req, sqe, ddir, false);
|
2023-11-06 14:43:16 +00:00
|
|
|
if (unlikely(ret))
|
|
|
|
return ret;
|
|
|
|
|
2024-10-27 15:08:31 +00:00
|
|
|
node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
|
|
|
|
if (!node)
|
2023-11-06 14:43:16 +00:00
|
|
|
return -EFAULT;
|
2024-11-07 11:01:36 +00:00
|
|
|
io_req_assign_buf_node(req, node);
|
2024-03-18 22:13:01 +00:00
|
|
|
|
|
|
|
io = req->async_data;
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
|
2024-03-18 22:25:58 +00:00
|
|
|
iov_iter_save_state(&io->iter, &io->iter_state);
|
2024-03-18 22:13:01 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|
|
|
{
|
|
|
|
return io_prep_rw_fixed(req, sqe, ITER_DEST);
|
|
|
|
}
|
|
|
|
|
|
|
|
int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|
|
|
{
|
|
|
|
return io_prep_rw_fixed(req, sqe, ITER_SOURCE);
|
2023-11-06 14:43:16 +00:00
|
|
|
}
|
|
|
|
|
2023-09-11 19:35:42 +00:00
|
|
|
/*
|
|
|
|
* Multishot read is prepared just like a normal read/write request, only
|
|
|
|
* difference is that we set the MULTISHOT flag.
|
|
|
|
*/
|
|
|
|
int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|
|
|
{
|
2023-11-06 20:39:08 +00:00
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
2023-09-11 19:35:42 +00:00
|
|
|
int ret;
|
|
|
|
|
2023-11-03 15:26:13 +00:00
|
|
|
/* must be used with provided buffers */
|
|
|
|
if (!(req->flags & REQ_F_BUFFER_SELECT))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2024-03-18 22:13:01 +00:00
|
|
|
ret = io_prep_rw(req, sqe, ITER_DEST, false);
|
2023-09-11 19:35:42 +00:00
|
|
|
if (unlikely(ret))
|
|
|
|
return ret;
|
|
|
|
|
2023-11-06 20:39:08 +00:00
|
|
|
if (rw->addr || rw->len)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2023-09-11 19:35:42 +00:00
|
|
|
req->flags |= REQ_F_APOLL_MULTISHOT;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-06-13 13:27:03 +00:00
|
|
|
void io_readv_writev_cleanup(struct io_kiocb *req)
|
|
|
|
{
|
2024-03-18 22:13:01 +00:00
|
|
|
io_rw_iovec_free(req->async_data);
|
2022-06-13 13:27:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
|
|
|
|
{
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
2022-06-13 13:27:03 +00:00
|
|
|
|
|
|
|
if (rw->kiocb.ki_pos != -1)
|
|
|
|
return &rw->kiocb.ki_pos;
|
|
|
|
|
|
|
|
if (!(req->file->f_mode & FMODE_STREAM)) {
|
|
|
|
req->flags |= REQ_F_CUR_POS;
|
|
|
|
rw->kiocb.ki_pos = req->file->f_pos;
|
|
|
|
return &rw->kiocb.ki_pos;
|
|
|
|
}
|
|
|
|
|
|
|
|
rw->kiocb.ki_pos = 0;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_BLOCK
|
2024-04-25 15:04:32 +00:00
|
|
|
static void io_resubmit_prep(struct io_kiocb *req)
|
2022-06-13 13:27:03 +00:00
|
|
|
{
|
|
|
|
struct io_async_rw *io = req->async_data;
|
|
|
|
|
2024-04-25 15:04:32 +00:00
|
|
|
iov_iter_restore(&io->iter, &io->iter_state);
|
2022-06-13 13:27:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool io_rw_should_reissue(struct io_kiocb *req)
|
|
|
|
{
|
|
|
|
umode_t mode = file_inode(req->file)->i_mode;
|
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
|
|
|
|
if (!S_ISBLK(mode) && !S_ISREG(mode))
|
|
|
|
return false;
|
|
|
|
if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
|
|
|
|
!(ctx->flags & IORING_SETUP_IOPOLL)))
|
|
|
|
return false;
|
|
|
|
/*
|
|
|
|
* If ref is dying, we might be running poll reap from the exit work.
|
|
|
|
* Don't attempt to reissue from that path, just let it fail with
|
|
|
|
* -EAGAIN.
|
|
|
|
*/
|
|
|
|
if (percpu_ref_is_dying(&ctx->refs))
|
|
|
|
return false;
|
|
|
|
/*
|
|
|
|
* Play it safe and assume not safe to re-import and reissue if we're
|
|
|
|
* not in the original thread group (or in task context).
|
|
|
|
*/
|
io_uring: move struct io_kiocb from task_struct to io_uring_task
Rather than store the task_struct itself in struct io_kiocb, store
the io_uring specific task_struct. The life times are the same in terms
of io_uring, and this avoids doing some dereferences through the
task_struct. For the hot path of putting local task references, we can
deref req->tctx instead, which we'll need anyway in that function
regardless of whether it's local or remote references.
This is mostly straight forward, except the original task PF_EXITING
check needs a bit of tweaking. task_work is _always_ run from the
originating task, except in the fallback case, where it's run from a
kernel thread. Replace the potentially racy (in case of fallback work)
checks for req->task->flags with current->flags. It's either the still
the original task, in which case PF_EXITING will be sane, or it has
PF_KTHREAD set, in which case it's fallback work. Both cases should
prevent moving forward with the given request.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-03 17:23:38 +00:00
|
|
|
if (!same_thread_group(req->tctx->task, current) || !in_task())
|
2022-06-13 13:27:03 +00:00
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
#else
|
2024-04-25 15:04:32 +00:00
|
|
|
static void io_resubmit_prep(struct io_kiocb *req)
|
2022-06-13 13:27:03 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
static bool io_rw_should_reissue(struct io_kiocb *req)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2023-08-17 14:13:31 +00:00
|
|
|
static void io_req_end_write(struct io_kiocb *req)
|
2022-06-13 13:27:03 +00:00
|
|
|
{
|
|
|
|
if (req->flags & REQ_F_ISREG) {
|
2023-08-17 14:13:34 +00:00
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
2022-06-13 13:27:03 +00:00
|
|
|
|
2023-08-17 14:13:34 +00:00
|
|
|
kiocb_end_write(&rw->kiocb);
|
2022-06-13 13:27:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-11 15:06:23 +00:00
|
|
|
/*
|
|
|
|
* Trigger the notifications after having done some IO, and finish the write
|
|
|
|
* accounting, if any.
|
|
|
|
*/
|
|
|
|
static void io_req_io_end(struct io_kiocb *req)
|
|
|
|
{
|
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
|
|
|
|
|
|
|
if (rw->kiocb.ki_flags & IOCB_WRITE) {
|
2023-08-17 14:13:31 +00:00
|
|
|
io_req_end_write(req);
|
2022-10-11 15:06:23 +00:00
|
|
|
fsnotify_modify(req->file);
|
|
|
|
} else {
|
|
|
|
fsnotify_access(req->file);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-13 13:27:03 +00:00
|
|
|
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
|
|
|
|
{
|
|
|
|
if (unlikely(res != req->cqe.res)) {
|
2024-09-10 14:57:04 +00:00
|
|
|
if (res == -EAGAIN && io_rw_should_reissue(req)) {
|
2022-10-11 15:06:23 +00:00
|
|
|
/*
|
|
|
|
* Reissue will start accounting again, finish the
|
|
|
|
* current cycle.
|
|
|
|
*/
|
|
|
|
io_req_io_end(req);
|
2024-03-07 19:53:24 +00:00
|
|
|
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
|
2022-06-13 13:27:03 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
req_set_fail(req);
|
|
|
|
req->cqe.res = res;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2022-09-13 12:21:23 +00:00
|
|
|
static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
|
2022-09-09 11:11:49 +00:00
|
|
|
{
|
|
|
|
struct io_async_rw *io = req->async_data;
|
|
|
|
|
|
|
|
/* add previously done IO, if any */
|
|
|
|
if (req_has_async_data(req) && io->bytes_done > 0) {
|
|
|
|
if (res < 0)
|
|
|
|
res = io->bytes_done;
|
|
|
|
else
|
|
|
|
res += io->bytes_done;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2023-06-02 14:41:46 +00:00
|
|
|
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
|
2022-09-29 16:57:05 +00:00
|
|
|
{
|
2023-07-08 16:03:52 +00:00
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
|
|
|
struct kiocb *kiocb = &rw->kiocb;
|
|
|
|
|
|
|
|
if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
|
|
|
|
long res = kiocb->dio_complete(rw->kiocb.private);
|
|
|
|
|
|
|
|
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
|
|
|
|
}
|
|
|
|
|
2022-10-11 15:06:23 +00:00
|
|
|
io_req_io_end(req);
|
2022-11-04 10:59:40 +00:00
|
|
|
|
2024-03-18 22:00:30 +00:00
|
|
|
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
|
2024-08-27 14:26:07 +00:00
|
|
|
req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0);
|
2022-11-04 10:59:40 +00:00
|
|
|
|
2024-03-18 22:13:01 +00:00
|
|
|
io_req_rw_cleanup(req, 0);
|
2023-03-27 15:38:15 +00:00
|
|
|
io_req_task_complete(req, ts);
|
2022-09-29 16:57:05 +00:00
|
|
|
}
|
|
|
|
|
2022-06-13 13:27:03 +00:00
|
|
|
static void io_complete_rw(struct kiocb *kiocb, long res)
|
|
|
|
{
|
|
|
|
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
|
|
|
|
struct io_kiocb *req = cmd_to_io_kiocb(rw);
|
|
|
|
|
2023-07-08 16:03:52 +00:00
|
|
|
if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
|
|
|
|
if (__io_complete_rw_common(req, res))
|
|
|
|
return;
|
|
|
|
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
|
|
|
|
}
|
2022-09-29 16:57:05 +00:00
|
|
|
req->io_task_work.func = io_req_rw_complete;
|
2023-04-06 13:20:12 +00:00
|
|
|
__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
|
2022-06-13 13:27:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
|
|
|
|
{
|
|
|
|
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
|
|
|
|
struct io_kiocb *req = cmd_to_io_kiocb(rw);
|
|
|
|
|
|
|
|
if (kiocb->ki_flags & IOCB_WRITE)
|
2023-08-17 14:13:31 +00:00
|
|
|
io_req_end_write(req);
|
2022-06-13 13:27:03 +00:00
|
|
|
if (unlikely(res != req->cqe.res)) {
|
|
|
|
if (res == -EAGAIN && io_rw_should_reissue(req)) {
|
2024-03-07 19:53:24 +00:00
|
|
|
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
|
2022-06-13 13:27:03 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
req->cqe.res = res;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* order with io_iopoll_complete() checking ->iopoll_completed */
|
|
|
|
smp_store_release(&req->iopoll_completed, 1);
|
|
|
|
}
|
|
|
|
|
2024-01-10 17:05:32 +00:00
|
|
|
/*
 * Complete @kiocb with @ret through its ->ki_complete() handler, unless the
 * IO was queued asynchronously (-EIOCBQUEUED), in which case completion
 * happens later. Internal restart error codes are turned into -EINTR first.
 */
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	/* IO was queued async, completion will happen later */
	if (ret == -EIOCBQUEUED)
		return;

	/*
	 * The syscall can't simply be restarted, since previously submitted
	 * sqes may already be in progress. Fail this IO with EINTR instead.
	 */
	if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
		     ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
		ret = -EINTR;

	INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll,
			io_complete_rw, kiocb, ret);
}
|
|
|
|
|
2022-06-16 09:21:57 +00:00
|
|
|
/*
 * Finish a read/write that returned @ret from the issue path.
 *
 * Successful non-IOPOLL requests are completed inline: the result (including
 * previously done bytes) is posted, a selected buffer is released and the
 * async data is recycled. Everything else goes through io_rw_done().
 *
 * Returns IOU_OK when completed inline, -EAGAIN when a reissue was armed
 * (after restoring the iterator), or IOU_ISSUE_SKIP_COMPLETE otherwise.
 */
static int kiocb_done(struct io_kiocb *req, ssize_t ret,
		      unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned final_ret = io_fixup_rw_res(req, ret);

	/* propagate the new position for requests using the file's f_pos */
	if (ret >= 0 && req->flags & REQ_F_CUR_POS)
		req->file->f_pos = rw->kiocb.ki_pos;

	if (ret < 0 || rw->kiocb.ki_complete != io_complete_rw) {
		io_rw_done(&rw->kiocb, ret);
	} else if (!__io_complete_rw_common(req, ret)) {
		/*
		 * Calling io_end here is safe, as we are inline from the
		 * submission path.
		 */
		io_req_io_end(req);
		io_req_set_res(req, final_ret,
			       io_put_kbuf(req, ret, issue_flags));
		io_req_rw_cleanup(req, issue_flags);
		return IOU_OK;
	}

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		io_resubmit_prep(req);
		return -EAGAIN;
	}
	return IOU_ISSUE_SKIP_COMPLETE;
}
|
|
|
|
|
|
|
|
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
|
|
|
|
{
|
|
|
|
return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 *
 * @ddir: READ selects ->read(); any other value selects ->write()
 * @rw:   request command data; for bvec iterators its ->addr/->len are
 *        consumed as the transfer progresses
 * @iter: iterator describing the buffer(s); advanced for non-bvec iterators
 *
 * Return: total bytes transferred, or the error from the first call if
 * nothing was transferred. -EOPNOTSUPP for IOCB_HIPRI, and -EAGAIN for
 * IOCB_NOWAIT on a file without O_NONBLOCK.
 */
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
	ssize_t ret = 0;
	loff_t *ppos;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;

	ppos = io_kiocb_ppos(kiocb);

	while (iov_iter_count(iter)) {
		void __user *addr;
		size_t len;
		ssize_t nr;

		/* Pick the next user address/length for this iterator type. */
		if (iter_is_ubuf(iter)) {
			addr = iter->ubuf + iter->iov_offset;
			len = iov_iter_count(iter);
		} else if (!iov_iter_is_bvec(iter)) {
			addr = iter_iov_addr(iter);
			len = iter_iov_len(iter);
		} else {
			/* bvec iterator: use the address/length kept in @rw */
			addr = u64_to_user_ptr(rw->addr);
			len = rw->len;
		}

		if (ddir == READ)
			nr = file->f_op->read(file, addr, len, ppos);
		else
			nr = file->f_op->write(file, addr, len, ppos);

		if (nr < 0) {
			/* Only report the error if no bytes were moved yet. */
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			rw->addr += nr;
			rw->len -= nr;
			if (!rw->len)
				break;
		}
		/* Short transfer: stop rather than issuing another call. */
		if (nr != len)
			break;
	}

	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is our waitqueue callback handler, registered through __folio_lock_async()
|
|
|
|
* when we initially tried to do the IO with the iocb armed our waitqueue.
|
|
|
|
* This gets called when the page is unlocked, and we generally expect that to
|
|
|
|
* happen when the page IO is completed and the page is now uptodate. This will
|
|
|
|
* queue a task_work based retry of the operation, attempting to copy the data
|
|
|
|
* again. If the latter fails because the page was NOT uptodate, then we will
|
|
|
|
* do a thread based blocking retry of the operation. That's the unexpected
|
|
|
|
* slow path.
|
|
|
|
*/
|
|
|
|
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
|
|
|
|
int sync, void *arg)
|
|
|
|
{
|
|
|
|
struct wait_page_queue *wpq;
|
|
|
|
struct io_kiocb *req = wait->private;
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
2022-06-13 13:27:03 +00:00
|
|
|
struct wait_page_key *key = arg;
|
|
|
|
|
|
|
|
wpq = container_of(wait, struct wait_page_queue, wait);
|
|
|
|
|
|
|
|
if (!wake_page_match(wpq, key))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
rw->kiocb.ki_flags &= ~IOCB_WAITQ;
|
|
|
|
list_del_init(&wait->entry);
|
|
|
|
io_req_task_queue(req);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This controls whether a given IO request should be armed for async page
|
|
|
|
* based retry. If we return false here, the request is handed to the async
|
|
|
|
* worker threads for retry. If we're doing buffered reads on a regular file,
|
|
|
|
* we prepare a private wait_page_queue entry and retry the operation. This
|
|
|
|
* will either succeed because the page is now uptodate and unlocked, or it
|
|
|
|
* will register a callback when the page is unlocked at IO completion. Through
|
|
|
|
* that callback, io_uring uses task_work to setup a retry of the operation.
|
|
|
|
* That retry will attempt the buffered read again. The retry will generally
|
|
|
|
* succeed, or in rare cases where it fails, we then fall back to using the
|
|
|
|
* async worker threads for a blocking retry.
|
|
|
|
*/
|
|
|
|
static bool io_rw_should_retry(struct io_kiocb *req)
|
|
|
|
{
|
|
|
|
struct io_async_rw *io = req->async_data;
|
|
|
|
struct wait_page_queue *wait = &io->wpq;
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
2022-06-13 13:27:03 +00:00
|
|
|
struct kiocb *kiocb = &rw->kiocb;
|
|
|
|
|
|
|
|
/* never retry for NOWAIT, we just complete with -EAGAIN */
|
|
|
|
if (req->flags & REQ_F_NOWAIT)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* Only for buffered IO */
|
|
|
|
if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* just use poll if we can, and don't attempt if the fs doesn't
|
|
|
|
* support callback based unlocks
|
|
|
|
*/
|
2024-03-28 12:27:24 +00:00
|
|
|
if (io_file_can_poll(req) ||
|
|
|
|
!(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC))
|
2022-06-13 13:27:03 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
wait->wait.func = io_async_buf_func;
|
|
|
|
wait->wait.private = req;
|
|
|
|
wait->wait.flags = 0;
|
|
|
|
INIT_LIST_HEAD(&wait->wait.entry);
|
|
|
|
kiocb->ki_flags |= IOCB_WAITQ;
|
|
|
|
kiocb->ki_flags &= ~IOCB_NOWAIT;
|
|
|
|
kiocb->ki_waitq = wait;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
|
|
|
|
{
|
|
|
|
struct file *file = rw->kiocb.ki_filp;
|
|
|
|
|
|
|
|
if (likely(file->f_op->read_iter))
|
2023-08-28 15:13:18 +00:00
|
|
|
return file->f_op->read_iter(&rw->kiocb, iter);
|
2022-06-13 13:27:03 +00:00
|
|
|
else if (file->f_op->read)
|
|
|
|
return loop_rw_iter(READ, rw, iter);
|
|
|
|
else
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2022-06-16 21:22:18 +00:00
|
|
|
static bool need_complete_io(struct io_kiocb *req)
|
2022-06-13 13:27:03 +00:00
|
|
|
{
|
|
|
|
return req->flags & REQ_F_ISREG ||
|
|
|
|
S_ISBLK(file_inode(req->file)->i_mode);
|
|
|
|
}
|
|
|
|
|
fs: Initial atomic write support
An atomic write is a write issued with torn-write protection, meaning
that for a power failure or any other hardware failure, all or none of the
data from the write will be stored, but never a mix of old and new data.
Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the
write is to be issued with torn-write prevention, according to special
alignment and length rules.
For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for
iocb->ki_flags field to indicate the same.
A call to statx will give the relevant atomic write info for a file:
- atomic_write_unit_min
- atomic_write_unit_max
- atomic_write_segments_max
Both min and max values must be a power-of-2.
Applications can avail of atomic write feature by ensuring that the total
length of a write is a power-of-2 in size and also sized between
atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications
must ensure that the write is at a naturally-aligned offset in the file
wrt the total write length. The value in atomic_write_segments_max
indicates the upper limit for IOV_ITER iovcnt.
Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the
flag set will have RWF_ATOMIC rejected and not just ignored.
Add a type argument to kiocb_set_rw_flags() to allow reads which have
RWF_ATOMIC set to be rejected.
Helper function generic_atomic_write_valid() can be used by FSes to verify
compliant writes. There we check for iov_iter type is for ubuf, which
implies iovcnt==1 for pwritev2(), which is an initial restriction for
atomic_write_segments_max. Initially the only user will be bdev file
operations write handler. We will rely on the block BIO submission path to
ensure write sizes are compliant for the bdev, so we don't need to check
atomic writes sizes yet.
Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com>
jpg: merge into single patch and much rewrite
Acked-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20240620125359.2684798-4-john.g.garry@oracle.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-06-20 12:53:52 +00:00
|
|
|
/*
 * Validate the request's file for a read or write and set up its kiocb.
 *
 * @req:     request whose file and kiocb are being prepared
 * @mode:    FMODE_* bit(s) that must be present in file->f_mode
 * @rw_type: direction, passed through to kiocb_set_rw_flags()
 *
 * Return: 0 on success; -EBADF if the file lacks @mode; the error from
 * kiocb_set_rw_flags(); -EOPNOTSUPP on an IOPOLL ring when the IO is not
 * O_DIRECT or the file has no ->iopoll(); -EINVAL when IOCB_HIPRI is set on a
 * non-IOPOLL ring.
 */
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = req->file;
	int ret;

	if (unlikely(!(file->f_mode & mode)))
		return -EBADF;

	/* Non-fixed files get their file flags merged into the request here. */
	if (!(req->flags & REQ_F_FIXED_FILE))
		req->flags |= io_file_get_flags(file);

	/* Start from the file's cached iocb flags, then apply the per-IO flags. */
	kiocb->ki_flags = file->f_iocb_flags;
	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
	if (unlikely(ret))
		return ret;
	kiocb->ki_flags |= IOCB_ALLOC_CACHE;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
	 */
	if (kiocb->ki_flags & IOCB_NOWAIT ||
	    ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT))))
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		/* Polled IO requires O_DIRECT and a file that implements ->iopoll() */
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;

		kiocb->private = NULL;
		kiocb->ki_flags |= IOCB_HIPRI;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
			/* make sure every req only blocks once */
			req->flags &= ~REQ_F_IOPOLL_STATE;
			req->iopoll_start = ktime_get_ns();
		}
	} else {
		/* IOCB_HIPRI is only valid on an IOPOLL ring. */
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	return 0;
}
|
|
|
|
|
2023-09-11 19:31:56 +00:00
|
|
|
/*
 * Core of the read path, shared by io_read() and io_read_mshot().
 *
 * Imports the buffer if buffer selection is in use, initialises the file and
 * kiocb, issues the read, and handles -EAGAIN and partial reads by retrying
 * with the page waitqueue callbacks armed where io_rw_should_retry() allows.
 *
 * @req:         the read request
 * @issue_flags: IO_URING_F_* flags; IO_URING_F_NONBLOCK selects a
 *               non-blocking attempt
 *
 * Return: a negative errno, IOU_ISSUE_SKIP_COMPLETE if the read was queued
 * (-EIOCBQUEUED), or the non-negative result of the last read attempt.
 */
static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io = req->async_data;
	struct kiocb *kiocb = &rw->kiocb;
	ssize_t ret;
	loff_t *ppos;

	if (io_do_buffer_select(req)) {
		ret = io_import_iovec(ITER_DEST, req, io, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	}
	ret = io_rw_init_file(req, FMODE_READ, READ);
	if (unlikely(ret))
		return ret;
	/* cqe.res holds the expected length until a result replaces it. */
	req->cqe.res = iov_iter_count(&io->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req, EPOLLIN)))
			return -EAGAIN;
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
	if (unlikely(ret))
		return ret;

	ret = io_iter_do_read(rw, &io->iter);

	/*
	 * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT
	 * issue, even though they should be returning -EAGAIN. To be safe,
	 * retry from blocking context for either.
	 */
	if (ret == -EOPNOTSUPP && force_nonblock)
		ret = -EAGAIN;

	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		/* If we can poll, just do that. */
		if (io_file_can_poll(req))
			return -EAGAIN;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		/* Nothing was consumed; enter the retry loop with zero progress. */
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		return IOU_ISSUE_SKIP_COMPLETE;
	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
	iov_iter_restore(&io->iter, &io->iter_state);

	do {
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
		iov_iter_advance(&io->iter, ret);
		if (!iov_iter_count(&io->iter))
			break;
		io->bytes_done += ret;
		iov_iter_save_state(&io->iter, &io->iter_state);

		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		req->cqe.res = iov_iter_count(&io->iter);
		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(rw, &io->iter);
		if (ret == -EIOCBQUEUED)
			return IOU_ISSUE_SKIP_COMPLETE;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
		iov_iter_restore(&io->iter, &io->iter_state);
	} while (ret > 0);
done:
	/* Common exit: nothing to release here, just report the result. */
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Issue handler for a single-shot read: run the read and, for any
 * non-negative outcome, let kiocb_done() finish the request. Negative
 * results from __io_read() are returned unchanged.
 */
int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	int res = __io_read(req, issue_flags);

	if (res < 0)
		return res;

	return kiocb_done(req, res, issue_flags);
}
|
|
|
|
|
2023-09-11 19:35:42 +00:00
|
|
|
/*
 * Issue handler for multishot reads.
 *
 * @req:         the multishot read request
 * @issue_flags: IO_URING_F_* flags; IO_URING_F_MULTISHOT marks an issue
 *               from the multishot retry path
 *
 * Return: -EBADFD if the file is not pollable; IOU_ISSUE_SKIP_COMPLETE or
 * -EAGAIN while the request stays armed (depending on whether
 * IO_URING_F_MULTISHOT is set); IOU_STOP_MULTISHOT or IOU_OK once the final
 * result has been set on the request.
 */
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned int cflags = 0;
	int ret;

	/*
	 * Multishot MUST be used on a pollable file
	 */
	if (!io_file_can_poll(req))
		return -EBADFD;

	ret = __io_read(req, issue_flags);

	/*
	 * If we get -EAGAIN, recycle our buffer and just let normal poll
	 * handling arm it.
	 */
	if (ret == -EAGAIN) {
		/*
		 * Reset rw->len to 0 again to avoid clamping future mshot
		 * reads, in case the buffer size varies.
		 */
		if (io_kbuf_recycle(req, issue_flags))
			rw->len = 0;
		if (issue_flags & IO_URING_F_MULTISHOT)
			return IOU_ISSUE_SKIP_COMPLETE;
		return -EAGAIN;
	} else if (ret <= 0) {
		/* Zero-length result or error: recycle the buffer and terminate. */
		io_kbuf_recycle(req, issue_flags);
		if (ret < 0)
			req_set_fail(req);
	} else {
		/*
		 * Any successful return value keeps the multishot read armed.
		 * Put our buffer and post a CQE with IORING_CQE_F_MORE. If
		 * posting the CQE fails, fall through to the termination path
		 * below. This request is then done.
		 */
		cflags = io_put_kbuf(req, ret, issue_flags);
		rw->len = 0; /* similarly to above, reset len to 0 */

		if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
			if (issue_flags & IO_URING_F_MULTISHOT) {
				/*
				 * Force retry, as we might have more data to
				 * be read and otherwise it won't get retried
				 * until (if ever) another poll is triggered.
				 */
				io_poll_multishot_retry(req);
				return IOU_ISSUE_SKIP_COMPLETE;
			}
			return -EAGAIN;
		}
	}

	/*
	 * Either an error or zero-length result, or we've hit overflow
	 * posting the CQE. For any multishot request, hitting overflow will
	 * terminate it.
	 */
	io_req_set_res(req, ret, cflags);
	io_req_rw_cleanup(req, issue_flags);
	if (issue_flags & IO_URING_F_MULTISHOT)
		return IOU_STOP_MULTISHOT;
	return IOU_OK;
}
|
|
|
|
|
io_uring/rw: fix missing NOWAIT check for O_DIRECT start write
When io_uring starts a write, it'll call kiocb_start_write() to bump the
super block rwsem, preventing any freezes from happening while that
write is in-flight. The freeze side will grab that rwsem for writing,
excluding any new writers from happening and waiting for existing writes
to finish. But io_uring unconditionally uses kiocb_start_write(), which
will block if someone is currently attempting to freeze the mount point.
This causes a deadlock where freeze is waiting for previous writes to
complete, but the previous writes cannot complete, as the task that is
supposed to complete them is blocked waiting on starting a new write.
This results in the following stuck trace showing that dependency with
the write blocked starting a new write:
task:fio state:D stack:0 pid:886 tgid:886 ppid:876
Call trace:
__switch_to+0x1d8/0x348
__schedule+0x8e8/0x2248
schedule+0x110/0x3f0
percpu_rwsem_wait+0x1e8/0x3f8
__percpu_down_read+0xe8/0x500
io_write+0xbb8/0xff8
io_issue_sqe+0x10c/0x1020
io_submit_sqes+0x614/0x2110
__arm64_sys_io_uring_enter+0x524/0x1038
invoke_syscall+0x74/0x268
el0_svc_common.constprop.0+0x160/0x238
do_el0_svc+0x44/0x60
el0_svc+0x44/0xb0
el0t_64_sync_handler+0x118/0x128
el0t_64_sync+0x168/0x170
INFO: task fsfreeze:7364 blocked for more than 15 seconds.
Not tainted 6.12.0-rc5-00063-g76aaf945701c #7963
with the attempting freezer stuck trying to grab the rwsem:
task:fsfreeze state:D stack:0 pid:7364 tgid:7364 ppid:995
Call trace:
__switch_to+0x1d8/0x348
__schedule+0x8e8/0x2248
schedule+0x110/0x3f0
percpu_down_write+0x2b0/0x680
freeze_super+0x248/0x8a8
do_vfs_ioctl+0x149c/0x1b18
__arm64_sys_ioctl+0xd0/0x1a0
invoke_syscall+0x74/0x268
el0_svc_common.constprop.0+0x160/0x238
do_el0_svc+0x44/0x60
el0_svc+0x44/0xb0
el0t_64_sync_handler+0x118/0x128
el0t_64_sync+0x168/0x170
Fix this by having the io_uring side honor IOCB_NOWAIT, and only attempt a
blocking grab of the super block rwsem if it isn't set. For normal issue
where IOCB_NOWAIT would always be set, this returns -EAGAIN which will
have io_uring core issue a blocking attempt of the write. That will in
turn also get completions run, ensuring forward progress.
Since freezing requires CAP_SYS_ADMIN in the first place, this isn't
something that can be triggered by a regular user.
Cc: stable@vger.kernel.org # 5.10+
Reported-by: Peter Mann <peter.mann@sh.cz>
Link: https://lore.kernel.org/io-uring/38c94aec-81c9-4f62-b44e-1d87f5597644@sh.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-31 14:05:44 +00:00
|
|
|
static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb)
|
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
bool ret;
|
|
|
|
|
|
|
|
if (!(req->flags & REQ_F_ISREG))
|
|
|
|
return true;
|
|
|
|
if (!(kiocb->ki_flags & IOCB_NOWAIT)) {
|
|
|
|
kiocb_start_write(kiocb);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
inode = file_inode(kiocb->ki_filp);
|
|
|
|
ret = sb_start_write_trylock(inode->i_sb);
|
|
|
|
if (ret)
|
|
|
|
__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-06-13 13:27:03 +00:00
|
|
|
int io_write(struct io_kiocb *req, unsigned int issue_flags)
|
|
|
|
{
|
2024-03-18 22:13:01 +00:00
|
|
|
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
2024-03-18 22:13:01 +00:00
|
|
|
struct io_async_rw *io = req->async_data;
|
2022-06-13 13:27:03 +00:00
|
|
|
struct kiocb *kiocb = &rw->kiocb;
|
|
|
|
ssize_t ret, ret2;
|
|
|
|
loff_t *ppos;
|
|
|
|
|
fs: Initial atomic write support
An atomic write is a write issued with torn-write protection, meaning
that for a power failure or any other hardware failure, all or none of the
data from the write will be stored, but never a mix of old and new data.
Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the
write is to be issued with torn-write prevention, according to special
alignment and length rules.
For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for
iocb->ki_flags field to indicate the same.
A call to statx will give the relevant atomic write info for a file:
- atomic_write_unit_min
- atomic_write_unit_max
- atomic_write_segments_max
Both min and max values must be a power-of-2.
Applications can avail of atomic write feature by ensuring that the total
length of a write is a power-of-2 in size and also sized between
atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications
must ensure that the write is at a naturally-aligned offset in the file
wrt the total write length. The value in atomic_write_segments_max
indicates the upper limit for IOV_ITER iovcnt.
Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the
flag set will have RWF_ATOMIC rejected and not just ignored.
Add a type argument to kiocb_set_rw_flags() to allows reads which have
RWF_ATOMIC set to be rejected.
Helper function generic_atomic_write_valid() can be used by FSes to verify
compliant writes. There we check for iov_iter type is for ubuf, which
implies iovcnt==1 for pwritev2(), which is an initial restriction for
atomic_write_segments_max. Initially the only user will be bdev file
operations write handler. We will rely on the block BIO submission path to
ensure write sizes are compliant for the bdev, so we don't need to check
atomic writes sizes yet.
Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com>
jpg: merge into single patch and much rewrite
Acked-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20240620125359.2684798-4-john.g.garry@oracle.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-06-20 12:53:52 +00:00
|
|
|
ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
|
2024-03-18 22:13:01 +00:00
|
|
|
if (unlikely(ret))
|
2022-06-13 13:27:03 +00:00
|
|
|
return ret;
|
2024-03-18 22:25:58 +00:00
|
|
|
req->cqe.res = iov_iter_count(&io->iter);
|
2022-06-13 13:27:03 +00:00
|
|
|
|
|
|
|
if (force_nonblock) {
|
|
|
|
/* If the file doesn't support async, just async punt */
|
2024-10-06 16:40:36 +00:00
|
|
|
if (unlikely(!io_file_supports_nowait(req, EPOLLOUT)))
|
2024-03-18 22:13:01 +00:00
|
|
|
goto ret_eagain;
|
2022-06-13 13:27:03 +00:00
|
|
|
|
2024-03-28 12:27:24 +00:00
|
|
|
/* Check if we can support NOWAIT. */
|
2022-06-16 21:22:18 +00:00
|
|
|
if (!(kiocb->ki_flags & IOCB_DIRECT) &&
|
2024-03-28 12:27:24 +00:00
|
|
|
!(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) &&
|
|
|
|
(req->flags & REQ_F_ISREG))
|
2024-03-18 22:13:01 +00:00
|
|
|
goto ret_eagain;
|
2022-06-13 13:27:03 +00:00
|
|
|
|
|
|
|
kiocb->ki_flags |= IOCB_NOWAIT;
|
|
|
|
} else {
|
|
|
|
/* Ensure we clear previously set non-block flag */
|
|
|
|
kiocb->ki_flags &= ~IOCB_NOWAIT;
|
|
|
|
}
|
|
|
|
|
|
|
|
ppos = io_kiocb_update_pos(req);
|
|
|
|
|
|
|
|
ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
|
2024-03-18 22:13:01 +00:00
|
|
|
if (unlikely(ret))
|
2022-06-16 09:21:57 +00:00
|
|
|
return ret;
|
2022-06-13 13:27:03 +00:00
|
|
|
|
io_uring/rw: fix missing NOWAIT check for O_DIRECT start write
When io_uring starts a write, it'll call kiocb_start_write() to bump the
super block rwsem, preventing any freezes from happening while that
write is in-flight. The freeze side will grab that rwsem for writing,
excluding any new writers from happening and waiting for existing writes
to finish. But io_uring unconditionally uses kiocb_start_write(), which
will block if someone is currently attempting to freeze the mount point.
This causes a deadlock where freeze is waiting for previous writes to
complete, but the previous writes cannot complete, as the task that is
supposed to complete them is blocked waiting on starting a new write.
This results in the following stuck trace showing that dependency with
the write blocked starting a new write:
task:fio state:D stack:0 pid:886 tgid:886 ppid:876
Call trace:
__switch_to+0x1d8/0x348
__schedule+0x8e8/0x2248
schedule+0x110/0x3f0
percpu_rwsem_wait+0x1e8/0x3f8
__percpu_down_read+0xe8/0x500
io_write+0xbb8/0xff8
io_issue_sqe+0x10c/0x1020
io_submit_sqes+0x614/0x2110
__arm64_sys_io_uring_enter+0x524/0x1038
invoke_syscall+0x74/0x268
el0_svc_common.constprop.0+0x160/0x238
do_el0_svc+0x44/0x60
el0_svc+0x44/0xb0
el0t_64_sync_handler+0x118/0x128
el0t_64_sync+0x168/0x170
INFO: task fsfreeze:7364 blocked for more than 15 seconds.
Not tainted 6.12.0-rc5-00063-g76aaf945701c #7963
with the attempting freezer stuck trying to grab the rwsem:
task:fsfreeze state:D stack:0 pid:7364 tgid:7364 ppid:995
Call trace:
__switch_to+0x1d8/0x348
__schedule+0x8e8/0x2248
schedule+0x110/0x3f0
percpu_down_write+0x2b0/0x680
freeze_super+0x248/0x8a8
do_vfs_ioctl+0x149c/0x1b18
__arm64_sys_ioctl+0xd0/0x1a0
invoke_syscall+0x74/0x268
el0_svc_common.constprop.0+0x160/0x238
do_el0_svc+0x44/0x60
el0_svc+0x44/0xb0
el0t_64_sync_handler+0x118/0x128
el0t_64_sync+0x168/0x170
Fix this by having the io_uring side honor IOCB_NOWAIT, and only attempt a
blocking grab of the super block rwsem if it isn't set. For normal issue
where IOCB_NOWAIT would always be set, this returns -EAGAIN which will
have io_uring core issue a blocking attempt of the write. That will in
turn also get completions run, ensuring forward progress.
Since freezing requires CAP_SYS_ADMIN in the first place, this isn't
something that can be triggered by a regular user.
Cc: stable@vger.kernel.org # 5.10+
Reported-by: Peter Mann <peter.mann@sh.cz>
Link: https://lore.kernel.org/io-uring/38c94aec-81c9-4f62-b44e-1d87f5597644@sh.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-31 14:05:44 +00:00
|
|
|
if (unlikely(!io_kiocb_start_write(req, kiocb)))
|
|
|
|
return -EAGAIN;
|
2022-06-13 13:27:03 +00:00
|
|
|
kiocb->ki_flags |= IOCB_WRITE;
|
|
|
|
|
|
|
|
if (likely(req->file->f_op->write_iter))
|
2024-05-21 20:11:44 +00:00
|
|
|
ret2 = req->file->f_op->write_iter(kiocb, &io->iter);
|
2022-06-13 13:27:03 +00:00
|
|
|
else if (req->file->f_op->write)
|
2024-03-18 22:25:58 +00:00
|
|
|
ret2 = loop_rw_iter(WRITE, rw, &io->iter);
|
2022-06-13 13:27:03 +00:00
|
|
|
else
|
|
|
|
ret2 = -EINVAL;
|
|
|
|
|
|
|
|
if (req->flags & REQ_F_REISSUE) {
|
|
|
|
req->flags &= ~REQ_F_REISSUE;
|
|
|
|
ret2 = -EAGAIN;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
|
|
|
|
* retry them without IOCB_NOWAIT.
|
|
|
|
*/
|
|
|
|
if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
|
|
|
|
ret2 = -EAGAIN;
|
|
|
|
/* no retry on NONBLOCK nor RWF_NOWAIT */
|
|
|
|
if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
|
|
|
|
goto done;
|
|
|
|
if (!force_nonblock || ret2 != -EAGAIN) {
|
|
|
|
/* IOPOLL retry should happen for io-wq threads */
|
|
|
|
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
|
2024-03-18 22:13:01 +00:00
|
|
|
goto ret_eagain;
|
2022-06-16 21:22:18 +00:00
|
|
|
|
|
|
|
if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
|
2022-06-16 21:22:19 +00:00
|
|
|
trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
|
|
|
|
req->cqe.res, ret2);
|
|
|
|
|
2022-06-16 21:22:18 +00:00
|
|
|
/* This is a partial write. The file pos has already been
|
|
|
|
* updated, setup the async struct to complete the request
|
|
|
|
* in the worker. Also update bytes_done to account for
|
|
|
|
* the bytes already written.
|
|
|
|
*/
|
2024-03-18 22:25:58 +00:00
|
|
|
iov_iter_save_state(&io->iter, &io->iter_state);
|
2024-03-18 22:13:01 +00:00
|
|
|
io->bytes_done += ret2;
|
2022-06-16 21:22:18 +00:00
|
|
|
|
2022-06-24 16:24:45 +00:00
|
|
|
if (kiocb->ki_flags & IOCB_WRITE)
|
2023-08-17 14:13:31 +00:00
|
|
|
io_req_end_write(req);
|
2024-03-18 22:13:01 +00:00
|
|
|
return -EAGAIN;
|
2022-06-16 21:22:18 +00:00
|
|
|
}
|
2022-06-13 13:27:03 +00:00
|
|
|
done:
|
2024-03-23 02:41:18 +00:00
|
|
|
return kiocb_done(req, ret2, issue_flags);
|
2022-06-13 13:27:03 +00:00
|
|
|
} else {
|
2024-03-18 22:13:01 +00:00
|
|
|
ret_eagain:
|
2024-03-18 22:25:58 +00:00
|
|
|
iov_iter_restore(&io->iter, &io->iter_state);
|
2024-03-18 22:13:01 +00:00
|
|
|
if (kiocb->ki_flags & IOCB_WRITE)
|
|
|
|
io_req_end_write(req);
|
|
|
|
return -EAGAIN;
|
2022-06-13 13:27:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-21 11:17:47 +00:00
|
|
|
void io_rw_fail(struct io_kiocb *req)
|
|
|
|
{
|
|
|
|
int res;
|
|
|
|
|
|
|
|
res = io_fixup_rw_res(req, req->cqe.res);
|
|
|
|
io_req_set_res(req, res, req->cqe.flags);
|
|
|
|
}
|
|
|
|
|
2024-11-01 09:19:57 +00:00
|
|
|
static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob,
|
|
|
|
unsigned int poll_flags)
|
|
|
|
{
|
|
|
|
struct file *file = req->file;
|
|
|
|
|
|
|
|
if (req->opcode == IORING_OP_URING_CMD) {
|
|
|
|
struct io_uring_cmd *ioucmd;
|
|
|
|
|
|
|
|
ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
|
|
|
return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
|
|
|
|
} else {
|
|
|
|
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
|
|
|
|
|
|
|
return file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
|
|
|
|
{
|
|
|
|
struct hrtimer_sleeper timer;
|
|
|
|
enum hrtimer_mode mode;
|
|
|
|
ktime_t kt;
|
|
|
|
u64 sleep_time;
|
|
|
|
|
|
|
|
if (req->flags & REQ_F_IOPOLL_STATE)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (ctx->hybrid_poll_time == LLONG_MAX)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Using half the running time to do schedule */
|
|
|
|
sleep_time = ctx->hybrid_poll_time / 2;
|
|
|
|
|
|
|
|
kt = ktime_set(0, sleep_time);
|
|
|
|
req->flags |= REQ_F_IOPOLL_STATE;
|
|
|
|
|
|
|
|
mode = HRTIMER_MODE_REL;
|
A rather large update for timekeeping and timers:
- The final step to get rid of auto-rearming posix-timers
posix-timers are currently auto-rearmed by the kernel when the signal
of the timer is ignored so that the timer signal can be delivered once
the corresponding signal is unignored.
This requires to throttle the timer to prevent a DoS by small intervals
and keeps the system pointlessly out of low power states for no value.
This is a long standing non-trivial problem due to the lock order of
posix-timer lock and the sighand lock along with life time issues as
the timer and the sigqueue have different life time rules.
Cure this by:
* Embedding the sigqueue into the timer struct to have the same life
time rules. Aside of that this also avoids the lookup of the timer
in the signal delivery and rearm path as it's just a always valid
container_of() now.
* Queuing ignored timer signals onto a seperate ignored list.
* Moving queued timer signals onto the ignored list when the signal is
switched to SIG_IGN before it could be delivered.
* Walking the ignored list when SIG_IGN is lifted and requeue the
signals to the actual signal lists. This allows the signal delivery
code to rearm the timer.
This also required to consolidate the signal delivery rules so they are
consistent across all situations. With that all self test scenarios
finally succeed.
- Core infrastructure for VFS multigrain timestamping
This is required to allow the kernel to use coarse grained time stamps
by default and switch to fine grained time stamps when inode attributes
are actively observed via getattr().
These changes have been provided to the VFS tree as well, so that the
VFS specific infrastructure could be built on top.
- Cleanup and consolidation of the sleep() infrastructure
* Move all sleep and timeout functions into one file
* Rework udelay() and ndelay() into proper documented inline functions
and replace the hardcoded magic numbers by proper defines.
* Rework the fsleep() implementation to take the reality of the timer
wheel granularity on different HZ values into account. Right now the
boundaries are hard coded time ranges which fail to provide the
requested accuracy on different HZ settings.
* Update documentation for all sleep/timeout related functions and fix
up stale documentation links all over the place
* Fixup a few usage sites
- Rework of timekeeping and adjtimex(2) to prepare for multiple PTP clocks
A system can have multiple PTP clocks which are participating in
seperate and independent PTP clock domains. So far the kernel only
considers the PTP clock which is based on CLOCK TAI relevant as that's
the clock which drives the timekeeping adjustments via the various user
space daemons through adjtimex(2).
The non TAI based clock domains are accessible via the file descriptor
based posix clocks, but their usability is very limited. They can't be
accessed fast as they always go all the way out to the hardware and
they cannot be utilized in the kernel itself.
As Time Sensitive Networking (TSN) gains traction it is required to
provide fast user and kernel space access to these clocks.
The approach taken is to utilize the timekeeping and adjtimex(2)
infrastructure to provide this access in a similar way how the kernel
provides access to clock MONOTONIC, REALTIME etc.
Instead of creating a duplicated infrastructure this rework converts
timekeeping and adjtimex(2) into generic functionality which operates
on pointers to data structures instead of using static variables.
This allows to provide time accessors and adjtimex(2) functionality for
the independent PTP clocks in a subsequent step.
- Consolidate hrtimer initialization
hrtimers are set up by initializing the data structure and then
seperately setting the callback function for historical reasons.
That's an extra unnecessary step and makes Rust support less straight
forward than it should be.
Provide a new set of hrtimer_setup*() functions and convert the core
code and a few usage sites of the less frequently used interfaces over.
The bulk of the htimer_init() to hrtimer_setup() conversion is already
prepared and scheduled for the next merge window.
- Drivers:
* Ensure that the global timekeeping clocksource is utilizing the
cluster 0 timer on MIPS multi-cluster systems.
Otherwise CPUs on different clusters use their cluster specific
clocksource which is not guaranteed to be synchronized with other
clusters.
* Mostly boring cleanups, fixes, improvements and code movement
-----BEGIN PGP SIGNATURE-----
iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmc7kPITHHRnbHhAbGlu
dXRyb25peC5kZQAKCRCmGPVMDXSYoZKkD/9OUL6fOJrDUmOYBa4QVeMyfTef4EaL
tvwIMM/29XQFeiq3xxCIn+EMnHjXn2lvIhYGQ7GKsbKYwvJ7ZBDpQb+UMhZ2nKI9
6D6BP6WomZohKeH2fZbJQAdqOi3KRYdvQdIsVZUexkqiaVPphRvOH9wOr45gHtZM
EyMRSotPlQTDqcrbUejDMEO94GyjDCYXRsyATLxjmTzL/N4xD4NRIiotjM2vL/a9
8MuCgIhrKUEyYlFoOxxeokBsF3kk3/ez2jlG9b/N8VLH3SYIc2zgL58FBgWxlmgG
bY71nVG3nUgEjxBd2dcXAVVqvb+5widk8p6O7xxOAQKTLMcJ4H0tQDkMnzBtUzvB
DGAJDHAmAr0g+ja9O35Pkhunkh4HYFIbq0Il4d1HMKObhJV0JumcKuQVxrXycdm3
UZfq3seqHsZJQbPgCAhlFU0/2WWScocbee9bNebGT33KVwSp5FoVv89C/6Vjb+vV
Gusc3thqrQuMAZW5zV8g4UcBAA/xH4PB0I+vHib+9XPZ4UQ7/6xKl2jE0kd5hX7n
AAUeZvFNFqIsY+B6vz+Jx/yzyM7u5cuXq87pof5EHVFzv56lyTp4ToGcOGYRgKH5
JXeYV1OxGziSDrd5vbf9CzdWMzqMvTefXrHbWrjkjhNOe8E1A8O88RZ5uRKZhmSw
hZZ4hdM9+3T7cg==
=2VC6
-----END PGP SIGNATURE-----
Merge tag 'timers-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer updates from Thomas Gleixner:
"A rather large update for timekeeping and timers:
- The final step to get rid of auto-rearming posix-timers
posix-timers are currently auto-rearmed by the kernel when the
signal of the timer is ignored so that the timer signal can be
delivered once the corresponding signal is unignored.
This requires to throttle the timer to prevent a DoS by small
intervals and keeps the system pointlessly out of low power states
for no value. This is a long standing non-trivial problem due to
the lock order of posix-timer lock and the sighand lock along with
life time issues as the timer and the sigqueue have different life
time rules.
Cure this by:
- Embedding the sigqueue into the timer struct to have the same
life time rules. Aside of that this also avoids the lookup of
the timer in the signal delivery and rearm path as it's just a
always valid container_of() now.
- Queuing ignored timer signals onto a seperate ignored list.
- Moving queued timer signals onto the ignored list when the
signal is switched to SIG_IGN before it could be delivered.
- Walking the ignored list when SIG_IGN is lifted and requeue the
signals to the actual signal lists. This allows the signal
delivery code to rearm the timer.
This also required to consolidate the signal delivery rules so they
are consistent across all situations. With that all self test
scenarios finally succeed.
- Core infrastructure for VFS multigrain timestamping
This is required to allow the kernel to use coarse grained time
stamps by default and switch to fine grained time stamps when inode
attributes are actively observed via getattr().
These changes have been provided to the VFS tree as well, so that
the VFS specific infrastructure could be built on top.
- Cleanup and consolidation of the sleep() infrastructure
- Move all sleep and timeout functions into one file
- Rework udelay() and ndelay() into proper documented inline
functions and replace the hardcoded magic numbers by proper
defines.
- Rework the fsleep() implementation to take the reality of the
timer wheel granularity on different HZ values into account.
Right now the boundaries are hard coded time ranges which fail
to provide the requested accuracy on different HZ settings.
- Update documentation for all sleep/timeout related functions
and fix up stale documentation links all over the place
- Fixup a few usage sites
- Rework of timekeeping and adjtimex(2) to prepare for multiple PTP
clocks
A system can have multiple PTP clocks which are participating in
seperate and independent PTP clock domains. So far the kernel only
considers the PTP clock which is based on CLOCK TAI relevant as
that's the clock which drives the timekeeping adjustments via the
various user space daemons through adjtimex(2).
The non TAI based clock domains are accessible via the file
descriptor based posix clocks, but their usability is very limited.
They can't be accessed fast as they always go all the way out to
the hardware and they cannot be utilized in the kernel itself.
As Time Sensitive Networking (TSN) gains traction it is required to
provide fast user and kernel space access to these clocks.
The approach taken is to utilize the timekeeping and adjtimex(2)
infrastructure to provide this access in a similar way how the
kernel provides access to clock MONOTONIC, REALTIME etc.
Instead of creating a duplicated infrastructure this rework
converts timekeeping and adjtimex(2) into generic functionality
which operates on pointers to data structures instead of using
static variables.
This allows to provide time accessors and adjtimex(2) functionality
for the independent PTP clocks in a subsequent step.
- Consolidate hrtimer initialization
hrtimers are set up by initializing the data structure and then
seperately setting the callback function for historical reasons.
That's an extra unnecessary step and makes Rust support less
straight forward than it should be.
Provide a new set of hrtimer_setup*() functions and convert the
core code and a few usage sites of the less frequently used
interfaces over.
The bulk of the htimer_init() to hrtimer_setup() conversion is
already prepared and scheduled for the next merge window.
- Drivers:
- Ensure that the global timekeeping clocksource is utilizing the
cluster 0 timer on MIPS multi-cluster systems.
Otherwise CPUs on different clusters use their cluster specific
clocksource which is not guaranteed to be synchronized with
other clusters.
- Mostly boring cleanups, fixes, improvements and code movement"
* tag 'timers-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (140 commits)
posix-timers: Fix spurious warning on double enqueue versus do_exit()
clocksource/drivers/arm_arch_timer: Use of_property_present() for non-boolean properties
clocksource/drivers/gpx: Remove redundant casts
clocksource/drivers/timer-ti-dm: Fix child node refcount handling
dt-bindings: timer: actions,owl-timer: convert to YAML
clocksource/drivers/ralink: Add Ralink System Tick Counter driver
clocksource/drivers/mips-gic-timer: Always use cluster 0 counter as clocksource
clocksource/drivers/timer-ti-dm: Don't fail probe if int not found
clocksource/drivers:sp804: Make user selectable
clocksource/drivers/dw_apb: Remove unused dw_apb_clockevent functions
hrtimers: Delete hrtimer_init_on_stack()
alarmtimer: Switch to use hrtimer_setup() and hrtimer_setup_on_stack()
io_uring: Switch to use hrtimer_setup_on_stack()
sched/idle: Switch to use hrtimer_setup_on_stack()
hrtimers: Delete hrtimer_init_sleeper_on_stack()
wait: Switch to use hrtimer_setup_sleeper_on_stack()
timers: Switch to use hrtimer_setup_sleeper_on_stack()
net: pktgen: Switch to use hrtimer_setup_sleeper_on_stack()
futex: Switch to use hrtimer_setup_sleeper_on_stack()
fs/aio: Switch to use hrtimer_setup_sleeper_on_stack()
...
2024-11-20 00:35:06 +00:00
|
|
|
hrtimer_setup_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
|
2024-11-01 09:19:57 +00:00
|
|
|
hrtimer_set_expires(&timer.timer, kt);
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
|
|
hrtimer_sleeper_start_expires(&timer, mode);
|
|
|
|
|
|
|
|
if (timer.task)
|
|
|
|
io_schedule();
|
|
|
|
|
|
|
|
hrtimer_cancel(&timer.timer);
|
|
|
|
__set_current_state(TASK_RUNNING);
|
|
|
|
destroy_hrtimer_on_stack(&timer.timer);
|
|
|
|
return sleep_time;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int io_uring_hybrid_poll(struct io_kiocb *req,
|
|
|
|
struct io_comp_batch *iob, unsigned int poll_flags)
|
|
|
|
{
|
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
u64 runtime, sleep_time;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
sleep_time = io_hybrid_iopoll_delay(ctx, req);
|
|
|
|
ret = io_uring_classic_poll(req, iob, poll_flags);
|
|
|
|
runtime = ktime_get_ns() - req->iopoll_start - sleep_time;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use minimum sleep time if we're polling devices with different
|
|
|
|
* latencies. We could get more completions from the faster ones.
|
|
|
|
*/
|
|
|
|
if (ctx->hybrid_poll_time > runtime)
|
|
|
|
ctx->hybrid_poll_time = runtime;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-06-13 13:27:03 +00:00
|
|
|
/*
 * Poll the requests queued on ctx->iopoll_list and reap the completed ones.
 *
 * @ctx:           ring whose iopoll list is processed
 * @force_nonspin: if true, always poll with BLK_POLL_ONESHOT
 *
 * Returns a negative error from the poll handler, 0 if nothing was reaped,
 * or the number of requests handed to __io_submit_flush_completions().
 */
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	unsigned int poll_flags = (ctx->poll_multi_queue || force_nonspin) ?
					BLK_POLL_ONESHOT : 0;
	DEFINE_IO_COMP_BATCH(iob);
	int reaped = 0;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		ret = (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) ?
			io_uring_hybrid_poll(req, &iob, poll_flags) :
			io_uring_classic_poll(req, &iob, poll_flags);

		if (unlikely(ret < 0))
			return ret;
		/* Something was found: don't spin on the remaining requests. */
		if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(&iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	if (!rq_list_empty(&iob.req_list)) {
		iob.complete(&iob);
	} else if (!pos) {
		/* Walked the whole list and nothing completed. */
		return 0;
	}

	/* Count the run of completed requests that follows 'start'. */
	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;

		reaped++;
		req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0);
		if (req->opcode != IORING_OP_URING_CMD)
			io_req_rw_cleanup(req, 0);
	}
	if (unlikely(!reaped))
		return 0;

	/* Detach the completed run from the iopoll list. */
	if (start)
		pos = start->next;
	else
		pos = ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);

	/* The detached run is handed over via compl_reqs, which must be empty. */
	if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
		return 0;
	ctx->submit_state.compl_reqs.first = pos;
	__io_submit_flush_completions(ctx);

	return reaped;
}
|
2024-03-18 22:13:01 +00:00
|
|
|
|
2024-03-20 21:19:44 +00:00
|
|
|
void io_rw_cache_free(const void *entry)
|
2024-03-18 22:13:01 +00:00
|
|
|
{
|
2024-03-20 21:19:44 +00:00
|
|
|
struct io_async_rw *rw = (struct io_async_rw *) entry;
|
2024-03-18 22:13:01 +00:00
|
|
|
|
2024-03-18 22:31:44 +00:00
|
|
|
if (rw->free_iovec) {
|
|
|
|
kasan_mempool_unpoison_object(rw->free_iovec,
|
|
|
|
rw->free_iov_nr * sizeof(struct iovec));
|
|
|
|
io_rw_iovec_free(rw);
|
|
|
|
}
|
2024-03-18 22:13:01 +00:00
|
|
|
kfree(rw);
|
|
|
|
}
|