mirror of
https://github.com/torvalds/linux.git
synced 2024-11-21 19:41:42 +00:00
8350142a4b
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmc7S3kQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpjHVEAC+CITBEcGy+S0IK0BpIAhuA+A621LtqBwy 0z/4MZKXMqvWxcFGQJ9Zr8MvxUnY4KFcssiaR5zk+I9TczNu7mLMuPYD1Gb0Klgz mwuFOylo1CAAC41IABYZZ/0qWbTaW0p8tpaGsTbTNk3tBxuMLB550+APAqC1OE9U bb7rP+FHc5+YGI9/7JNWt7NNTSHvVSO6oxjltCxHr1dRg93Jtr2jaY6letY3epFz TCFyfJlDtK8fPwtYRyG51M4g2Vdp9/4qsfPqvnXwUr9MdWaVh5/TFkyvqDi5sCKM zdK/sjRiimYzvqqKg6bzgYscITUPNk2TG6ZJq5U1L7lrglzVY69c7GIUnNzPrL/y AxQsR5Guxz3bRNYWZ4BKJDH+NNB+cgIFEXDsv72qoUy3HTzA6wOPZYxfjhZhKuG/ DjRwM7NGx5oPiKtpK99IulZttXdmtkH0csuLwKmOzrQskQdTuWyrEtU7UQql7oQ5 Rt3DhMXouzYZMicB8U5Q9gO2I3WN+2VVxXl4sa00LG8KsT6PzLnz4Q2k/1c83S6J rRivRbZAbZ1+BqKvF8T7GgzLCeaLgzbeoxmxj6xr87pf3SYEs2KhQeQ+n/C0HTOt GOcG1+bvh7t2aSvlBPKVCExWI4erwG6wXFhfGKsLW9CmwIMqRNxdePpRWe3Cueyp M3QRJuvTxQ== =bDvp -----END PGP SIGNATURE----- Merge tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux Pull io_uring updates from Jens Axboe: - Cleanups of the eventfd handling code, making it fully private. - Support for sending a sync message to another ring, without having a ring available to send a normal async message. - Get rid of the separate unlocked hash table, unify everything around the single locked one. - Add support for ring resizing. It can be hard to appropriately size the CQ ring upfront, if the application doesn't know how busy it will be. This results in applications sizing rings for the most busy case, which can be wasteful. With ring resizing, they can start small and grow the ring, if needed. - Add support for fixed wait regions, rather than needing to copy the same wait data tons of times for each wait operation. - Rewrite the resource node handling, which before was serialized per ring. This caused issues with particularly fixed files, where one file waiting on IO could hold up putting and freeing of other unrelated files. Now each node is handled separately. New code is much simpler too, and was a net 250 line reduction in code. 
- Add support for just doing partial buffer clones, rather than always cloning the entire buffer table. - Series adding static NAPI support, where a specific NAPI instance is used rather than having a list of them available that need lookup. - Add support for mapped regions, and also convert the fixed wait support mentioned above to that concept. This avoids doing special mappings for various planned features, and folds the existing registered wait into that too. - Add support for hybrid IO polling, which is a variant of strict IOPOLL but with an initial sleep delay to avoid spinning too early and wasting resources on devices that aren't necessarily in the < 5 usec category wrt latencies. - Various cleanups and little fixes. * tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux: (79 commits) io_uring/region: fix error codes after failed vmap io_uring: restore back registered wait arguments io_uring: add memory region registration io_uring: introduce concept of memory regions io_uring: temporarily disable registered waits io_uring: disable ENTER_EXT_ARG_REG for IOPOLL io_uring: fortify io_pin_pages with a warning switch io_msg_ring() to CLASS(fd) io_uring: fix invalid hybrid polling ctx leaks io_uring/uring_cmd: fix buffer index retrieval io_uring/rsrc: add & apply io_req_assign_buf_node() io_uring/rsrc: remove '->ctx_ptr' of 'struct io_rsrc_node' io_uring/rsrc: pass 'struct io_ring_ctx' reference to rsrc helpers io_uring: avoid normal tw intermediate fallback io_uring/napi: add static napi tracking strategy io_uring/napi: clean up __io_napi_do_busy_loop io_uring/napi: Use lock guards io_uring/napi: improve __io_napi_add io_uring/napi: fix io_napi_entry RCU accesses io_uring/napi: protect concurrent io_napi_entry timeout accesses ...
377 lines
9.9 KiB
C
377 lines
9.9 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/file.h>
|
|
#include <linux/io_uring/cmd.h>
|
|
#include <linux/io_uring/net.h>
|
|
#include <linux/security.h>
|
|
#include <linux/nospec.h>
|
|
#include <net/sock.h>
|
|
|
|
#include <uapi/linux/io_uring.h>
|
|
#include <asm/ioctls.h>
|
|
|
|
#include "io_uring.h"
|
|
#include "alloc_cache.h"
|
|
#include "rsrc.h"
|
|
#include "uring_cmd.h"
|
|
|
|
static struct uring_cache *io_uring_async_get(struct io_kiocb *req)
|
|
{
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
struct uring_cache *cache;
|
|
|
|
cache = io_alloc_cache_get(&ctx->uring_cache);
|
|
if (cache) {
|
|
req->flags |= REQ_F_ASYNC_DATA;
|
|
req->async_data = cache;
|
|
return cache;
|
|
}
|
|
if (!io_alloc_async_data(req))
|
|
return req->async_data;
|
|
return NULL;
|
|
}
|
|
|
|
static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
|
|
{
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
|
struct uring_cache *cache = req->async_data;
|
|
|
|
if (issue_flags & IO_URING_F_UNLOCKED)
|
|
return;
|
|
if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) {
|
|
ioucmd->sqe = NULL;
|
|
req->async_data = NULL;
|
|
req->flags &= ~REQ_F_ASYNC_DATA;
|
|
}
|
|
}
|
|
|
|
/*
 * Ask the owners of cancelable uring_cmd requests on @ctx to cancel them.
 *
 * Walks ctx->cancelable_uring_cmd and, for every request still flagged
 * IORING_URING_CMD_CANCELABLE, calls the file's ->uring_cmd() with
 * IO_URING_F_CANCEL | IO_URING_F_COMPLETE_DEFER. Unless @cancel_all is
 * set, only requests whose ->tctx matches @tctx are considered.
 *
 * The caller must hold ctx->uring_lock. Deferred completions are flushed
 * before returning.
 *
 * Returns true if at least one command was sent a cancel request.
 */
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
				   struct io_uring_task *tctx, bool cancel_all)
{
	const unsigned int cancel_flags = IO_URING_F_CANCEL |
					  IO_URING_F_COMPLETE_DEFER;
	struct hlist_node *next;
	struct io_kiocb *req;
	bool found = false;

	lockdep_assert_held(&ctx->uring_lock);

	hlist_for_each_entry_safe(req, next, &ctx->cancelable_uring_cmd,
				  hash_node) {
		struct io_uring_cmd *cmd;

		/* skip requests owned by other tasks unless cancelling all */
		if (!(cancel_all || req->tctx == tctx))
			continue;

		cmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
		if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
			continue;

		/* ->sqe isn't available if no async data */
		if (!req_has_async_data(req))
			cmd->sqe = NULL;
		req->file->f_op->uring_cmd(cmd, cancel_flags);
		found = true;
	}
	io_submit_flush_completions(ctx);
	return found;
}
|
|
|
|
static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
|
|
unsigned int issue_flags)
|
|
{
|
|
struct io_kiocb *req = cmd_to_io_kiocb(cmd);
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
|
|
return;
|
|
|
|
cmd->flags &= ~IORING_URING_CMD_CANCELABLE;
|
|
io_ring_submit_lock(ctx, issue_flags);
|
|
hlist_del(&req->hash_node);
|
|
io_ring_submit_unlock(ctx, issue_flags);
|
|
}
|
|
|
|
/*
|
|
* Mark this command as concelable, then io_uring_try_cancel_uring_cmd()
|
|
* will try to cancel this issued command by sending ->uring_cmd() with
|
|
* issue_flags of IO_URING_F_CANCEL.
|
|
*
|
|
* The command is guaranteed to not be done when calling ->uring_cmd()
|
|
* with IO_URING_F_CANCEL, but it is driver's responsibility to deal
|
|
* with race between io_uring canceling and normal completion.
|
|
*/
|
|
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
|
|
unsigned int issue_flags)
|
|
{
|
|
struct io_kiocb *req = cmd_to_io_kiocb(cmd);
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
|
|
cmd->flags |= IORING_URING_CMD_CANCELABLE;
|
|
io_ring_submit_lock(ctx, issue_flags);
|
|
hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd);
|
|
io_ring_submit_unlock(ctx, issue_flags);
|
|
}
|
|
}
|
|
EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
|
|
|
|
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
|
|
{
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
|
unsigned int flags = IO_URING_F_COMPLETE_DEFER;
|
|
|
|
if (current->flags & (PF_EXITING | PF_KTHREAD))
|
|
flags |= IO_URING_F_TASK_DEAD;
|
|
|
|
/* task_work executor checks the deffered list completion */
|
|
ioucmd->task_work_cb(ioucmd, flags);
|
|
}
|
|
|
|
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
|
|
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
|
|
unsigned flags)
|
|
{
|
|
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
|
|
|
|
ioucmd->task_work_cb = task_work_cb;
|
|
req->io_task_work.func = io_uring_cmd_work;
|
|
__io_req_task_work_add(req, flags);
|
|
}
|
|
EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task);
|
|
|
|
/* Store the two extra result words in @req's big_cqe. */
static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
					  u64 extra1, u64 extra2)
{
	req->big_cqe.extra2 = extra2;
	req->big_cqe.extra1 = extra1;
}
|
|
|
|
/*
|
|
* Called by consumers of io_uring_cmd, if they originally returned
|
|
* -EIOCBQUEUED upon receiving the command.
|
|
*/
|
|
void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
|
|
unsigned issue_flags)
|
|
{
|
|
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
|
|
|
|
io_uring_cmd_del_cancelable(ioucmd, issue_flags);
|
|
|
|
if (ret < 0)
|
|
req_set_fail(req);
|
|
|
|
io_req_set_res(req, ret, 0);
|
|
if (req->ctx->flags & IORING_SETUP_CQE32)
|
|
io_req_set_cqe32_extra(req, res2, 0);
|
|
io_req_uring_cleanup(req, issue_flags);
|
|
if (req->ctx->flags & IORING_SETUP_IOPOLL) {
|
|
/* order with io_iopoll_req_issued() checking ->iopoll_complete */
|
|
smp_store_release(&req->iopoll_completed, 1);
|
|
} else if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
|
|
if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED))
|
|
return;
|
|
io_req_complete_defer(req);
|
|
} else {
|
|
req->io_task_work.func = io_req_task_complete;
|
|
io_req_task_work_add(req);
|
|
}
|
|
}
|
|
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
|
|
|
|
static int io_uring_cmd_prep_setup(struct io_kiocb *req,
|
|
const struct io_uring_sqe *sqe)
|
|
{
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
|
struct uring_cache *cache;
|
|
|
|
cache = io_uring_async_get(req);
|
|
if (unlikely(!cache))
|
|
return -ENOMEM;
|
|
|
|
if (!(req->flags & REQ_F_FORCE_ASYNC)) {
|
|
/* defer memcpy until we need it */
|
|
ioucmd->sqe = sqe;
|
|
return 0;
|
|
}
|
|
|
|
memcpy(req->async_data, sqe, uring_sqe_size(req->ctx));
|
|
ioucmd->sqe = req->async_data;
|
|
return 0;
|
|
}
|
|
|
|
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|
{
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
|
|
|
if (sqe->__pad1)
|
|
return -EINVAL;
|
|
|
|
ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
|
|
if (ioucmd->flags & ~IORING_URING_CMD_MASK)
|
|
return -EINVAL;
|
|
|
|
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
struct io_rsrc_node *node;
|
|
u16 index = READ_ONCE(sqe->buf_index);
|
|
|
|
node = io_rsrc_node_lookup(&ctx->buf_table, index);
|
|
if (unlikely(!node))
|
|
return -EFAULT;
|
|
/*
|
|
* Pi node upfront, prior to io_uring_cmd_import_fixed()
|
|
* being called. This prevents destruction of the mapped buffer
|
|
* we'll need at actual import time.
|
|
*/
|
|
io_req_assign_buf_node(req, node);
|
|
}
|
|
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
|
|
|
|
return io_uring_cmd_prep_setup(req, sqe);
|
|
}
|
|
|
|
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
|
|
{
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
struct file *file = req->file;
|
|
int ret;
|
|
|
|
if (!file->f_op->uring_cmd)
|
|
return -EOPNOTSUPP;
|
|
|
|
ret = security_uring_cmd(ioucmd);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (ctx->flags & IORING_SETUP_SQE128)
|
|
issue_flags |= IO_URING_F_SQE128;
|
|
if (ctx->flags & IORING_SETUP_CQE32)
|
|
issue_flags |= IO_URING_F_CQE32;
|
|
if (ctx->compat)
|
|
issue_flags |= IO_URING_F_COMPAT;
|
|
if (ctx->flags & IORING_SETUP_IOPOLL) {
|
|
if (!file->f_op->uring_cmd_iopoll)
|
|
return -EOPNOTSUPP;
|
|
issue_flags |= IO_URING_F_IOPOLL;
|
|
req->iopoll_completed = 0;
|
|
}
|
|
|
|
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
|
|
if (ret == -EAGAIN) {
|
|
struct uring_cache *cache = req->async_data;
|
|
|
|
if (ioucmd->sqe != (void *) cache)
|
|
memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx));
|
|
return -EAGAIN;
|
|
} else if (ret == -EIOCBQUEUED) {
|
|
return -EIOCBQUEUED;
|
|
}
|
|
|
|
if (ret < 0)
|
|
req_set_fail(req);
|
|
io_req_uring_cleanup(req, issue_flags);
|
|
io_req_set_res(req, ret, 0);
|
|
return IOU_OK;
|
|
}
|
|
|
|
/*
 * Import a range of the request's registered buffer into @iter.
 *
 * @ubuf/@len describe the range and @rw the direction; they are passed
 * through to io_import_fixed() together with the buffer of the node that
 * was assigned to the request.
 *
 * Returns the result of io_import_fixed(), or -EFAULT when the request
 * has no buffer node.
 */
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
			      struct iov_iter *iter, void *ioucmd)
{
	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
	struct io_rsrc_node *buf_node = req->buf_node;

	/* Must have had rsrc_node assigned at prep time */
	if (!buf_node)
		return -EFAULT;

	return io_import_fixed(rw, iter, buf_node->buf, ubuf, len);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
|
|
|
|
/* Queue the request backing @ioucmd via io_req_queue_iowq(). */
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
	io_req_queue_iowq(cmd_to_io_kiocb(ioucmd));
}
|
|
|
|
static inline int io_uring_cmd_getsockopt(struct socket *sock,
|
|
struct io_uring_cmd *cmd,
|
|
unsigned int issue_flags)
|
|
{
|
|
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
|
|
int optlen, optname, level, err;
|
|
void __user *optval;
|
|
|
|
level = READ_ONCE(cmd->sqe->level);
|
|
if (level != SOL_SOCKET)
|
|
return -EOPNOTSUPP;
|
|
|
|
optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval));
|
|
optname = READ_ONCE(cmd->sqe->optname);
|
|
optlen = READ_ONCE(cmd->sqe->optlen);
|
|
|
|
err = do_sock_getsockopt(sock, compat, level, optname,
|
|
USER_SOCKPTR(optval),
|
|
KERNEL_SOCKPTR(&optlen));
|
|
if (err)
|
|
return err;
|
|
|
|
/* On success, return optlen */
|
|
return optlen;
|
|
}
|
|
|
|
static inline int io_uring_cmd_setsockopt(struct socket *sock,
|
|
struct io_uring_cmd *cmd,
|
|
unsigned int issue_flags)
|
|
{
|
|
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
|
|
int optname, optlen, level;
|
|
void __user *optval;
|
|
sockptr_t optval_s;
|
|
|
|
optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval));
|
|
optname = READ_ONCE(cmd->sqe->optname);
|
|
optlen = READ_ONCE(cmd->sqe->optlen);
|
|
level = READ_ONCE(cmd->sqe->level);
|
|
optval_s = USER_SOCKPTR(optval);
|
|
|
|
return do_sock_setsockopt(sock, compat, level, optname, optval_s,
|
|
optlen);
|
|
}
|
|
|
|
#if defined(CONFIG_NET)
/*
 * uring_cmd handler for sockets; dispatches on the command opcode.
 *
 *  SOCKET_URING_OP_SIOCINQ / SOCKET_URING_OP_SIOCOUTQ
 *	call the protocol's ->ioctl() with SIOCINQ / SIOCOUTQ and return
 *	the value it produced, or its error.
 *  SOCKET_URING_OP_GETSOCKOPT / SOCKET_URING_OP_SETSOCKOPT
 *	forwarded to io_uring_cmd_getsockopt() / io_uring_cmd_setsockopt().
 *
 * Returns -EOPNOTSUPP for unknown opcodes, and for every opcode when the
 * socket's protocol is missing or has no ->ioctl().
 */
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct socket *sock = cmd->file->private_data;
	struct sock *sk = sock->sk;
	struct proto *prot = READ_ONCE(sk->sk_prot);
	int ret, arg = 0;

	if (!prot || !prot->ioctl)
		return -EOPNOTSUPP;

	/*
	 * Dispatch on the opcode that io_uring_cmd_prep() sampled once with
	 * READ_ONCE() and cached in ->cmd_op, instead of doing a plain
	 * re-read through cmd->sqe. ->sqe may still point at the SQE handed
	 * to prep (the copy into async data is deferred), and every other
	 * access to that memory in this file goes through READ_ONCE().
	 */
	switch (cmd->cmd_op) {
	case SOCKET_URING_OP_SIOCINQ:
		ret = prot->ioctl(sk, SIOCINQ, &arg);
		if (ret)
			return ret;
		return arg;
	case SOCKET_URING_OP_SIOCOUTQ:
		ret = prot->ioctl(sk, SIOCOUTQ, &arg);
		if (ret)
			return ret;
		return arg;
	case SOCKET_URING_OP_GETSOCKOPT:
		return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
	case SOCKET_URING_OP_SETSOCKOPT:
		return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
	default:
		return -EOPNOTSUPP;
	}
}
EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
#endif
|