From 2757be22c0f4c06d359f802fa1fc57286fa26975 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 4 Feb 2022 14:51:13 +0000 Subject: [PATCH 01/42] io_uring: remove trace for eventfd The information on whether eventfd is registered is not very useful and would result in the tracepoint being enclosed in an rcu_readlock in a later patch that tries to avoid ring quiesce for registering eventfd. Suggested-by: Jens Axboe Signed-off-by: Usama Arif Link: https://lore.kernel.org/r/20220204145117.1186568-2-usama.arif@bytedance.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- include/trace/events/io_uring.h | 13 +++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4715980e9015..8bddfacc3382 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -11179,8 +11179,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); - trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, - ctx->cq_ev_fd != NULL, ret); + trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); out_fput: fdput(f); return ret; diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 7346f0164cf4..098beda7601a 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -57,10 +57,9 @@ TRACE_EVENT(io_uring_create, * @opcode: describes which operation to perform * @nr_user_files: number of registered files * @nr_user_bufs: number of registered buffers - * @cq_ev_fd: whether eventfs registered or not * @ret: return code * - * Allows to trace fixed files/buffers/eventfds, that could be registered to + * Allows to trace fixed files/buffers, that could be registered to * avoid an overhead of getting references to them for every operation. This * event, together with io_uring_file_get, can provide a full picture of how * much overhead one can reduce via fixing. @@ -68,16 +67,15 @@ TRACE_EVENT(io_uring_create, TRACE_EVENT(io_uring_register, TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files, - unsigned nr_bufs, bool eventfd, long ret), + unsigned nr_bufs, long ret), - TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret), + TP_ARGS(ctx, opcode, nr_files, nr_bufs, ret), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned, opcode ) __field( unsigned, nr_files ) __field( unsigned, nr_bufs ) - __field( bool, eventfd ) __field( long, ret ) ), @@ -86,14 +84,13 @@ TRACE_EVENT(io_uring_register, __entry->opcode = opcode; __entry->nr_files = nr_files; __entry->nr_bufs = nr_bufs; - __entry->eventfd = eventfd; __entry->ret = ret; ), TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, " - "eventfd %d, ret %ld", + "ret %ld", __entry->ctx, __entry->opcode, __entry->nr_files, - __entry->nr_bufs, __entry->eventfd, __entry->ret) + __entry->nr_bufs, __entry->ret) ); /** From 77bc59b498174ea4f120d477d2cd0cf90fc58235 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 4 Feb 2022 14:51:14 +0000 Subject: [PATCH 02/42] io_uring: avoid ring quiesce while registering/unregistering eventfd This is done by creating a new RCU data structure (io_ev_fd) as part of io_ring_ctx that holds the eventfd_ctx. 
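Condensed, the lifecycle of the new structure looks like this (condensed from io_eventfd_register(), io_eventfd_signal() and io_eventfd_unregister() in the diff below):

    struct io_ev_fd {
        struct eventfd_ctx *cq_ev_fd;
        struct rcu_head rcu;
    };

    /* register, under uring_lock: publish the new structure */
    rcu_assign_pointer(ctx->io_ev_fd, ev_fd);

    /* completion path: read side, no quiesce needed */
    rcu_read_lock();
    ev_fd = rcu_dereference(ctx->io_ev_fd);
    if (ev_fd)
        eventfd_signal(ev_fd->cq_ev_fd, 1);
    rcu_read_unlock();

    /* unregister, under uring_lock: retire, free after a grace period */
    rcu_assign_pointer(ctx->io_ev_fd, NULL);
    call_rcu(&ev_fd->rcu, io_eventfd_put);
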
The function io_eventfd_signal is executed under rcu_read_lock with a single rcu_dereference to io_ev_fd so that if another thread unregisters the eventfd while io_eventfd_signal is still being executed, the eventfd_signal for which io_eventfd_signal was called completes successfully. The process of registering/unregistering eventfd is already done under uring_lock so multiple threads won't enter a race condition while registering/unregistering eventfd. With the above approach ring quiesce can be avoided which is much more expensive then using RCU lock. On the system tested, io_uring_register with IORING_REGISTER_EVENTFD takes less than 1ms with RCU lock, compared to 15ms before with ring quiesce. Signed-off-by: Usama Arif Link: https://lore.kernel.org/r/20220204145117.1186568-3-usama.arif@bytedance.com [axboe: long line fixups] Signed-off-by: Jens Axboe --- fs/io_uring.c | 89 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 21 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8bddfacc3382..6c35e88be4ea 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -326,6 +326,11 @@ struct io_submit_state { struct blk_plug plug; }; +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + struct rcu_head rcu; +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { @@ -399,7 +404,7 @@ struct io_ring_ctx { struct { unsigned cached_cq_tail; unsigned cq_entries; - struct eventfd_ctx *cq_ev_fd; + struct io_ev_fd __rcu *io_ev_fd; struct wait_queue_head cq_wait; unsigned cq_extra; atomic_t cq_timeouts; @@ -1726,13 +1731,36 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) return &rings->cqes[tail & mask]; } -static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +static void io_eventfd_signal(struct io_ring_ctx *ctx) { - if (likely(!ctx->cq_ev_fd)) - return false; + struct io_ev_fd *ev_fd; + + /* Return quickly if ctx->io_ev_fd doesn't exist */ + if (likely(!rcu_dereference_raw(ctx->io_ev_fd))) + return; + + rcu_read_lock(); + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists incase an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. 
+ */ + if (unlikely(!ev_fd)) + goto out; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return false; - return !ctx->eventfd_async || io_wq_current_is_worker(); + goto out; + + if (!ctx->eventfd_async || io_wq_current_is_worker()) + eventfd_signal(ev_fd->cq_ev_fd, 1); + +out: + rcu_read_unlock(); } /* @@ -1751,8 +1779,7 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) */ if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); - if (io_should_trigger_evfd(ctx)) - eventfd_signal(ctx->cq_ev_fd, 1); + io_eventfd_signal(ctx); } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) @@ -1764,8 +1791,7 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) if (waitqueue_active(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); } - if (io_should_trigger_evfd(ctx)) - eventfd_signal(ctx->cq_ev_fd, 1); + io_eventfd_signal(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -9361,31 +9387,50 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) { + struct io_ev_fd *ev_fd; __s32 __user *fds = arg; - int fd; + int fd, ret; - if (ctx->cq_ev_fd) + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) return -EBUSY; if (copy_from_user(&fd, fds, sizeof(*fds))) return -EFAULT; - ctx->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ctx->cq_ev_fd)) { - int ret = PTR_ERR(ctx->cq_ev_fd); + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; - ctx->cq_ev_fd = NULL; + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); return ret; } - return 0; + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return ret; +} + +static void io_eventfd_put(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); } static int io_eventfd_unregister(struct io_ring_ctx *ctx) { - if (ctx->cq_ev_fd) { - eventfd_ctx_put(ctx->cq_ev_fd); - ctx->cq_ev_fd = NULL; + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + rcu_assign_pointer(ctx->io_ev_fd, NULL); + call_rcu(&ev_fd->rcu, io_eventfd_put); return 0; } @@ -9450,8 +9495,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_sqe_files_unregister(ctx); if (ctx->rings) __io_cqring_overflow_flush(ctx, true); - mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); + mutex_unlock(&ctx->uring_lock); io_destroy_buffers(ctx); if (ctx->sq_creds) put_cred(ctx->sq_creds); @@ -10968,6 +11013,8 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_FILES: case IORING_UNREGISTER_FILES: case IORING_REGISTER_FILES_UPDATE: + case IORING_REGISTER_EVENTFD: + case IORING_UNREGISTER_EVENTFD: case IORING_REGISTER_PROBE: case IORING_REGISTER_PERSONALITY: case IORING_UNREGISTER_PERSONALITY: From c75312dd592b31427caa88170b61fdc3ae5b2891 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 4 Feb 2022 14:51:15 +0000 Subject: [PATCH 03/42] io_uring: avoid ring quiesce while registering async eventfd This is done using the RCU data structure (io_ev_fd). eventfd_async is moved from io_ring_ctx to io_ev_fd which is RCU protected hence avoiding ring quiesce which is much more expensive than an RCU lock. 
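For context, both registration opcodes are reached from userspace through io_uring_register() with a pointer to a single eventfd descriptor. A minimal sketch using the raw syscall (the helper name is illustrative, error handling omitted):

    #include <unistd.h>
    #include <sys/eventfd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    /* register efd on an already set up ring; async selects
     * IORING_REGISTER_EVENTFD_ASYNC, i.e. only signal for requests
     * that complete asynchronously */
    static int ring_register_eventfd(int ring_fd, int efd, int async)
    {
        unsigned int op = async ? IORING_REGISTER_EVENTFD_ASYNC
                                : IORING_REGISTER_EVENTFD;

        /* arg points to one __s32 and nr_args must be 1 */
        return syscall(__NR_io_uring_register, ring_fd, op, &efd, 1);
    }
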
The place where eventfd_async is read is already under rcu_read_lock so there is no extra RCU read-side critical section needed. Signed-off-by: Usama Arif Link: https://lore.kernel.org/r/20220204145117.1186568-4-usama.arif@bytedance.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6c35e88be4ea..961866b24b4f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -328,6 +328,7 @@ struct io_submit_state { struct io_ev_fd { struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; struct rcu_head rcu; }; @@ -340,7 +341,6 @@ struct io_ring_ctx { unsigned int flags; unsigned int compat: 1; unsigned int drain_next: 1; - unsigned int eventfd_async: 1; unsigned int restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; @@ -1756,7 +1756,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) goto out; - if (!ctx->eventfd_async || io_wq_current_is_worker()) + if (!ev_fd->eventfd_async || io_wq_current_is_worker()) eventfd_signal(ev_fd->cq_ev_fd, 1); out: @@ -9385,7 +9385,8 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, return done ? done : err; } -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) { struct io_ev_fd *ev_fd; __s32 __user *fds = arg; @@ -9409,6 +9410,7 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) kfree(ev_fd); return ret; } + ev_fd->eventfd_async = eventfd_async; rcu_assign_pointer(ctx->io_ev_fd, ev_fd); return ret; @@ -11014,6 +11016,7 @@ static bool io_register_op_must_quiesce(int op) case IORING_UNREGISTER_FILES: case IORING_REGISTER_FILES_UPDATE: case IORING_REGISTER_EVENTFD: + case IORING_REGISTER_EVENTFD_ASYNC: case IORING_UNREGISTER_EVENTFD: case IORING_REGISTER_PROBE: case IORING_REGISTER_PERSONALITY: @@ -11114,17 +11117,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_register_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_eventfd_register(ctx, arg, 0); + break; case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; - ret = io_eventfd_register(ctx, arg); - if (ret) - break; - if (opcode == IORING_REGISTER_EVENTFD_ASYNC) - ctx->eventfd_async = 1; - else - ctx->eventfd_async = 0; + ret = io_eventfd_register(ctx, arg, 1); break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; From ff16cfcfdaafa3c4287be92c6944fb8ea6dc75d1 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 4 Feb 2022 14:51:16 +0000 Subject: [PATCH 04/42] io_uring: avoid ring quiesce while registering restrictions and enabling rings IORING_SETUP_R_DISABLED prevents submitting requests and so there will be no requests until IORING_REGISTER_ENABLE_RINGS is called. And IORING_REGISTER_RESTRICTIONS works only before IORING_REGISTER_ENABLE_RINGS is called. Hence ring quiesce is not needed for these opcodes. 
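For reference, the userspace sequence these two opcodes are used in is roughly the following (raw syscalls, error handling omitted, entry count arbitrary):

    struct io_uring_params p = { .flags = IORING_SETUP_R_DISABLED };
    int ring_fd = syscall(__NR_io_uring_setup, 8, &p);

    /* only allow READV/WRITEV on this ring; restrictions can only be
     * installed while the ring is still disabled */
    struct io_uring_restriction res[] = {
        { .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READV },
        { .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITEV },
    };
    syscall(__NR_io_uring_register, ring_fd,
            IORING_REGISTER_RESTRICTIONS, res, 2);

    /* from here on submissions are accepted */
    syscall(__NR_io_uring_register, ring_fd,
            IORING_REGISTER_ENABLE_RINGS, NULL, 0);
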
Signed-off-by: Usama Arif Link: https://lore.kernel.org/r/20220204145117.1186568-5-usama.arif@bytedance.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 961866b24b4f..6ff9f67e2a1e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -11021,6 +11021,8 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_PROBE: case IORING_REGISTER_PERSONALITY: case IORING_UNREGISTER_PERSONALITY: + case IORING_REGISTER_ENABLE_RINGS: + case IORING_REGISTER_RESTRICTIONS: case IORING_REGISTER_FILES2: case IORING_REGISTER_FILES_UPDATE2: case IORING_REGISTER_BUFFERS2: From 8bb649ee1da321ec3a1bbec048a425c5ea18ea21 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 4 Feb 2022 14:51:17 +0000 Subject: [PATCH 05/42] io_uring: remove ring quiesce for io_uring_register None of the opcodes in io_uring_register use ring quiesce anymore. Hence io_register_op_must_quiesce always returns false and io_ctx_quiesce is never called. Signed-off-by: Usama Arif Link: https://lore.kernel.org/r/20220204145117.1186568-6-usama.arif@bytedance.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 83 --------------------------------------------------- 1 file changed, 83 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6ff9f67e2a1e..49bcc5fc688d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1292,18 +1292,6 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req) return __io_put_kbuf(req); } -static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) -{ - bool got = percpu_ref_tryget(ref); - - /* already at zero, wait for ->release() */ - if (!got) - wait_for_completion(compl); - percpu_ref_resurrect(ref); - if (got) - percpu_ref_put(ref); -} - static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) __must_hold(&req->ctx->timeout_lock) @@ -11007,66 +10995,6 @@ err: return ret; } -static bool io_register_op_must_quiesce(int op) -{ - switch (op) { - case IORING_REGISTER_BUFFERS: - case IORING_UNREGISTER_BUFFERS: - case IORING_REGISTER_FILES: - case IORING_UNREGISTER_FILES: - case IORING_REGISTER_FILES_UPDATE: - case IORING_REGISTER_EVENTFD: - case IORING_REGISTER_EVENTFD_ASYNC: - case IORING_UNREGISTER_EVENTFD: - case IORING_REGISTER_PROBE: - case IORING_REGISTER_PERSONALITY: - case IORING_UNREGISTER_PERSONALITY: - case IORING_REGISTER_ENABLE_RINGS: - case IORING_REGISTER_RESTRICTIONS: - case IORING_REGISTER_FILES2: - case IORING_REGISTER_FILES_UPDATE2: - case IORING_REGISTER_BUFFERS2: - case IORING_REGISTER_BUFFERS_UPDATE: - case IORING_REGISTER_IOWQ_AFF: - case IORING_UNREGISTER_IOWQ_AFF: - case IORING_REGISTER_IOWQ_MAX_WORKERS: - return false; - default: - return true; - } -} - -static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx) -{ - long ret; - - percpu_ref_kill(&ctx->refs); - - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab the - * uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. 
- */ - mutex_unlock(&ctx->uring_lock); - do { - ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ); - if (ret) { - ret = min(0L, ret); - break; - } - - ret = io_run_task_work_sig(); - io_req_caches_free(ctx); - } while (ret >= 0); - mutex_lock(&ctx->uring_lock); - - if (ret) - io_refs_resurrect(&ctx->refs, &ctx->ref_comp); - return ret; -} - static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -11090,12 +11018,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return -EACCES; } - if (io_register_op_must_quiesce(opcode)) { - ret = io_ctx_quiesce(ctx); - if (ret) - return ret; - } - switch (opcode) { case IORING_REGISTER_BUFFERS: ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); @@ -11200,11 +11122,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; } - if (io_register_op_must_quiesce(opcode)) { - /* bring the ctx back to life */ - percpu_ref_reinit(&ctx->refs); - reinit_completion(&ctx->ref_comp); - } return ret; } From f0a4e62bb5343b3b7163bc851cfb4bebfefcc4e7 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 7 Feb 2022 09:24:11 -0700 Subject: [PATCH 06/42] io_uring: Fix use of uninitialized ret in io_eventfd_register() Clang warns: fs/io_uring.c:9396:9: warning: variable 'ret' is uninitialized when used here [-Wuninitialized] return ret; ^~~ fs/io_uring.c:9373:13: note: initialize the variable 'ret' to silence this warning int fd, ret; ^ = 0 1 warning generated. Just return 0 directly and reduce the scope of ret to the if statement, as that is the only place that it is used, which is how the function was before the fixes commit. Fixes: 1a75fac9a0f9 ("io_uring: avoid ring quiesce while registering/unregistering eventfd") Link: https://github.com/ClangBuiltLinux/linux/issues/1579 Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20220207162410.1013466-1-nathan@kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 49bcc5fc688d..1dce3f6e7031 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9378,7 +9378,7 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, { struct io_ev_fd *ev_fd; __s32 __user *fds = arg; - int fd, ret; + int fd; ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); @@ -9394,14 +9394,14 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); if (IS_ERR(ev_fd->cq_ev_fd)) { - ret = PTR_ERR(ev_fd->cq_ev_fd); + int ret = PTR_ERR(ev_fd->cq_ev_fd); kfree(ev_fd); return ret; } ev_fd->eventfd_async = eventfd_async; rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - return ret; + return 0; } static void io_eventfd_put(struct rcu_head *rcu) From 42abc95f05bff5180ac40c7ba5726b73c1d5e2f4 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sun, 6 Feb 2022 17:52:39 +0800 Subject: [PATCH 07/42] io-wq: decouple work_list protection from the big wqe->lock wqe->lock is abused, it now protects acct->work_list, hash stuff, nr_workers, wqe->free_list and so on. Lets first get the work_list out of the wqe-lock mess by introduce a specific lock for work list. This is the first step to solve the huge contension between work insertion and work consumption. 
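Concretely, each io_wqe_acct now carries its own raw spinlock next to the list it protects, and the enqueue path only takes wqe->lock for waking or creating a worker (condensed from io_wqe_enqueue() in the diff below):

    struct io_wqe_acct {
        unsigned nr_workers;
        unsigned max_workers;
        int index;
        atomic_t nr_running;
        raw_spinlock_t lock;            /* new: protects work_list */
        struct io_wq_work_list work_list;
        unsigned long flags;
    };

    /* io_wqe_enqueue(): insertion no longer contends on wqe->lock */
    raw_spin_lock(&acct->lock);
    io_wqe_insert_work(wqe, work);
    clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
    raw_spin_unlock(&acct->lock);

    raw_spin_lock(&wqe->lock);
    rcu_read_lock();
    do_create = !io_wqe_activate_free_worker(wqe, acct);
    rcu_read_unlock();
    raw_spin_unlock(&wqe->lock);
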
good thing: - split locking for bound and unbound work list - reduce contension between work_list visit and (worker's)free_list. For the hash stuff, since there won't be a work with same file in both bound and unbound work list, thus they won't visit same hash entry. it works well to use the new lock to protect hash stuff. Results: set max_unbound_worker = 4, test with echo-server: nice -n -15 ./io_uring_echo_server -p 8081 -f -n 1000 -l 16 (-n connection, -l workload) before this patch: Samples: 2M of event 'cycles:ppp', Event count (approx.): 1239982111074 Overhead Command Shared Object Symbol 28.59% iou-wrk-10021 [kernel.vmlinux] [k] native_queued_spin_lock_slowpath 8.89% io_uring_echo_s [kernel.vmlinux] [k] native_queued_spin_lock_slowpath 6.20% iou-wrk-10021 [kernel.vmlinux] [k] _raw_spin_lock 2.45% io_uring_echo_s [kernel.vmlinux] [k] io_prep_async_work 2.36% iou-wrk-10021 [kernel.vmlinux] [k] _raw_spin_lock_irqsave 2.29% iou-wrk-10021 [kernel.vmlinux] [k] io_worker_handle_work 1.29% io_uring_echo_s [kernel.vmlinux] [k] io_wqe_enqueue 1.06% iou-wrk-10021 [kernel.vmlinux] [k] io_wqe_worker 1.06% io_uring_echo_s [kernel.vmlinux] [k] _raw_spin_lock 1.03% iou-wrk-10021 [kernel.vmlinux] [k] __schedule 0.99% iou-wrk-10021 [kernel.vmlinux] [k] tcp_sendmsg_locked with this patch: Samples: 1M of event 'cycles:ppp', Event count (approx.): 708446691943 Overhead Command Shared Object Symbol 16.86% iou-wrk-10893 [kernel.vmlinux] [k] native_queued_spin_lock_slowpat 9.10% iou-wrk-10893 [kernel.vmlinux] [k] _raw_spin_lock 4.53% io_uring_echo_s [kernel.vmlinux] [k] native_queued_spin_lock_slowpat 2.87% iou-wrk-10893 [kernel.vmlinux] [k] io_worker_handle_work 2.57% iou-wrk-10893 [kernel.vmlinux] [k] _raw_spin_lock_irqsave 2.56% io_uring_echo_s [kernel.vmlinux] [k] io_prep_async_work 1.82% io_uring_echo_s [kernel.vmlinux] [k] _raw_spin_lock 1.33% iou-wrk-10893 [kernel.vmlinux] [k] io_wqe_worker 1.26% io_uring_echo_s [kernel.vmlinux] [k] try_to_wake_up spin_lock failure from 25.59% + 8.89% = 34.48% to 16.86% + 4.53% = 21.39% TPS is similar, while cpu usage is from almost 400% to 350% Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220206095241.121485-2-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 96 +++++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 44 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index bb7f161bb19c..9595616ccaa3 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -76,6 +76,7 @@ struct io_wqe_acct { unsigned max_workers; int index; atomic_t nr_running; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long flags; }; @@ -224,12 +225,12 @@ static void io_worker_exit(struct io_worker *worker) if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - preempt_disable(); + raw_spin_unlock(&wqe->lock); io_wqe_dec_running(worker); worker->flags = 0; + preempt_disable(); current->flags &= ~PF_IO_WORKER; preempt_enable(); - raw_spin_unlock(&wqe->lock); kfree_rcu(worker, rcu); io_worker_ref_put(wqe->wq); @@ -385,7 +386,6 @@ fail: } static void io_wqe_dec_running(struct io_worker *worker) - __must_hold(wqe->lock) { struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; @@ -393,13 +393,18 @@ static void io_wqe_dec_running(struct io_worker *worker) if (!(worker->flags & IO_WORKER_F_UP)) return; - if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) { - atomic_inc(&acct->nr_running); - 
atomic_inc(&wqe->wq->worker_refs); + if (!atomic_dec_and_test(&acct->nr_running)) + return; + raw_spin_lock(&wqe->lock); + if (!io_acct_run_queue(acct)) { raw_spin_unlock(&wqe->lock); - io_queue_worker_create(worker, acct, create_worker_cb); - raw_spin_lock(&wqe->lock); + return; } + + raw_spin_unlock(&wqe->lock); + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); + io_queue_worker_create(worker, acct, create_worker_cb); } /* @@ -407,11 +412,12 @@ static void io_wqe_dec_running(struct io_worker *worker) * it's currently on the freelist */ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) - __must_hold(wqe->lock) { if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; + raw_spin_lock(&wqe->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); + raw_spin_unlock(&wqe->lock); } } @@ -456,7 +462,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, struct io_worker *worker) - __must_hold(wqe->lock) + __must_hold(acct->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; @@ -498,9 +504,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, * work being added and clearing the stalled bit. */ set_bit(IO_ACCT_STALLED_BIT, &acct->flags); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&acct->lock); unstalled = io_wait_on_hash(wqe, stall_hash); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&acct->lock); if (unstalled) { clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); if (wq_has_sleeper(&wqe->wq->hash->wait)) @@ -538,7 +544,7 @@ static void io_assign_current_work(struct io_worker *worker, static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); static void io_worker_handle_work(struct io_worker *worker) - __releases(wqe->lock) + __releases(acct->lock) { struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; @@ -556,6 +562,7 @@ static void io_worker_handle_work(struct io_worker *worker) * clear the stalled flag. 
*/ work = io_get_next_work(acct, worker); + raw_spin_unlock(&acct->lock); if (work) { __io_worker_busy(wqe, worker); @@ -569,10 +576,9 @@ static void io_worker_handle_work(struct io_worker *worker) raw_spin_lock(&worker->lock); worker->next_work = work; raw_spin_unlock(&worker->lock); - } - raw_spin_unlock(&wqe->lock); - if (!work) + } else { break; + } io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -609,7 +615,7 @@ static void io_worker_handle_work(struct io_worker *worker) } } while (work); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&acct->lock); } while (1); } @@ -634,11 +640,14 @@ static int io_wqe_worker(void *data) set_current_state(TASK_INTERRUPTIBLE); loop: - raw_spin_lock(&wqe->lock); + raw_spin_lock(&acct->lock); if (io_acct_run_queue(acct)) { io_worker_handle_work(worker); goto loop; + } else { + raw_spin_unlock(&acct->lock); } + raw_spin_lock(&wqe->lock); /* timed out, exit unless we're the last worker */ if (last_timeout && acct->nr_workers > 1) { acct->nr_workers--; @@ -663,7 +672,7 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - raw_spin_lock(&wqe->lock); + raw_spin_lock(&acct->lock); io_worker_handle_work(worker); } @@ -705,10 +714,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk) return; worker->flags &= ~IO_WORKER_F_RUNNING; - - raw_spin_lock(&worker->wqe->lock); io_wqe_dec_running(worker); - raw_spin_unlock(&worker->wqe->lock); } static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, @@ -778,10 +784,12 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; + raw_spin_unlock(&wqe->lock); while (io_acct_cancel_pending_work(wqe, acct, &match)) - raw_spin_lock(&wqe->lock); + ; + } else { + raw_spin_unlock(&wqe->lock); } - raw_spin_unlock(&wqe->lock); io_worker_ref_put(wqe->wq); kfree(worker); return; @@ -914,6 +922,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_cb_cancel_data match; unsigned work_flags = work->flags; bool do_create; @@ -927,10 +936,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } - raw_spin_lock(&wqe->lock); + raw_spin_lock(&acct->lock); io_wqe_insert_work(wqe, work); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); + raw_spin_unlock(&acct->lock); + raw_spin_lock(&wqe->lock); rcu_read_lock(); do_create = !io_wqe_activate_free_worker(wqe, acct); rcu_read_unlock(); @@ -946,18 +957,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; raw_spin_lock(&wqe->lock); - /* fatal condition, failed to create the first worker */ - if (!acct->nr_workers) { - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_item, - .data = work, - .cancel_all = false, - }; - - if (io_acct_cancel_pending_work(wqe, acct, &match)) - raw_spin_lock(&wqe->lock); + if (acct->nr_workers) { + raw_spin_unlock(&wqe->lock); + return; } raw_spin_unlock(&wqe->lock); + + /* fatal condition, failed to create the first worker */ + match.fn = io_wq_work_match_item, + match.data = work, + match.cancel_all = false, + + io_acct_cancel_pending_work(wqe, acct, &match); } } @@ -1032,22 +1043,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe, static bool io_acct_cancel_pending_work(struct io_wqe *wqe, struct io_wqe_acct *acct, struct io_cb_cancel_data *match) - __releases(wqe->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; + 
raw_spin_lock(&acct->lock); wq_list_for_each(node, prev, &acct->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&acct->lock); io_run_cancel(work, wqe); match->nr_pending++; /* not safe to continue after unlock */ return true; } + raw_spin_unlock(&acct->lock); return false; } @@ -1061,7 +1073,6 @@ retry: struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); if (io_acct_cancel_pending_work(wqe, acct, match)) { - raw_spin_lock(&wqe->lock); if (match->cancel_all) goto retry; break; @@ -1103,13 +1114,11 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) { - raw_spin_unlock(&wqe->lock); + if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; - } + raw_spin_lock(&wqe->lock); io_wqe_cancel_running_work(wqe, &match); raw_spin_unlock(&wqe->lock); if (match.nr_running && !match.cancel_all) @@ -1190,6 +1199,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) acct->index = i; atomic_set(&acct->nr_running, 0); INIT_WQ_LIST(&acct->work_list); + raw_spin_lock_init(&acct->lock); } wqe->wq = wq; raw_spin_lock_init(&wqe->lock); @@ -1282,9 +1292,7 @@ static void io_wq_destroy(struct io_wq *wq) .fn = io_wq_work_match_all, .cancel_all = true, }; - raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); - raw_spin_unlock(&wqe->lock); free_cpumask_var(wqe->cpu_mask); kfree(wqe); } From e13fb1fe1483f6cd6452f25b866ffadf5ee0eff6 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sun, 6 Feb 2022 17:52:40 +0800 Subject: [PATCH 08/42] io-wq: reduce acct->lock crossing functions lock/unlock reduce acct->lock lock and unlock in different functions to make the code clearer. 
Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220206095241.121485-3-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 9595616ccaa3..f7b7fa396faf 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -239,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker) static inline bool io_acct_run_queue(struct io_wqe_acct *acct) { + bool ret = false; + + raw_spin_lock(&acct->lock); if (!wq_list_empty(&acct->work_list) && !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - return true; - return false; + ret = true; + raw_spin_unlock(&acct->lock); + + return ret; } /* @@ -395,13 +400,9 @@ static void io_wqe_dec_running(struct io_worker *worker) if (!atomic_dec_and_test(&acct->nr_running)) return; - raw_spin_lock(&wqe->lock); - if (!io_acct_run_queue(acct)) { - raw_spin_unlock(&wqe->lock); + if (!io_acct_run_queue(acct)) return; - } - raw_spin_unlock(&wqe->lock); atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); io_queue_worker_create(worker, acct, create_worker_cb); @@ -544,7 +545,6 @@ static void io_assign_current_work(struct io_worker *worker, static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); static void io_worker_handle_work(struct io_worker *worker) - __releases(acct->lock) { struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; @@ -561,6 +561,7 @@ static void io_worker_handle_work(struct io_worker *worker) * can't make progress, any work completion or insertion will * clear the stalled flag. */ + raw_spin_lock(&acct->lock); work = io_get_next_work(acct, worker); raw_spin_unlock(&acct->lock); if (work) { @@ -614,8 +615,6 @@ static void io_worker_handle_work(struct io_worker *worker) wake_up(&wq->hash->wait); } } while (work); - - raw_spin_lock(&acct->lock); } while (1); } @@ -639,14 +638,9 @@ static int io_wqe_worker(void *data) long ret; set_current_state(TASK_INTERRUPTIBLE); -loop: - raw_spin_lock(&acct->lock); - if (io_acct_run_queue(acct)) { + while (io_acct_run_queue(acct)) io_worker_handle_work(worker); - goto loop; - } else { - raw_spin_unlock(&acct->lock); - } + raw_spin_lock(&wqe->lock); /* timed out, exit unless we're the last worker */ if (last_timeout && acct->nr_workers > 1) { @@ -671,10 +665,8 @@ loop: last_timeout = !ret; } - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - raw_spin_lock(&acct->lock); + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) io_worker_handle_work(worker); - } audit_free(current); io_worker_exit(worker); From 86127bb18aea7e553cfd0842bcd33a6dc80bfbc8 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sun, 6 Feb 2022 17:52:41 +0800 Subject: [PATCH 09/42] io-wq: use IO_WQ_ACCT_NR rather than hardcoded number It's better to use the defined enum stuff not the hardcoded number to define array. 
Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220206095241.121485-4-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index f7b7fa396faf..5b93fa67d346 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -92,7 +92,7 @@ enum { */ struct io_wqe { raw_spinlock_t lock; - struct io_wqe_acct acct[2]; + struct io_wqe_acct acct[IO_WQ_ACCT_NR]; int node; @@ -1376,7 +1376,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2); - for (i = 0; i < 2; i++) { + for (i = 0; i < IO_WQ_ACCT_NR; i++) { if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) new_count[i] = task_rlimit(current, RLIMIT_NPROC); } From d5ec1dfaf59bf1632d7f2114d209bf80bfbd907a Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 14 Feb 2022 10:04:29 -0800 Subject: [PATCH 10/42] io-uring: add __fill_cqe function This introduces the __fill_cqe function. This is necessary to correctly issue the io_uring_complete tracepoint. Signed-off-by: Stefan Roesch Link: https://lore.kernel.org/r/20220214180430.70572-2-shr@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1dce3f6e7031..b09c40353415 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1914,13 +1914,11 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, +static inline bool __fill_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { struct io_uring_cqe *cqe; - trace_io_uring_complete(ctx, user_data, res, cflags); - /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in @@ -1936,17 +1934,24 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, return io_cqring_event_overflow(ctx, user_data, res, cflags); } +static inline bool __io_fill_cqe(struct io_kiocb *req, s32 res, u32 cflags) +{ + trace_io_uring_complete(req->ctx, req->user_data, res, cflags); + return __fill_cqe(req->ctx, req->user_data, res, cflags); +} + static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(req->ctx, req->user_data, res, cflags); + __io_fill_cqe(req, res, cflags); } static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { ctx->cq_extra++; - return __io_fill_cqe(ctx, user_data, res, cflags); + trace_io_uring_complete(ctx, user_data, res, cflags); + return __fill_cqe(ctx, user_data, res, cflags); } static void __io_req_complete_post(struct io_kiocb *req, s32 res, @@ -1955,7 +1960,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(ctx, req->user_data, res, cflags); + __io_fill_cqe(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. 
@@ -2544,8 +2549,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) comp_list); if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(ctx, req->user_data, req->result, - req->cflags); + __io_fill_cqe(req, req->result, req->cflags); } io_commit_cqring(ctx); @@ -2667,7 +2671,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req)); + __io_fill_cqe(req, req->result, io_put_kbuf(req)); nr_events++; } From 502c87d65564cbfd65b1621309bcd900390eca81 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 14 Feb 2022 10:04:30 -0800 Subject: [PATCH 11/42] io-uring: Make tracepoints consistent. This makes the io-uring tracepoints consistent. Where it makes sense the tracepoints start with the following four fields: - context (ring) - request - user_data - opcode. Signed-off-by: Stefan Roesch Link: https://lore.kernel.org/r/20220214180430.70572-3-shr@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 +-- include/trace/events/io_uring.h | 318 +++++++++++++++----------------- 2 files changed, 166 insertions(+), 176 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b09c40353415..5ec24b0438f1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1603,8 +1603,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, - &req->work, req->flags); + trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, + &req->work, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1936,7 +1936,7 @@ static inline bool __fill_cqe(struct io_ring_ctx *ctx, u64 user_data, static inline bool __io_fill_cqe(struct io_kiocb *req, s32 res, u32 cflags) { - trace_io_uring_complete(req->ctx, req->user_data, res, cflags); + trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); return __fill_cqe(req->ctx, req->user_data, res, cflags); } @@ -1950,7 +1950,7 @@ static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { ctx->cq_extra++; - trace_io_uring_complete(ctx, user_data, res, cflags); + trace_io_uring_complete(ctx, NULL, user_data, res, cflags); return __fill_cqe(ctx, user_data, res, cflags); } @@ -2202,7 +2202,9 @@ static void io_fail_links(struct io_kiocb *req) nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req, link); + trace_io_uring_fail_link(req->ctx, req, req->user_data, + req->opcode, link); + if (!ignore_cqes) { link->flags &= ~REQ_F_CQE_SKIP; io_fill_cqe_req(link, res, 0); @@ -5629,7 +5631,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask) else req->io_task_work.func = io_apoll_task_func; - trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); io_req_task_work_add(req, false); } @@ -5858,7 +5860,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) if (ret || ipt.error) return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; - trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, + trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode, mask, apoll->poll.events); return IO_APOLL_OK; } @@ -6667,7 +6669,7 @@ fail: goto queue; } - trace_io_uring_defer(ctx, req, req->user_data); + trace_io_uring_defer(ctx, req, req->user_data, req->opcode); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); @@ -7001,7 +7003,7 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx, { struct file *file = fget(fd); - trace_io_uring_file_get(ctx, fd); + trace_io_uring_file_get(ctx, req, req->user_data, fd); /* we don't allow fixed io_uring files */ if (file && unlikely(file->f_op == &io_uring_fops)) @@ -7299,7 +7301,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { - trace_io_uring_req_failed(sqe, ret); + trace_io_uring_req_failed(sqe, ctx, req, ret); /* fail even hard links since we don't submit */ if (link->head) { @@ -7326,7 +7328,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, } /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, + trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode, req->flags, true, ctx->flags & IORING_SETUP_SQPOLL); diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 098beda7601a..18d4341c581c 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -29,15 +29,15 @@ TRACE_EVENT(io_uring_create, TP_ARGS(fd, ctx, sq_entries, cq_entries, flags), TP_STRUCT__entry ( - __field( int, fd ) - __field( void *, ctx ) + __field( int, fd ) + __field( void *, ctx ) __field( u32, sq_entries ) __field( u32, cq_entries ) __field( u32, flags ) ), TP_fast_assign( - __entry->fd = fd; + __entry->fd = fd; __entry->ctx = ctx; __entry->sq_entries = sq_entries; __entry->cq_entries = cq_entries; @@ -72,11 +72,11 @@ TRACE_EVENT(io_uring_register, TP_ARGS(ctx, opcode, nr_files, nr_bufs, ret), TP_STRUCT__entry ( - __field( void *, ctx ) - __field( unsigned, opcode ) - __field( unsigned, nr_files ) - __field( unsigned, nr_bufs ) - __field( long, ret ) + __field( void *, ctx ) + __field( unsigned, opcode ) + __field( unsigned, nr_files) + __field( unsigned, nr_bufs ) + __field( long, ret ) ), TP_fast_assign( @@ -97,6 +97,8 @@ TRACE_EVENT(io_uring_register, * io_uring_file_get - called before getting references to an SQE file * * @ctx: pointer to a ring context structure + * @req: pointer to a submitted request + * @user_data: user data associated with the request * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can @@ -105,59 +107,71 @@ TRACE_EVENT(io_uring_register, */ TRACE_EVENT(io_uring_file_get, - TP_PROTO(void *ctx, int fd), + TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd), - TP_ARGS(ctx, fd), + TP_ARGS(ctx, req, user_data, fd), TP_STRUCT__entry ( - __field( void *, ctx ) - __field( int, fd ) + __field( void *, ctx ) + __field( void *, req ) + __field( u64, user_data ) + __field( int, fd ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = ctx; + __entry->req = req; + __entry->user_data = user_data; __entry->fd = fd; ), - TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd) + TP_printk("ring %p, req %p, user_data %llu, fd %d", + __entry->ctx, __entry->req, __entry->user_data, __entry->fd) ); /** * io_uring_queue_async_work - called before 
submitting a new async work * * @ctx: pointer to a ring context structure - * @hashed: type of workqueue, hashed or normal * @req: pointer to a submitted request + * @user_data: user data associated with the request + * @opcode: opcode of request + * @flags request flags * @work: pointer to a submitted io_wq_work + * @rw: type of workqueue, hashed or normal * * Allows to trace asynchronous work submission. */ TRACE_EVENT(io_uring_queue_async_work, - TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work, - unsigned int flags), + TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode, + unsigned int flags, struct io_wq_work *work, int rw), - TP_ARGS(ctx, rw, req, work, flags), + TP_ARGS(ctx, req, user_data, flags, opcode, work, rw), TP_STRUCT__entry ( - __field( void *, ctx ) - __field( int, rw ) - __field( void *, req ) - __field( struct io_wq_work *, work ) - __field( unsigned int, flags ) + __field( void *, ctx ) + __field( void *, req ) + __field( u64, user_data ) + __field( u8, opcode ) + __field( unsigned int, flags ) + __field( struct io_wq_work *, work ) + __field( int, rw ) ), TP_fast_assign( - __entry->ctx = ctx; - __entry->rw = rw; - __entry->req = req; - __entry->work = work; - __entry->flags = flags; + __entry->ctx = ctx; + __entry->req = req; + __entry->user_data = user_data; + __entry->flags = flags; + __entry->opcode = opcode; + __entry->work = work; + __entry->rw = rw; ), - TP_printk("ring %p, request %p, flags %d, %s queue, work %p", - __entry->ctx, __entry->req, __entry->flags, - __entry->rw ? "hashed" : "normal", __entry->work) + TP_printk("ring %p, request %p, user_data %llu, opcode %d, flags %d, %s queue, work %p", + __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, + __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); /** @@ -166,30 +180,33 @@ TRACE_EVENT(io_uring_queue_async_work, * @ctx: pointer to a ring context structure * @req: pointer to a deferred request * @user_data: user data associated with the request + * @opcode: opcode of request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. 
*/ TRACE_EVENT(io_uring_defer, - TP_PROTO(void *ctx, void *req, unsigned long long user_data), + TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode), - TP_ARGS(ctx, req, user_data), + TP_ARGS(ctx, req, user_data, opcode), TP_STRUCT__entry ( - __field( void *, ctx ) - __field( void *, req ) - __field( unsigned long long, data ) + __field( void *, ctx ) + __field( void *, req ) + __field( unsigned long long, data ) + __field( u8, opcode ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; __entry->data = user_data; + __entry->opcode = opcode; ), - TP_printk("ring %p, request %p user_data %llu", __entry->ctx, - __entry->req, __entry->data) + TP_printk("ring %p, request %p, user_data %llu, opcode %d", + __entry->ctx, __entry->req, __entry->data, __entry->opcode) ); /** @@ -247,7 +264,7 @@ TRACE_EVENT(io_uring_cqring_wait, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = ctx; __entry->min_events = min_events; ), @@ -257,7 +274,10 @@ TRACE_EVENT(io_uring_cqring_wait, /** * io_uring_fail_link - called before failing a linked request * + * @ctx: pointer to a ring context structure * @req: request, which links were cancelled + * @user_data: user data associated with the request + * @opcode: opcode of request * @link: cancelled link * * Allows to track linked requests cancellation, to see not only that some work @@ -265,27 +285,36 @@ TRACE_EVENT(io_uring_cqring_wait, */ TRACE_EVENT(io_uring_fail_link, - TP_PROTO(void *req, void *link), + TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link), - TP_ARGS(req, link), + TP_ARGS(ctx, req, user_data, opcode, link), TP_STRUCT__entry ( - __field( void *, req ) - __field( void *, link ) + __field( void *, ctx ) + __field( void *, req ) + __field( unsigned long long, user_data ) + __field( u8, opcode ) + __field( void *, link ) ), TP_fast_assign( - __entry->req = req; - __entry->link = link; + __entry->ctx = ctx; + __entry->req = req; + __entry->user_data = user_data; + __entry->opcode = opcode; + __entry->link = link; ), - TP_printk("request %p, link %p", __entry->req, __entry->link) + TP_printk("ring %p, request %p, user_data %llu, opcode %d, link %p", + __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, + __entry->link) ); /** * io_uring_complete - called when completing an SQE * * @ctx: pointer to a ring context structure + * @req: pointer to a submitted request * @user_data: user data associated with the request * @res: result of the request * @cflags: completion flags @@ -293,12 +322,13 @@ TRACE_EVENT(io_uring_fail_link, */ TRACE_EVENT(io_uring_complete, - TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags), + TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags), - TP_ARGS(ctx, user_data, res, cflags), + TP_ARGS(ctx, req, user_data, res, cflags), TP_STRUCT__entry ( __field( void *, ctx ) + __field( void *, req ) __field( u64, user_data ) __field( int, res ) __field( unsigned, cflags ) @@ -306,14 +336,16 @@ TRACE_EVENT(io_uring_complete, TP_fast_assign( __entry->ctx = ctx; + __entry->req = req; __entry->user_data = user_data; __entry->res = res; __entry->cflags = cflags; ), - TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x", - __entry->ctx, (unsigned long long)__entry->user_data, - __entry->res, __entry->cflags) + TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags %x", + __entry->ctx, __entry->req, + (unsigned long long)__entry->user_data, + __entry->res, __entry->cflags) ); /** @@ -321,8 +353,8 @@ 
TRACE_EVENT(io_uring_complete, * * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @opcode: opcode of request * @user_data: user data associated with the request + * @opcode: opcode of request * @flags request flags * @force_nonblock: whether a context blocking or not * @sq_thread: true if sq_thread has submitted this SQE @@ -332,34 +364,34 @@ TRACE_EVENT(io_uring_complete, */ TRACE_EVENT(io_uring_submit_sqe, - TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, u32 flags, + TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags, bool force_nonblock, bool sq_thread), - TP_ARGS(ctx, req, opcode, user_data, flags, force_nonblock, sq_thread), + TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread), TP_STRUCT__entry ( - __field( void *, ctx ) - __field( void *, req ) - __field( u8, opcode ) - __field( u64, user_data ) - __field( u32, flags ) - __field( bool, force_nonblock ) - __field( bool, sq_thread ) + __field( void *, ctx ) + __field( void *, req ) + __field( unsigned long long, user_data ) + __field( u8, opcode ) + __field( u32, flags ) + __field( bool, force_nonblock ) + __field( bool, sq_thread ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; - __entry->opcode = opcode; __entry->user_data = user_data; + __entry->opcode = opcode; __entry->flags = flags; __entry->force_nonblock = force_nonblock; __entry->sq_thread = sq_thread; ), - TP_printk("ring %p, req %p, op %d, data 0x%llx, flags %u, " + TP_printk("ring %p, req %p, user_data %llu, opcode %d, flags %u, " "non block %d, sq_thread %d", __entry->ctx, __entry->req, - __entry->opcode, (unsigned long long)__entry->user_data, + __entry->user_data, __entry->opcode, __entry->flags, __entry->force_nonblock, __entry->sq_thread) ); @@ -368,8 +400,8 @@ TRACE_EVENT(io_uring_submit_sqe, * * @ctx: pointer to a ring context structure * @req: pointer to the armed request - * @opcode: opcode of request * @user_data: user data associated with the request + * @opcode: opcode of request * @mask: request poll events mask * @events: registered events of interest * @@ -378,155 +410,110 @@ TRACE_EVENT(io_uring_submit_sqe, */ TRACE_EVENT(io_uring_poll_arm, - TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, + TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode, int mask, int events), - TP_ARGS(ctx, req, opcode, user_data, mask, events), + TP_ARGS(ctx, req, user_data, opcode, mask, events), TP_STRUCT__entry ( - __field( void *, ctx ) - __field( void *, req ) - __field( u8, opcode ) - __field( u64, user_data ) - __field( int, mask ) - __field( int, events ) + __field( void *, ctx ) + __field( void *, req ) + __field( unsigned long long, user_data ) + __field( u8, opcode ) + __field( int, mask ) + __field( int, events ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; - __entry->opcode = opcode; __entry->user_data = user_data; + __entry->opcode = opcode; __entry->mask = mask; __entry->events = events; ), - TP_printk("ring %p, req %p, op %d, data 0x%llx, mask 0x%x, events 0x%x", - __entry->ctx, __entry->req, __entry->opcode, - (unsigned long long) __entry->user_data, + TP_printk("ring %p, req %p, user_data %llu, opcode %d, mask 0x%x, events 0x%x", + __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, __entry->mask, __entry->events) ); -TRACE_EVENT(io_uring_poll_wake, - - TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), - - TP_ARGS(ctx, opcode, user_data, mask), - - TP_STRUCT__entry ( - __field( void *, ctx ) - __field( u8, 
opcode ) - __field( u64, user_data ) - __field( int, mask ) - ), - - TP_fast_assign( - __entry->ctx = ctx; - __entry->opcode = opcode; - __entry->user_data = user_data; - __entry->mask = mask; - ), - - TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x", - __entry->ctx, __entry->opcode, - (unsigned long long) __entry->user_data, - __entry->mask) -); - -TRACE_EVENT(io_uring_task_add, - - TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), - - TP_ARGS(ctx, opcode, user_data, mask), - - TP_STRUCT__entry ( - __field( void *, ctx ) - __field( u8, opcode ) - __field( u64, user_data ) - __field( int, mask ) - ), - - TP_fast_assign( - __entry->ctx = ctx; - __entry->opcode = opcode; - __entry->user_data = user_data; - __entry->mask = mask; - ), - - TP_printk("ring %p, op %d, data 0x%llx, mask %x", - __entry->ctx, __entry->opcode, - (unsigned long long) __entry->user_data, - __entry->mask) -); - /* - * io_uring_task_run - called when task_work_run() executes the poll events - * notification callbacks + * io_uring_task_add - called after adding a task * * @ctx: pointer to a ring context structure - * @req: pointer to the armed request - * @opcode: opcode of request + * @req: pointer to request * @user_data: user data associated with the request + * @opcode: opcode of request + * @mask: request poll events mask * - * Allows to track when notified poll events are processed */ -TRACE_EVENT(io_uring_task_run, +TRACE_EVENT(io_uring_task_add, - TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data), + TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask), - TP_ARGS(ctx, req, opcode, user_data), + TP_ARGS(ctx, req, user_data, opcode, mask), TP_STRUCT__entry ( - __field( void *, ctx ) - __field( void *, req ) - __field( u8, opcode ) - __field( u64, user_data ) + __field( void *, ctx ) + __field( void *, req ) + __field( unsigned long long, user_data ) + __field( u8, opcode ) + __field( int, mask ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; - __entry->opcode = opcode; __entry->user_data = user_data; + __entry->opcode = opcode; + __entry->mask = mask; ), - TP_printk("ring %p, req %p, op %d, data 0x%llx", - __entry->ctx, __entry->req, __entry->opcode, - (unsigned long long) __entry->user_data) + TP_printk("ring %p, req %p, user_data %llu, opcode %d, mask %x", + __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, + __entry->mask) ); /* * io_uring_req_failed - called when an sqe is errored dring submission * * @sqe: pointer to the io_uring_sqe that failed + * @ctx: pointer to a ring context structure + * @req: pointer to request * @error: error it failed with * * Allows easier diagnosing of malformed requests in production systems. 
*/ TRACE_EVENT(io_uring_req_failed, - TP_PROTO(const struct io_uring_sqe *sqe, int error), + TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error), - TP_ARGS(sqe, error), + TP_ARGS(sqe, ctx, req, error), TP_STRUCT__entry ( - __field( u8, opcode ) - __field( u8, flags ) - __field( u8, ioprio ) - __field( u64, off ) - __field( u64, addr ) - __field( u32, len ) - __field( u32, op_flags ) - __field( u64, user_data ) - __field( u16, buf_index ) - __field( u16, personality ) - __field( u32, file_index ) - __field( u64, pad1 ) - __field( u64, pad2 ) - __field( int, error ) + __field( void *, ctx ) + __field( void *, req ) + __field( unsigned long long, user_data ) + __field( u8, opcode ) + __field( u8, flags ) + __field( u8, ioprio ) + __field( u64, off ) + __field( u64, addr ) + __field( u32, len ) + __field( u32, op_flags ) + __field( u16, buf_index ) + __field( u16, personality ) + __field( u32, file_index ) + __field( u64, pad1 ) + __field( u64, pad2 ) + __field( int, error ) ), TP_fast_assign( + __entry->ctx = ctx; + __entry->req = req; + __entry->user_data = sqe->user_data; __entry->opcode = sqe->opcode; __entry->flags = sqe->flags; __entry->ioprio = sqe->ioprio; @@ -534,7 +521,6 @@ TRACE_EVENT(io_uring_req_failed, __entry->addr = sqe->addr; __entry->len = sqe->len; __entry->op_flags = sqe->rw_flags; - __entry->user_data = sqe->user_data; __entry->buf_index = sqe->buf_index; __entry->personality = sqe->personality; __entry->file_index = sqe->file_index; @@ -543,13 +529,15 @@ TRACE_EVENT(io_uring_req_failed, __entry->error = error; ), - TP_printk("op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, " - "len=%u, rw_flags=0x%x, user_data=0x%llx, buf_index=%d, " + TP_printk("ring %p, req %p, user_data %llu, " + "op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, " + "len=%u, rw_flags=0x%x, buf_index=%d, " "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", + __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, __entry->flags, __entry->ioprio, (unsigned long long)__entry->off, (unsigned long long) __entry->addr, __entry->len, - __entry->op_flags, (unsigned long long) __entry->user_data, + __entry->op_flags, __entry->buf_index, __entry->personality, __entry->file_index, (unsigned long long) __entry->pad1, (unsigned long long) __entry->pad2, __entry->error) From c5020bc8d9295ad4a3e09d858a28b26a25d4afab Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Wed, 16 Feb 2022 14:53:42 -0500 Subject: [PATCH 12/42] io_uring: Remove unneeded test in io_run_task_work_sig() Avoid testing TIF_NOTIFY_SIGNAL twice by calling task_sigpending() directly from io_run_task_work_sig() Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/bd7c0495f7656e803e5736708591bb665e6eaacd.1645041650.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5ec24b0438f1..8d3454a243b4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7704,11 +7704,11 @@ static int io_run_task_work_sig(void) { if (io_run_task_work()) return 1; - if (!signal_pending(current)) - return 0; if (test_thread_flag(TIF_NOTIFY_SIGNAL)) return -ERESTARTSYS; - return -EINTR; + if (task_sigpending(current)) + return -EINTR; + return 0; } /* when returns >0, the caller should retry */ From af9c45ecebaf1b428306f41421f4bcffe439f735 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 22 Feb 2022 02:55:01 -0800 Subject: [PATCH 13/42] io_uring: remove duplicated calls to io_kiocb_ppos 
io_kiocb_ppos is called in both branches, and it seems that the compiler does not fuse this. Fusing removes a few bytes from loop_rw_iter. Before: $ nm -S fs/io_uring.o | grep loop_rw_iter 0000000000002430 0000000000000124 t loop_rw_iter After: $ nm -S fs/io_uring.o | grep loop_rw_iter 0000000000002430 000000000000010d t loop_rw_iter Signed-off-by: Dylan Yudaken Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8d3454a243b4..d9c0fd042f17 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3420,6 +3420,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) struct kiocb *kiocb = &req->rw.kiocb; struct file *file = req->file; ssize_t ret = 0; + loff_t *ppos; /* * Don't support polled IO through this interface, and we can't @@ -3432,6 +3433,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; + ppos = io_kiocb_ppos(kiocb); + while (iov_iter_count(iter)) { struct iovec iovec; ssize_t nr; @@ -3445,10 +3448,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, io_kiocb_ppos(kiocb)); + iovec.iov_len, ppos); } else { nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, io_kiocb_ppos(kiocb)); + iovec.iov_len, ppos); } if (nr < 0) { From d34e1e5b396a0dbaa4a29b7138df662cfb9d8e8e Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 22 Feb 2022 02:55:02 -0800 Subject: [PATCH 14/42] io_uring: update kiocb->ki_pos at execution time Update kiocb->ki_pos at execution time rather than in io_prep_rw(). io_prep_rw() happens before the job is enqueued to a worker and so the offset might be read multiple times before being executed once. Ensures that the file position in a set of _linked_ SQEs will be only obtained after earlier SQEs have completed, and so will include their incremented file position. 
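To make the linked-SQE case concrete, here is a minimal userspace sketch, assuming liburing and a regular file; the helper name and buffer sizes are illustrative only. Both reads pass offset -1, i.e. "use the current file position", and are chained with IOSQE_IO_LINK, so the second read is expected to start where the first one ended:

#include <fcntl.h>
#include <unistd.h>
#include <liburing.h>

/* hypothetical helper: read a file sequentially with two linked SQEs */
static int linked_seq_read(const char *path)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char a[4096], b[4096];
	int fd, i;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	io_uring_queue_init(8, &ring, 0);

	/* offset -1: use (and update) the file position, like read(2) */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, a, sizeof(a), -1);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);

	/* linked read: should start where the first read left off */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, b, sizeof(b), -1);

	io_uring_submit(&ring);
	for (i = 0; i < 2; i++) {
		io_uring_wait_cqe(&ring, &cqe);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	close(fd);
	return 0;
}

Resolving ki_pos only at execution time is what lets the second read in this sketch observe the position advanced by the first, instead of both sampling f_pos at prep time.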
Signed-off-by: Dylan Yudaken Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d9c0fd042f17..d0f3e084da16 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3020,14 +3020,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_pos = READ_ONCE(sqe->off); - if (kiocb->ki_pos == -1) { - if (!(file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = file->f_pos; - } else { - kiocb->ki_pos = 0; - } - } kiocb->ki_flags = iocb_flags(file); ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) @@ -3094,6 +3086,20 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } +static inline void io_kiocb_update_pos(struct io_kiocb *req) +{ + struct kiocb *kiocb = &req->rw.kiocb; + + if (kiocb->ki_pos == -1) { + if (!(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + } else { + kiocb->ki_pos = 0; + } + } +} + static void kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { @@ -3682,6 +3688,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags &= ~IOCB_NOWAIT; } + io_kiocb_update_pos(req); + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); if (unlikely(ret)) { kfree(iovec); @@ -3811,6 +3819,8 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags &= ~IOCB_NOWAIT; } + io_kiocb_update_pos(req); + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); if (unlikely(ret)) goto out_free; From b4aec40015953b65f2f114641e7fd7714c8df8e6 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 22 Feb 2022 02:55:03 -0800 Subject: [PATCH 15/42] io_uring: do not recalculate ppos unnecessarily There is a slight optimisation to be had by calculating the correct pos pointer inside io_kiocb_update_pos and then using that later. It seems code size drops by a bit: 000000000000a1b0 0000000000000400 t io_read 000000000000a5b0 0000000000000319 t io_write vs 000000000000a1b0 00000000000003f6 t io_read 000000000000a5b0 0000000000000310 t io_write Signed-off-by: Dylan Yudaken Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d0f3e084da16..c9efb1d32fd2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3086,18 +3086,22 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static inline void io_kiocb_update_pos(struct io_kiocb *req) +static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) { struct kiocb *kiocb = &req->rw.kiocb; + bool is_stream = req->file->f_mode & FMODE_STREAM; if (kiocb->ki_pos == -1) { - if (!(req->file->f_mode & FMODE_STREAM)) { + if (!is_stream) { req->flags |= REQ_F_CUR_POS; kiocb->ki_pos = req->file->f_pos; + return &kiocb->ki_pos; } else { kiocb->ki_pos = 0; + return NULL; } } + return is_stream ? 
NULL : &kiocb->ki_pos; } static void kiocb_done(struct io_kiocb *req, ssize_t ret, @@ -3658,6 +3662,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_async_rw *rw; ssize_t ret, ret2; + loff_t *ppos; if (!req_has_async_data(req)) { ret = io_import_iovec(READ, req, &iovec, s, issue_flags); @@ -3688,9 +3693,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags &= ~IOCB_NOWAIT; } - io_kiocb_update_pos(req); + ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); + ret = rw_verify_area(READ, req->file, ppos, req->result); if (unlikely(ret)) { kfree(iovec); return ret; @@ -3789,6 +3794,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) struct kiocb *kiocb = &req->rw.kiocb; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; + loff_t *ppos; if (!req_has_async_data(req)) { ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); @@ -3819,9 +3825,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags &= ~IOCB_NOWAIT; } - io_kiocb_update_pos(req); + ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); + ret = rw_verify_area(WRITE, req->file, ppos, req->result); if (unlikely(ret)) goto out_free; From 63c36549737e8132e89ec6563d26523895ae3121 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Feb 2022 02:51:57 -0800 Subject: [PATCH 16/42] io_uring: documentation fixup Fix incorrect name reference in comment. ki_filp does not exist in the struct, but file does. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220224105157.1332353-1-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c9efb1d32fd2..f8fd3b2bb30d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -830,7 +830,7 @@ enum { * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can * access the file pointer through any of the sub-structs, - * or directly as just 'ki_filp' in this struct. + * or directly as just 'file' in this struct. */ struct io_kiocb { union { From e7a6c00dc77aedf27a601738ea509f1caea6d673 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Mar 2022 08:22:22 -0700 Subject: [PATCH 17/42] io_uring: add support for registering ring file descriptors Lots of workloads use multiple threads, in which case the file table is shared between them. This makes getting and putting the ring file descriptor for each io_uring_enter(2) system call more expensive, as it involves an atomic get and put for each call. Similarly to how we allow registering normal file descriptors to avoid this overhead, add support for an io_uring_register(2) API that allows to register the ring fds themselves: 1) IORING_REGISTER_RING_FDS - takes an array of io_uring_rsrc_update structs, and registers them with the task. 2) IORING_UNREGISTER_RING_FDS - takes an array of io_uring_src_update structs, and unregisters them. When a ring fd is registered, it is internally represented by an offset. This offset is returned to the application, and the application then uses this offset and sets IORING_ENTER_REGISTERED_RING for the io_uring_enter(2) system call. 
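A rough sketch of the userspace flow, using raw syscalls and the uapi definitions added by this patch (the helper name is hypothetical, error handling trimmed):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* hypothetical sketch: register ring_fd, then enter via the returned offset */
static int enter_via_registered_fd(int ring_fd, unsigned to_submit)
{
	struct io_uring_rsrc_update upd;
	int ret;

	memset(&upd, 0, sizeof(upd));
	upd.offset = -1U;	/* let the kernel pick a free slot */
	upd.data = ring_fd;	/* the real ring file descriptor */

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_RING_FDS, &upd, 1);
	if (ret < 0)
		return ret;

	/* upd.offset now holds the registered index; use it instead of the fd */
	return syscall(__NR_io_uring_enter, upd.offset, to_submit, 0,
		       IORING_ENTER_REGISTERED_RING, NULL, 0);
}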
This works just like using a registered file descriptor, rather than a real one, in an SQE, where IOSQE_FIXED_FILE gets set to tell io_uring that we're using an internal offset/descriptor rather than a real file descriptor. In initial testing, this provides a nice bump in performance for threaded applications in real world cases where the batch count (eg number of requests submitted per io_uring_enter(2) invocation) is low. In a microbenchmark, submitting NOP requests, we see the following increases in performance: Requests per syscall Baseline Registered Increase ---------------------------------------------------------------- 1 ~7030K ~8080K +15% 2 ~13120K ~14800K +13% 4 ~22740K ~25300K +11% Co-developed-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 182 +++++++++++++++++++++++++++++++++- include/linux/io_uring.h | 5 +- include/uapi/linux/io_uring.h | 13 ++- 3 files changed, 190 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f8fd3b2bb30d..daa3609b742f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -466,6 +466,11 @@ struct io_ring_ctx { }; }; +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + struct io_uring_task { /* submission side */ int cached_refs; @@ -481,6 +486,7 @@ struct io_uring_task { struct io_wq_work_list task_list; struct io_wq_work_list prior_task_list; struct callback_head task_work; + struct file **registered_rings; bool task_running; }; @@ -8788,8 +8794,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, if (unlikely(!tctx)) return -ENOMEM; + tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, + sizeof(struct file *), GFP_KERNEL); + if (unlikely(!tctx->registered_rings)) { + kfree(tctx); + return -ENOMEM; + } + ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { + kfree(tctx->registered_rings); kfree(tctx); return ret; } @@ -8798,6 +8812,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, if (IS_ERR(tctx->io_wq)) { ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); + kfree(tctx->registered_rings); kfree(tctx); return ret; } @@ -8822,6 +8837,7 @@ void __io_uring_free(struct task_struct *tsk) WARN_ON_ONCE(tctx->io_wq); WARN_ON_ONCE(tctx->cached_refs); + kfree(tctx->registered_rings); percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; @@ -10043,6 +10059,139 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } +void io_uring_unreg_ringfd(void) +{ + struct io_uring_task *tctx = current->io_uring; + int i; + + for (i = 0; i < IO_RINGFD_REG_MAX; i++) { + if (tctx->registered_rings[i]) { + fput(tctx->registered_rings[i]); + tctx->registered_rings[i] = NULL; + } + } +} + +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + for (offset = start; offset < end; offset++) { + offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[offset]) + continue; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (file->f_op != &io_uring_fops) { + fput(file); + return -EOPNOTSUPP; + } + tctx->registered_rings[offset] = file; + return offset; + } + + return -EBUSY; +} + +/* + * Register a ring fd to avoid fdget/fdput for each io_uring_enter() + * invocation. User passes in an array of struct io_uring_rsrc_update + * with ->data set to the ring_fd, and ->offset given for the desired + * index. 
If no index is desired, application may set ->offset == -1U + * and we'll find an available index. Returns number of entries + * successfully processed, or < 0 on error if none were processed. + */ +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_rsrc_update reg; + struct io_uring_task *tctx; + int ret, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + + mutex_unlock(&ctx->uring_lock); + ret = io_uring_add_tctx_node(ctx); + mutex_lock(&ctx->uring_lock); + if (ret) + return ret; + + tctx = current->io_uring; + for (i = 0; i < nr_args; i++) { + int start, end; + + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + + if (reg.offset == -1U) { + start = 0; + end = IO_RINGFD_REG_MAX; + } else { + if (reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + start = reg.offset; + end = start + 1; + } + + ret = io_ring_add_registered_fd(tctx, reg.data, start, end); + if (ret < 0) + break; + + reg.offset = ret; + if (copy_to_user(&arg[i], ®, sizeof(reg))) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + ret = -EFAULT; + break; + } + } + + return i ? i : ret; +} + +static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_task *tctx = current->io_uring; + struct io_uring_rsrc_update reg; + int ret = 0, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + if (!tctx) + return 0; + + for (i = 0; i < nr_args; i++) { + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + if (reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + + reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[reg.offset]) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + } + } + + return i ? i : ret; +} + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { @@ -10173,12 +10322,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | + IORING_ENTER_REGISTERED_RING))) return -EINVAL; - f = fdget(fd); - if (unlikely(!f.file)) - return -EBADF; + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. + */ + if (flags & IORING_ENTER_REGISTERED_RING) { + struct io_uring_task *tctx = current->io_uring; + + if (!tctx || fd >= IO_RINGFD_REG_MAX) + return -EINVAL; + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); + f.file = tctx->registered_rings[fd]; + if (unlikely(!f.file)) + return -EBADF; + } else { + f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + } ret = -EOPNOTSUPP; if (unlikely(f.file->f_op != &io_uring_fops)) @@ -10252,7 +10417,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, out: percpu_ref_put(&ctx->refs); out_fput: - fdput(f); + if (!(flags & IORING_ENTER_REGISTERED_RING)) + fdput(f); return submitted ? 
submitted : ret; } @@ -11142,6 +11308,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_iowq_max_workers(ctx, arg); break; + case IORING_REGISTER_RING_FDS: + ret = io_ringfd_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_RING_FDS: + ret = io_ringfd_unregister(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 649a4d7c241b..1814e698d861 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -9,11 +9,14 @@ struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); +void io_uring_unreg_ringfd(void); static inline void io_uring_files_cancel(void) { - if (current->io_uring) + if (current->io_uring) { + io_uring_unreg_ringfd(); __io_uring_cancel(false); + } } static inline void io_uring_task_cancel(void) { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 787f491f0d2a..42b2fe84dbcd 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -257,10 +257,11 @@ struct io_cqring_offsets { /* * io_uring_enter(2) flags */ -#define IORING_ENTER_GETEVENTS (1U << 0) -#define IORING_ENTER_SQ_WAKEUP (1U << 1) -#define IORING_ENTER_SQ_WAIT (1U << 2) -#define IORING_ENTER_EXT_ARG (1U << 3) +#define IORING_ENTER_GETEVENTS (1U << 0) +#define IORING_ENTER_SQ_WAKEUP (1U << 1) +#define IORING_ENTER_SQ_WAIT (1U << 2) +#define IORING_ENTER_EXT_ARG (1U << 3) +#define IORING_ENTER_REGISTERED_RING (1U << 4) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -325,6 +326,10 @@ enum { /* set/get max number of io-wq workers */ IORING_REGISTER_IOWQ_MAX_WORKERS = 19, + /* register/unregister io_uring fd with the ring */ + IORING_REGISTER_RING_FDS = 20, + IORING_UNREGISTER_RING_FDS = 21, + /* this goes last */ IORING_REGISTER_LAST }; From cc3cec8367cba76a8ae4c271eba8450f3efc1ba3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2022 17:46:52 -0700 Subject: [PATCH 18/42] io_uring: speedup provided buffer handling In testing high frequency workloads with provided buffers, we spend a lot of time in allocating and freeing the buffer units themselves. Rather than repeatedly free and alloc them, add a recycling cache instead. There are two caches: - ctx->io_buffers_cache. This is the one we grab from in the submission path, and it's protected by ctx->uring_lock. For inline completions, we can recycle straight back to this cache and not need any extra locking. - ctx->io_buffers_comp. If we're not under uring_lock, then we use this list to recycle buffers. It's protected by the completion_lock. On adding a new buffer, check io_buffers_cache. If it's empty, check if we can splice entries from the io_buffers_comp_cache. This reduces about 5-10% of overhead from provided buffers, bringing it pretty close to the non-provided path. 
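For reference, the userspace pattern that exercises this path looks roughly like the sketch below, assuming liburing; the group id, counts and sizes are arbitrary and pool is assumed to be NBUFS * BSIZE bytes. Each provide operation lands in io_add_buffers() on the kernel side, and a buffer-selecting request hands the chosen buffer id back in cqe->flags:

#include <liburing.h>

#define BGID	1
#define NBUFS	64
#define BSIZE	4096

/* hypothetical sketch: provide a buffer group, then let a recv pick from it */
static void provide_and_recv(struct io_uring *ring, int sockfd, void *pool)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	unsigned bid = 0;

	/* publish NBUFS buffers of BSIZE bytes under group BGID */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_provide_buffers(sqe, pool, BSIZE, NBUFS, BGID, 0);
	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	io_uring_cqe_seen(ring, cqe);

	/* recv with no buffer of its own; the kernel selects one from BGID */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_recv(sqe, sockfd, NULL, BSIZE, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = BGID;
	io_uring_submit(ring);

	io_uring_wait_cqe(ring, &cqe);
	if (cqe->flags & IORING_CQE_F_BUFFER)
		bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	io_uring_cqe_seen(ring, cqe);
	(void)bid;
}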
Signed-off-by: Jens Axboe --- fs/io_uring.c | 139 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 117 insertions(+), 22 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index daa3609b742f..0f5e999e569f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -384,6 +384,7 @@ struct io_ring_ctx { struct list_head ltimeout_list; struct list_head cq_overflow_list; struct xarray io_buffers; + struct list_head io_buffers_cache; struct xarray personalities; u32 pers_next; unsigned sq_thread_idle; @@ -426,6 +427,8 @@ struct io_ring_ctx { struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; + + struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; struct io_restriction restrictions; @@ -441,6 +444,8 @@ struct io_ring_ctx { struct llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; }; /* Keep this last, we don't need it for the fast path */ @@ -1278,24 +1283,56 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, } } -static unsigned int __io_put_kbuf(struct io_kiocb *req) +static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { struct io_buffer *kbuf = req->kbuf; unsigned int cflags; - cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - cflags |= IORING_CQE_F_BUFFER; + cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); req->flags &= ~REQ_F_BUFFER_SELECTED; - kfree(kbuf); + list_add(&kbuf->list, list); req->kbuf = NULL; return cflags; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req) +static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return 0; - return __io_put_kbuf(req); + return __io_put_kbuf(req, &req->ctx->io_buffers_comp); +} + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) +{ + unsigned int cflags; + + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; + + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. 
+ */ + if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); + } + + return cflags; } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -1443,6 +1480,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); + INIT_LIST_HEAD(&ctx->io_buffers_cache); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); @@ -1451,6 +1489,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); + INIT_LIST_HEAD(&ctx->io_buffers_pages); + INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -2329,7 +2369,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (likely(*uring_locked)) req->io_task_work.func(req, uring_locked); else - __io_req_complete_post(req, req->result, io_put_kbuf(req)); + __io_req_complete_post(req, req->result, + io_put_kbuf_comp(req)); node = next; } while (node); @@ -2679,7 +2720,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe(req, req->result, io_put_kbuf(req)); + __io_fill_cqe(req, req->result, io_put_kbuf(req, 0)); nr_events++; } @@ -2855,14 +2896,14 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - unsigned int cflags = io_put_kbuf(req); int res = req->result; if (*locked) { - io_req_complete_state(req, res, cflags); + io_req_complete_state(req, res, io_put_kbuf(req, 0)); io_req_add_compl_list(req); } else { - io_req_complete_post(req, res, cflags); + io_req_complete_post(req, res, + io_put_kbuf(req, IO_URING_F_UNLOCKED)); } } @@ -2871,7 +2912,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req)); + __io_req_complete(req, issue_flags, req->result, + io_put_kbuf(req, issue_flags)); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -4518,13 +4560,11 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, nxt = list_first_entry(&buf->list, struct io_buffer, list); list_del(&nxt->list); - kfree(nxt); if (++i == nbufs) return i; cond_resched(); } i++; - kfree(buf); xa_erase(&ctx->io_buffers, bgid); return i; @@ -4590,17 +4630,63 @@ static int io_provide_buffers_prep(struct io_kiocb *req, return 0; } -static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) +static int io_refill_buffer_cache(struct io_ring_ctx *ctx) +{ + struct io_buffer *buf; + struct page *page; + int bufs_in_page; + + /* + * Completions that don't happen inline (eg not under uring_lock) will + * add to ->io_buffers_comp. If we don't have any free buffers, check + * the completion list and splice those entries first. 
+ */ + if (!list_empty_careful(&ctx->io_buffers_comp)) { + spin_lock(&ctx->completion_lock); + if (!list_empty(&ctx->io_buffers_comp)) { + list_splice_init(&ctx->io_buffers_comp, + &ctx->io_buffers_cache); + spin_unlock(&ctx->completion_lock); + return 0; + } + spin_unlock(&ctx->completion_lock); + } + + /* + * No free buffers and no completion entries either. Allocate a new + * page worth of buffer entries and add those to our freelist. + */ + page = alloc_page(GFP_KERNEL_ACCOUNT); + if (!page) + return -ENOMEM; + + list_add(&page->lru, &ctx->io_buffers_pages); + + buf = page_address(page); + bufs_in_page = PAGE_SIZE / sizeof(*buf); + while (bufs_in_page) { + list_add_tail(&buf->list, &ctx->io_buffers_cache); + buf++; + bufs_in_page--; + } + + return 0; +} + +static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, + struct io_buffer **head) { struct io_buffer *buf; u64 addr = pbuf->addr; int i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); - if (!buf) + if (list_empty(&ctx->io_buffers_cache) && + io_refill_buffer_cache(ctx)) break; - + buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, + list); + list_del(&buf->list); buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; @@ -4632,7 +4718,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) list = head = xa_load(&ctx->io_buffers, p->bgid); - ret = io_add_buffers(p, &head); + ret = io_add_buffers(ctx, p, &head); if (ret >= 0 && !list) { ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); if (ret < 0) @@ -5229,7 +5315,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); return 0; } @@ -5284,7 +5370,8 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) out_free: req_set_fail(req); } - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); + + __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); return 0; } @@ -6704,7 +6791,7 @@ fail: static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) - io_put_kbuf(req); + io_put_kbuf_comp(req); if (req->flags & REQ_F_NEED_CLEANUP) { switch (req->opcode) { @@ -9475,6 +9562,14 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) xa_for_each(&ctx->io_buffers, index, buf) __io_remove_buffers(ctx, buf, index, -1U); + + while (!list_empty(&ctx->io_buffers_pages)) { + struct page *page; + + page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); + list_del_init(&page->lru); + __free_page(page); + } } static void io_req_caches_free(struct io_ring_ctx *ctx) From 4f57f06ce2186c31c3da52386125dc57b1cd6f96 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2022 06:27:26 -0700 Subject: [PATCH 19/42] io_uring: add support for IORING_OP_MSG_RING command This adds support for IORING_OP_MSG_RING, which allows an SQE to signal another ring. That allows either waking up someone waiting on the ring, or even passing a 64-bit value via the user_data field in the CQE. sqe->fd must contain the fd of a ring that should receive the CQE. sqe->off will be propagated to the cqe->user_data on the target ring, and sqe->len will be propagated to cqe->res. 
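A hedged sketch of the sending side, filling the raw SQE fields as described above; liburing is only used for ring setup, the helper name is hypothetical, and IORING_OP_MSG_RING comes from the uapi header as updated by this patch:

#include <string.h>
#include <liburing.h>

/* hypothetical sketch: post a CQE carrying 'data' and 'value' onto another ring */
static int msg_other_ring(struct io_uring *ring, int target_ring_fd,
			  __u64 data, __u32 value)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -1;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_MSG_RING;
	sqe->fd = target_ring_fd;	/* must be an io_uring fd */
	sqe->off = data;		/* becomes cqe->user_data on the target */
	sqe->len = value;		/* becomes cqe->res on the target */
	return io_uring_submit(ring);
}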
The results CQE will have IORING_CQE_F_MSG set in its flags, to indicate that this CQE was generated from a messaging request rather than a SQE issued locally on that ring. This effectively allows passing a 64-bit and a 32-bit quantify between the two rings. This request type has the following request specific error cases: - -EBADFD. Set if the sqe->fd doesn't point to a file descriptor that is of the io_uring type. - -EOVERFLOW. Set if we were not able to deliver a request to the target ring. Signed-off-by: Jens Axboe --- fs/io_uring.c | 55 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 3 ++ 2 files changed, 58 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0f5e999e569f..36b001365a79 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -706,6 +706,12 @@ struct io_hardlink { int flags; }; +struct io_msg { + struct file *file; + u64 user_data; + u32 len; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -871,6 +877,7 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; + struct io_msg msg; }; u8 opcode; @@ -1121,6 +1128,9 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MKDIRAT] = {}, [IORING_OP_SYMLINKAT] = {}, [IORING_OP_LINKAT] = {}, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -4322,6 +4332,46 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_msg_ring_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->rw_flags || sqe->splice_fd_in || sqe->buf_index || + sqe->personality)) + return -EINVAL; + + if (req->file->f_op != &io_uring_fops) + return -EBADFD; + + req->msg.user_data = READ_ONCE(sqe->off); + req->msg.len = READ_ONCE(sqe->len); + return 0; +} + +static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx; + struct io_msg *msg = &req->msg; + int ret = -EOVERFLOW; + bool filled; + + target_ctx = req->file->private_data; + + spin_lock(&target_ctx->completion_lock); + filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, + IORING_CQE_F_MSG); + io_commit_cqring(target_ctx); + spin_unlock(&target_ctx->completion_lock); + + if (filled) { + io_cqring_ev_posted(target_ctx); + ret = 0; + } + + __io_req_complete(req, issue_flags, ret, 0); + return 0; +} + static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -6700,6 +6750,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_symlinkat_prep(req, sqe); case IORING_OP_LINKAT: return io_linkat_prep(req, sqe); + case IORING_OP_MSG_RING: + return io_msg_ring_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6983,6 +7035,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_LINKAT: ret = io_linkat(req, issue_flags); break; + case IORING_OP_MSG_RING: + ret = io_msg_ring(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 42b2fe84dbcd..8bd4bfdd9a89 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -143,6 +143,7 @@ enum { IORING_OP_MKDIRAT, IORING_OP_SYMLINKAT, IORING_OP_LINKAT, + IORING_OP_MSG_RING, /* this goes last, obviously */ IORING_OP_LAST, @@ -199,9 +200,11 @@ struct io_uring_cqe { * * 
IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries + * IORING_CQE_F_MSG If set, CQE was generated with IORING_OP_MSG_RING */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) +#define IORING_CQE_F_MSG (1U << 2) enum { IORING_CQE_BUFFER_SHIFT = 16, From 950e79dd73131a263b2dd7fec521ceafd28d2724 Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Tue, 8 Mar 2022 17:17:21 -0500 Subject: [PATCH 20/42] io_uring: minor io_cqring_wait() optimization Move up the block manipulating the sig variable to execute code that may encounter an error and exit first before continuing executing the rest of the function and avoid useless computations Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/84513f7cc1b1fb31d8f4cb910aee033391d036b4.1646777484.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 36b001365a79..3d2d47540e1b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7919,14 +7919,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, break; } while (1); - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - } - if (sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) @@ -7940,6 +7932,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } + if (uts) { + struct timespec64 ts; + + if (get_timespec64(&ts, uts)) + return -EFAULT; + timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + } + init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); From adc8682ec69012b68d5ab7123e246d2ad9a6f94b Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Tue, 8 Mar 2022 17:17:26 -0500 Subject: [PATCH 21/42] io_uring: Add support for napi_busy_poll The sqpoll thread can be used for performing the napi busy poll in a similar way that it does io polling for file systems supporting direct access bypassing the page cache. The other way that io_uring can be used for napi busy poll is by calling io_uring_enter() to get events. If the user specify a timeout value, it is distributed between polling and sleeping by using the systemwide setting /proc/sys/net/core/busy_poll. 
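As a hedged example of the io_uring_enter() path, assuming liburing, a connected UDP socket and net.core.busy_poll set to 200 (microseconds); the names and numbers here are illustrative only:

#include <liburing.h>

/* hypothetical sketch: receive with a 1ms wait; with net.core.busy_poll=200
 * the kernel busy polls the socket's NAPI context for up to 200us and sleeps
 * for the remaining time. */
static int recv_with_busy_poll(struct io_uring *ring, int sockfd,
			       void *buf, unsigned len)
{
	struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_recv(sqe, sockfd, buf, len, 0);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe_timeout(ring, &cqe, &ts);
	if (!ret)
		io_uring_cqe_seen(ring, cqe);
	return ret;
}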
The changes have been tested with this program: https://github.com/lano1106/io_uring_udp_ping and the result is: Without sqpoll: NAPI busy loop disabled: rtt min/avg/max/mdev = 40.631/42.050/58.667/1.547 us NAPI busy loop enabled: rtt min/avg/max/mdev = 30.619/31.753/61.433/1.456 us With sqpoll: NAPI busy loop disabled: rtt min/avg/max/mdev = 42.087/44.438/59.508/1.533 us NAPI busy loop enabled: rtt min/avg/max/mdev = 35.779/37.347/52.201/0.924 us Co-developed-by: Hao Xu Signed-off-by: Hao Xu Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/810bd9408ffc510ff08269e78dca9df4af0b9e4e.1646777484.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 232 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 231 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3d2d47540e1b..3d30f7b07677 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -401,6 +402,11 @@ struct io_ring_ctx { struct list_head sqd_list; unsigned long check_cq_overflow; +#ifdef CONFIG_NET_RX_BUSY_POLL + /* used to track busy poll napi_id */ + struct list_head napi_list; + spinlock_t napi_lock; /* napi_list lock */ +#endif struct { unsigned cached_cq_tail; @@ -1513,6 +1519,10 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); +#ifdef CONFIG_NET_RX_BUSY_POLL + INIT_LIST_HEAD(&ctx->napi_list); + spin_lock_init(&ctx->napi_lock); +#endif return ctx; err: kfree(ctx->dummy_ubuf); @@ -5581,6 +5591,108 @@ IO_NETOP_FN(send); IO_NETOP_FN(recv); #endif /* CONFIG_NET */ +#ifdef CONFIG_NET_RX_BUSY_POLL + +#define NAPI_TIMEOUT (60 * SEC_CONVERSION) + +struct napi_entry { + struct list_head list; + unsigned int napi_id; + unsigned long timeout; +}; + +/* + * Add busy poll NAPI ID from sk. 
+ */ +static void io_add_napi(struct file *file, struct io_ring_ctx *ctx) +{ + unsigned int napi_id; + struct socket *sock; + struct sock *sk; + struct napi_entry *ne; + + if (!net_busy_loop_on()) + return; + + sock = sock_from_file(file); + if (!sock) + return; + + sk = sock->sk; + if (!sk) + return; + + napi_id = READ_ONCE(sk->sk_napi_id); + + /* Non-NAPI IDs can be rejected */ + if (napi_id < MIN_NAPI_ID) + return; + + spin_lock(&ctx->napi_lock); + list_for_each_entry(ne, &ctx->napi_list, list) { + if (ne->napi_id == napi_id) { + ne->timeout = jiffies + NAPI_TIMEOUT; + goto out; + } + } + + ne = kmalloc(sizeof(*ne), GFP_NOWAIT); + if (!ne) + goto out; + + ne->napi_id = napi_id; + ne->timeout = jiffies + NAPI_TIMEOUT; + list_add_tail(&ne->list, &ctx->napi_list); +out: + spin_unlock(&ctx->napi_lock); +} + +static inline void io_check_napi_entry_timeout(struct napi_entry *ne) +{ + if (time_after(jiffies, ne->timeout)) { + list_del(&ne->list); + kfree(ne); + } +} + +/* + * Busy poll if globally on and supporting sockets found + */ +static bool io_napi_busy_loop(struct list_head *napi_list) +{ + struct napi_entry *ne, *n; + + list_for_each_entry_safe(ne, n, napi_list, list) { + napi_busy_loop(ne->napi_id, NULL, NULL, true, + BUSY_POLL_BUDGET); + io_check_napi_entry_timeout(ne); + } + return !list_empty(napi_list); +} + +static void io_free_napi_list(struct io_ring_ctx *ctx) +{ + spin_lock(&ctx->napi_lock); + while (!list_empty(&ctx->napi_list)) { + struct napi_entry *ne = + list_first_entry(&ctx->napi_list, struct napi_entry, + list); + + list_del(&ne->list); + kfree(ne); + } + spin_unlock(&ctx->napi_lock); +} +#else +static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx) +{ +} + +static inline void io_free_napi_list(struct io_ring_ctx *ctx) +{ +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + struct io_poll_table { struct poll_table_struct pt; struct io_kiocb *req; @@ -5727,6 +5839,7 @@ static int io_poll_check_events(struct io_kiocb *req) if (unlikely(!filled)) return -ECANCELED; io_cqring_ev_posted(ctx); + io_add_napi(req->file, ctx); } else if (req->result) { return 0; } @@ -5959,6 +6072,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, __io_poll_execute(req, mask); return 0; } + io_add_napi(req->file, req->ctx); /* * Release ownership. 
If someone tried to queue a tw while it was @@ -7706,7 +7820,13 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) !(ctx->flags & IORING_SETUP_R_DISABLED)) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - +#ifdef CONFIG_NET_RX_BUSY_POLL + spin_lock(&ctx->napi_lock); + if (!list_empty(&ctx->napi_list) && + io_napi_busy_loop(&ctx->napi_list)) + ++ret; + spin_unlock(&ctx->napi_lock); +#endif if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) wake_up(&ctx->sqo_sq_wait); if (creds) @@ -7837,6 +7957,9 @@ struct io_wait_queue { struct io_ring_ctx *ctx; unsigned cq_tail; unsigned nr_timeouts; +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned busy_poll_to; +#endif }; static inline bool io_should_wake(struct io_wait_queue *iowq) @@ -7898,6 +8021,87 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, return 1; } +#ifdef CONFIG_NET_RX_BUSY_POLL +static void io_adjust_busy_loop_timeout(struct timespec64 *ts, + struct io_wait_queue *iowq) +{ + unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll); + struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to); + + if (timespec64_compare(ts, &pollto) > 0) { + *ts = timespec64_sub(*ts, pollto); + iowq->busy_poll_to = busy_poll_to; + } else { + u64 to = timespec64_to_ns(ts); + + do_div(to, 1000); + iowq->busy_poll_to = to; + ts->tv_sec = 0; + ts->tv_nsec = 0; + } +} + +static inline bool io_busy_loop_timeout(unsigned long start_time, + unsigned long bp_usec) +{ + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); + + return time_after(now, end_time); + } + return true; +} + +static bool io_busy_loop_end(void *p, unsigned long start_time) +{ + struct io_wait_queue *iowq = p; + + return signal_pending(current) || + io_should_wake(iowq) || + io_busy_loop_timeout(start_time, iowq->busy_poll_to); +} + +static void io_blocking_napi_busy_loop(struct list_head *napi_list, + struct io_wait_queue *iowq) +{ + unsigned long start_time = + list_is_singular(napi_list) ? 0 : + busy_loop_current_time(); + + do { + if (list_is_singular(napi_list)) { + struct napi_entry *ne = + list_first_entry(napi_list, + struct napi_entry, list); + + napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq, + true, BUSY_POLL_BUDGET); + io_check_napi_entry_timeout(ne); + break; + } + } while (io_napi_busy_loop(napi_list) && + !io_busy_loop_end(iowq, start_time)); +} + +static void io_putback_napi_list(struct io_ring_ctx *ctx, + struct list_head *napi_list) +{ + struct napi_entry *cne, *lne; + + spin_lock(&ctx->napi_lock); + list_for_each_entry(cne, &ctx->napi_list, list) + list_for_each_entry(lne, napi_list, list) + if (cne->napi_id == lne->napi_id) { + list_del(&lne->list); + kfree(lne); + break; + } + list_splice(napi_list, &ctx->napi_list); + spin_unlock(&ctx->napi_lock); +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. 
@@ -7910,6 +8114,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, struct io_rings *rings = ctx->rings; ktime_t timeout = KTIME_MAX; int ret; +#ifdef CONFIG_NET_RX_BUSY_POLL + LIST_HEAD(local_napi_list); +#endif do { io_cqring_overflow_flush(ctx); @@ -7932,13 +8139,29 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } +#ifdef CONFIG_NET_RX_BUSY_POLL + iowq.busy_poll_to = 0; + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { + spin_lock(&ctx->napi_lock); + list_splice_init(&ctx->napi_list, &local_napi_list); + spin_unlock(&ctx->napi_lock); + } +#endif if (uts) { struct timespec64 ts; if (get_timespec64(&ts, uts)) return -EFAULT; +#ifdef CONFIG_NET_RX_BUSY_POLL + if (!list_empty(&local_napi_list)) + io_adjust_busy_loop_timeout(&ts, &iowq); +#endif timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } +#ifdef CONFIG_NET_RX_BUSY_POLL + else if (!list_empty(&local_napi_list)) + iowq.busy_poll_to = READ_ONCE(sysctl_net_busy_poll); +#endif init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; @@ -7948,6 +8171,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; trace_io_uring_cqring_wait(ctx, min_events); +#ifdef CONFIG_NET_RX_BUSY_POLL + if (iowq.busy_poll_to) + io_blocking_napi_busy_loop(&local_napi_list, &iowq); + if (!list_empty(&local_napi_list)) + io_putback_napi_list(ctx, &local_napi_list); +#endif do { /* if we can't even flush overflow, don't wait for more */ if (!io_cqring_overflow_flush(ctx)) { @@ -9709,6 +9938,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); + io_free_napi_list(ctx); kfree(ctx->cancel_hash); kfree(ctx->dummy_ubuf); kfree(ctx); From 9af177ee3ef14c17ef10893c257fa4e2008a3d74 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Mar 2022 16:46:07 -0700 Subject: [PATCH 22/42] io_uring: retry early for reads if we can poll Most of the logic in io_read() deals with regular files, and in some ways it would make sense to split the handling into S_IFREG and others. But at least for retry, we don't need to bother setting up a bunch of state just to abort in the loop later. In particular, don't bother forcing setup of async data for a normal non-vectored read when we don't need it. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3d30f7b07677..4d8366bc226f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3773,6 +3773,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; + /* if we can poll, just do that */ + if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) + return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; From 2be2eb02e2f5a096c351e5b70c46cfef259dabcd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2022 09:54:25 -0700 Subject: [PATCH 23/42] io_uring: ensure reads re-import for selected buffers If we drop buffers between scheduling a retry, then we need to re-import when we start the request again. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4d8366bc226f..584b36dcd0aa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3737,6 +3737,16 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } else { + /* + * Safe and required to re-import if we're using provided + * buffers, as we dropped the selected one before retry. + */ + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } + rw = req->async_data; s = &rw->s; /* From b1c62645758eb438179e3a0769168cb7b0a94d6b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Mar 2022 11:27:52 -0700 Subject: [PATCH 24/42] io_uring: recycle provided buffers if request goes async If we are using provided buffers, it's less than useful to have a buffer selected and pinned if a request needs to go async or arms poll for notification trigger on when we can process it. Recycle the buffer in those events, so we don't pin it for the duration of the request. Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 584b36dcd0aa..3145c9cacee0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -269,6 +269,7 @@ struct io_buffer { __u64 addr; __u32 len; __u16 bid; + __u16 bgid; }; struct io_restriction { @@ -1351,6 +1352,36 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, return cflags; } +static void io_kbuf_recycle(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer *head, *buf; + + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return; + + lockdep_assert_held(&ctx->uring_lock); + + buf = req->kbuf; + + head = xa_load(&ctx->io_buffers, buf->bgid); + if (head) { + list_add(&buf->list, &head->list); + } else { + int ret; + + INIT_LIST_HEAD(&buf->list); + + /* if we fail, just leave buffer attached */ + ret = xa_insert(&ctx->io_buffers, buf->bgid, buf, GFP_KERNEL); + if (unlikely(ret < 0)) + return; + } + + req->flags &= ~REQ_F_BUFFER_SELECTED; + req->kbuf = NULL; +} + static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) __must_hold(&req->ctx->timeout_lock) @@ -4763,6 +4794,7 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; + buf->bgid = pbuf->bgid; addr += pbuf->len; bid++; if (!*head) { @@ -7395,8 +7427,12 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ + io_kbuf_recycle(req); io_queue_async_work(req, NULL); break; + case IO_APOLL_OK: + io_kbuf_recycle(req); + break; } if (linked_timeout) From bcbb7bf6ccde7cb969a5642879832bc84ebf06a3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2022 12:59:35 -0700 Subject: [PATCH 25/42] io_uring: allow submissions to continue on error By default, io_uring will stop submitting a batch of requests if we run into an error submitting a request. This isn't strictly necessary, as the error result is passed out-of-band via a CQE anyway. And it can be a bit confusing for some applications. Provide a way to setup a ring that will continue submitting on error, when the error CQE has been posted. There's still one case that will break out of submission. 
If we fail allocating a request, then we'll still return -ENOMEM. We could in theory post a CQE for that condition too even if we never got a request. Leave that for a potential followup. Reported-by: Dylan Yudaken Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++++++--- include/uapi/linux/io_uring.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3145c9cacee0..229b31d644ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7801,8 +7801,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } /* will complete beyond this point, count as submitted */ submitted++; - if (io_submit_sqe(ctx, req, sqe)) - break; + if (io_submit_sqe(ctx, req, sqe)) { + /* + * Continue submitting even for sqe failure if the + * ring was setup with IORING_SETUP_SUBMIT_ALL + */ + if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) + break; + } } while (submitted < nr); if (unlikely(submitted != nr)) { @@ -11265,7 +11271,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED)) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8bd4bfdd9a89..d2be4eb22008 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -101,6 +101,7 @@ enum { #define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */ #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ +#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ enum { IORING_OP_NOP, From f3b6a41eb2bbdf545a42e54d637c34f4b1fdf5b9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 12 Mar 2022 06:50:13 -0700 Subject: [PATCH 26/42] io_uring: remove duplicated member check for io_msg_ring_prep() Julia and the kernel test robot report that the prep handling for this command inadvertently checks one field twice: fs/io_uring.c:4338:42-56: duplicated argument to && or || Get rid of it. Reported-by: kernel test robot Reported-by: Julia Lawall Fixes: 4f57f06ce218 ("io_uring: add support for IORING_OP_MSG_RING command") Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 229b31d644ef..299154efcd8a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4389,9 +4389,8 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->rw_flags || sqe->splice_fd_in || sqe->buf_index || - sqe->personality)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || + sqe->splice_fd_in || sqe->buf_index || sqe->personality)) return -EINVAL; if (req->file->f_op != &io_uring_fops) From 4d9237e32c5db4f07f749a7ff1dd9b366bf3600e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Mar 2022 10:54:08 -0600 Subject: [PATCH 27/42] io_uring: recycle apoll_poll entries Particularly for networked workloads, io_uring intensively uses its poll based backend to get a notification when data/space is available. 
Profiling workloads, we see 3-4% of alloc+free that is directly attributed to just the apoll allocation and free (and the rest being skb alloc+free). For the fast path, we have ctx->uring_lock held already for both issue and the inline completions, and we can utilize that to avoid any extra locking needed to have a basic recycling cache for the apoll entries on both the alloc and free side. Double poll still requires an allocation. But those are rare and not a fast path item. With the simple cache in place, we see a 3-4% reduction in overhead for the workload. Signed-off-by: Jens Axboe --- fs/io_uring.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 299154efcd8a..b17cf54653df 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -387,6 +387,7 @@ struct io_ring_ctx { struct list_head cq_overflow_list; struct xarray io_buffers; struct list_head io_buffers_cache; + struct list_head apoll_cache; struct xarray personalities; u32 pers_next; unsigned sq_thread_idle; @@ -1528,6 +1529,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); + INIT_LIST_HEAD(&ctx->apoll_cache); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); @@ -2650,6 +2652,15 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe(req, req->result, req->cflags); + if ((req->flags & REQ_F_POLLED) && req->apoll) { + struct async_poll *apoll = req->apoll; + + if (apoll->double_poll) + kfree(apoll->double_poll); + list_add(&apoll->poll.wait.entry, + &ctx->apoll_cache); + req->flags &= ~REQ_F_POLLED; + } } io_commit_cqring(ctx); @@ -6143,7 +6154,7 @@ enum { IO_APOLL_READY }; -static int io_arm_poll_handler(struct io_kiocb *req) +static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; @@ -6168,9 +6179,16 @@ static int io_arm_poll_handler(struct io_kiocb *req) mask |= POLLOUT | POLLWRNORM; } - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) - return IO_APOLL_ABORTED; + if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del_init(&apoll->poll.wait.entry); + } else { + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return IO_APOLL_ABORTED; + } apoll->double_poll = NULL; req->apoll = apoll; req->flags |= REQ_F_POLLED; @@ -7271,7 +7289,7 @@ static void io_wq_submit_work(struct io_wq_work *work) continue; } - if (io_arm_poll_handler(req) == IO_APOLL_OK) + if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK) return; /* aborted or ready, in either case retry blocking */ needs_poll = false; @@ -7417,7 +7435,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) { struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); - switch (io_arm_poll_handler(req)) { + switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_req_task_queue(req); break; @@ -9938,6 +9956,18 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data) wait_for_completion(&data->done); } +static void io_flush_apoll_cache(struct io_ring_ctx *ctx) +{ + struct async_poll *apoll; + + while 
(!list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del(&apoll->poll.wait.entry); + kfree(apoll); + } +} + static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -9960,6 +9990,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->rings) __io_cqring_overflow_flush(ctx, true); io_eventfd_unregister(ctx); + io_flush_apoll_cache(ctx); mutex_unlock(&ctx->uring_lock); io_destroy_buffers(ctx); if (ctx->sq_creds) From 052ebf1fbb1cab86b145a68d80219c8c57321cbd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 16 Mar 2022 02:52:04 -0700 Subject: [PATCH 28/42] io_uring: make tracing format consistent Make the tracing formatting for user_data and flags consistent. Having consistent formatting allows one for example to grep for a specific user_data/flags and be able to trace a single sqe through easily. Change user_data to 0x%llx and flags to 0x%x everywhere. The '0x' is useful to disambiguate for example "user_data 100". Additionally remove the '=' for flags in io_uring_req_failed, again for consistency. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220316095204.2191498-1-dylany@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 18d4341c581c..cddf5b6fbeb4 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -44,7 +44,7 @@ TRACE_EVENT(io_uring_create, __entry->flags = flags; ), - TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d", + TP_printk("ring %p, fd %d sq size %d, cq size %d, flags 0x%x", __entry->ctx, __entry->fd, __entry->sq_entries, __entry->cq_entries, __entry->flags) ); @@ -125,7 +125,7 @@ TRACE_EVENT(io_uring_file_get, __entry->fd = fd; ), - TP_printk("ring %p, req %p, user_data %llu, fd %d", + TP_printk("ring %p, req %p, user_data 0x%llx, fd %d", __entry->ctx, __entry->req, __entry->user_data, __entry->fd) ); @@ -169,7 +169,7 @@ TRACE_EVENT(io_uring_queue_async_work, __entry->rw = rw; ), - TP_printk("ring %p, request %p, user_data %llu, opcode %d, flags %d, %s queue, work %p", + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, flags 0x%x, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, __entry->flags, __entry->rw ? 
"hashed" : "normal", __entry->work) ); @@ -205,7 +205,7 @@ TRACE_EVENT(io_uring_defer, __entry->opcode = opcode; ), - TP_printk("ring %p, request %p, user_data %llu, opcode %d", + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d", __entry->ctx, __entry->req, __entry->data, __entry->opcode) ); @@ -305,7 +305,7 @@ TRACE_EVENT(io_uring_fail_link, __entry->link = link; ), - TP_printk("ring %p, request %p, user_data %llu, opcode %d, link %p", + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, link %p", __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, __entry->link) ); @@ -342,9 +342,9 @@ TRACE_EVENT(io_uring_complete, __entry->cflags = cflags; ), - TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags %x", + TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x", __entry->ctx, __entry->req, - (unsigned long long)__entry->user_data, + __entry->user_data, __entry->res, __entry->cflags) ); @@ -389,7 +389,7 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->sq_thread = sq_thread; ), - TP_printk("ring %p, req %p, user_data %llu, opcode %d, flags %u, " + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, flags 0x%x, " "non block %d, sq_thread %d", __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, __entry->flags, __entry->force_nonblock, __entry->sq_thread) @@ -433,7 +433,7 @@ TRACE_EVENT(io_uring_poll_arm, __entry->events = events; ), - TP_printk("ring %p, req %p, user_data %llu, opcode %d, mask 0x%x, events 0x%x", + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask 0x%x, events 0x%x", __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, __entry->mask, __entry->events) ); @@ -470,7 +470,7 @@ TRACE_EVENT(io_uring_task_add, __entry->mask = mask; ), - TP_printk("ring %p, req %p, user_data %llu, opcode %d, mask %x", + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask %x", __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, __entry->mask) ); @@ -529,8 +529,8 @@ TRACE_EVENT(io_uring_req_failed, __entry->error = error; ), - TP_printk("ring %p, req %p, user_data %llu, " - "op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, " + TP_printk("ring %p, req %p, user_data 0x%llx, " + "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "len=%u, rw_flags=0x%x, buf_index=%d, " "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", __entry->ctx, __entry->req, __entry->user_data, From 521d61fc760aebdbf8938347a40c9538c0a70034 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Mar 2022 12:53:23 -0600 Subject: [PATCH 29/42] io_uring: move req->poll_refs into previous struct hole This serves two purposes: - We now have the last cacheline mostly unused for generic workloads, instead of having to pull in the poll refs explicitly for workloads that rely on poll arming. - It shrinks the io_kiocb from 232 to 224 bytes. Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b17cf54653df..fa4e2cb47e56 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -908,6 +908,7 @@ struct io_kiocb { /* used by request caches, completion batching and iopoll */ struct io_wq_work_node comp_list; atomic_t refs; + atomic_t poll_refs; struct io_kiocb *link; struct io_task_work io_task_work; /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ @@ -916,12 +917,11 @@ struct io_kiocb { struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - struct io_wq_work work; /* custom credentials, valid IFF REQ_F_CREDS is set */ - const struct cred *creds; /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; - atomic_t poll_refs; + const struct cred *creds; + struct io_wq_work work; }; struct io_tctx_node { From 81459350d581e958ee9c6e76031f77333881c23c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Mar 2022 16:55:05 -0600 Subject: [PATCH 30/42] io_uring: cache req->apoll->events in req->cflags When we arm poll on behalf of a different type of request, like a network receive, then we allocate req->apoll as our poll entry. Running network workloads shows io_poll_check_events() as the most expensive part of io_uring, and it's all due to having to pull in req->apoll instead of just the request which we have hot already. Cache poll->events in req->cflags, which isn't used until the request completes anyway. This isn't strictly needed for regular poll, where req->poll.events is used and thus already hot, but for the sake of unification we do it all around. This saves 3-4% of overhead in certain request workloads. Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fa4e2cb47e56..bfddad7a14ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5876,13 +5876,13 @@ static int io_poll_check_events(struct io_kiocb *req) return -ECANCELED; if (!req->result) { - struct poll_table_struct pt = { ._key = poll->events }; + struct poll_table_struct pt = { ._key = req->cflags }; - req->result = vfs_poll(req->file, &pt) & poll->events; + req->result = vfs_poll(req->file, &pt) & req->cflags; } /* multishot, just fill an CQE and proceed */ - if (req->result && !(poll->events & EPOLLONESHOT)) { + if (req->result && !(req->cflags & EPOLLONESHOT)) { __poll_t mask = mangle_poll(req->result & poll->events); bool filled; @@ -5953,9 +5953,16 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask) +static void __io_poll_execute(struct io_kiocb *req, int mask, int events) { req->result = mask; + /* + * This is useful for poll that is armed on behalf of another + * request, and where the wakeup path could be on a different + * CPU. We want to avoid pulling in req->apoll->events for that + * case. 
+ */ + req->cflags = events; if (req->opcode == IORING_OP_POLL_ADD) req->io_task_work.func = io_poll_task_func; else @@ -5965,17 +5972,17 @@ static void __io_poll_execute(struct io_kiocb *req, int mask) io_req_task_work_add(req, false); } -static inline void io_poll_execute(struct io_kiocb *req, int res) +static inline void io_poll_execute(struct io_kiocb *req, int res, int events) { if (io_poll_get_ownership(req)) - __io_poll_execute(req, res); + __io_poll_execute(req, res, events); } static void io_poll_cancel_req(struct io_kiocb *req) { io_poll_mark_cancelled(req); /* kick tw, which should complete the request */ - io_poll_execute(req, 0); + io_poll_execute(req, 0, 0); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -5989,7 +5996,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (unlikely(mask & POLLFREE)) { io_poll_mark_cancelled(req); /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0); + io_poll_execute(req, 0, poll->events); /* * If the waitqueue is being freed early but someone is already @@ -6020,7 +6027,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del_init(&poll->wait.entry); poll->head = NULL; } - __io_poll_execute(req, mask); + __io_poll_execute(req, mask, poll->events); } return 1; } @@ -6124,7 +6131,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, /* can't multishot if failed, just queue the event we've got */ if (unlikely(ipt->error || !ipt->nr_entries)) poll->events |= EPOLLONESHOT; - __io_poll_execute(req, mask); + __io_poll_execute(req, mask, poll->events); return 0; } io_add_napi(req->file, req->ctx); @@ -6135,7 +6142,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, */ v = atomic_dec_return(&req->poll_refs); if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0); + __io_poll_execute(req, 0, poll->events); return 0; } @@ -6333,7 +6340,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; io_req_set_refcount(req); - poll->events = io_poll_parse_events(sqe, flags); + req->cflags = poll->events = io_poll_parse_events(sqe, flags); return 0; } From 91eac1c69c202d9dad8bf717ae5b92db70bfe5cf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Mar 2022 16:59:10 -0600 Subject: [PATCH 31/42] io_uring: cache poll/double-poll state with a request flag With commit "io_uring: cache req->apoll->events in req->cflags" applied, we now have just io_poll_remove_entries() dipping into req->apoll when it isn't strictly necessary. Mark poll and double-poll with a flag, so we know if we need to look at apoll->double_poll. This avoids pulling in those cachelines if we don't need them. The common case is that the poll wake handler already removed these entries while hot off the completion path. 
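The pattern generalizes: keep a one-bit summary of "is there anything to tear down" in flags that already live in a hot cacheline, and only chase the cold pointers when a bit says so. A minimal user-space sketch of that idea follows; the types and names are invented for illustration and are not the kernel's io_kiocb/async_poll structures.

#include <stdio.h>

#define F_SINGLE_POLL	(1u << 0)	/* a single poll entry may be armed */
#define F_DOUBLE_POLL	(1u << 1)	/* a double poll entry may be armed */

struct poll_entry { int armed; };

struct request {
	unsigned int flags;		/* hot: sits with the rest of the request */
	struct poll_entry *poll;	/* cold: separate allocation/cacheline */
	struct poll_entry *double_poll;
};

static void remove_entry(struct poll_entry *p)
{
	if (p)
		p->armed = 0;
}

static void remove_poll_entries(struct request *req)
{
	/* cheap flag test first; don't touch the cold pointers if unset */
	if (!(req->flags & (F_SINGLE_POLL | F_DOUBLE_POLL)))
		return;
	if (req->flags & F_SINGLE_POLL)
		remove_entry(req->poll);
	if (req->flags & F_DOUBLE_POLL)
		remove_entry(req->double_poll);
	req->flags &= ~(F_SINGLE_POLL | F_DOUBLE_POLL);
}

int main(void)
{
	struct request req = { .flags = 0 };	/* common case: nothing armed */

	remove_poll_entries(&req);		/* returns before any dereference */
	printf("flags: 0x%x\n", req.flags);
	return 0;
}

The common case then costs a single predicted branch on data that is already resident, which is what the REQ_F_SINGLE_POLL/REQ_F_DOUBLE_POLL bits buy here.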
Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bfddad7a14ef..5b5f48f0f81e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -771,6 +771,8 @@ enum { REQ_F_ARM_LTIMEOUT_BIT, REQ_F_ASYNC_DATA_BIT, REQ_F_SKIP_LINK_CQES_BIT, + REQ_F_SINGLE_POLL_BIT, + REQ_F_DOUBLE_POLL_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -829,6 +831,10 @@ enum { REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), /* don't post CQEs while failing linked requests */ REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + /* single poll may be active */ + REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + /* double poll may active */ + REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), }; struct async_poll { @@ -5823,8 +5829,12 @@ static inline void io_poll_remove_entry(struct io_poll_iocb *poll) static void io_poll_remove_entries(struct io_kiocb *req) { - struct io_poll_iocb *poll = io_poll_get_single(req); - struct io_poll_iocb *poll_double = io_poll_get_double(req); + /* + * Nothing to do if neither of those flags are set. Avoid dipping + * into the poll/apoll/double cachelines if we can. + */ + if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) + return; /* * While we hold the waitqueue lock and the waitqueue is nonempty, @@ -5842,9 +5852,10 @@ static void io_poll_remove_entries(struct io_kiocb *req) * In that case, only RCU prevents the queue memory from being freed. */ rcu_read_lock(); - io_poll_remove_entry(poll); - if (poll_double) - io_poll_remove_entry(poll_double); + if (req->flags & REQ_F_SINGLE_POLL) + io_poll_remove_entry(io_poll_get_single(req)); + if (req->flags & REQ_F_DOUBLE_POLL) + io_poll_remove_entry(io_poll_get_double(req)); rcu_read_unlock(); } @@ -6026,6 +6037,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); poll->head = NULL; + req->flags &= ~REQ_F_SINGLE_POLL; } __io_poll_execute(req, mask, poll->events); } @@ -6062,12 +6074,14 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } + req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; } + req->flags |= REQ_F_SINGLE_POLL; pt->nr_entries++; poll->head = head; poll->wait.private = req; From ae4da18941c1c13a9bd6f1d39888ca9a4ff3db91 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Mar 2022 02:03:36 +0000 Subject: [PATCH 32/42] io_uring: normilise naming for fill_cqe* Restore consistency in __io_fill_cqe* like helpers, always honouring "io_" prefix and adding "req" when we're passing in a request. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bd016ff5c1a4f74687828069d2619d8a65e0c6d7.1647481208.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5b5f48f0f81e..b9013fc14f74 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2019,7 +2019,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static inline bool __fill_cqe(struct io_ring_ctx *ctx, u64 user_data, +static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -2039,16 +2039,16 @@ static inline bool __fill_cqe(struct io_ring_ctx *ctx, u64 user_data, return io_cqring_event_overflow(ctx, user_data, res, cflags); } -static inline bool __io_fill_cqe(struct io_kiocb *req, s32 res, u32 cflags) +static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); - return __fill_cqe(req->ctx, req->user_data, res, cflags); + return __io_fill_cqe(req->ctx, req->user_data, res, cflags); } static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(req, res, cflags); + __io_fill_cqe_req(req, res, cflags); } static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, @@ -2056,7 +2056,7 @@ static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, { ctx->cq_extra++; trace_io_uring_complete(ctx, NULL, user_data, res, cflags); - return __fill_cqe(ctx, user_data, res, cflags); + return __io_fill_cqe(ctx, user_data, res, cflags); } static void __io_req_complete_post(struct io_kiocb *req, s32 res, @@ -2065,7 +2065,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(req, res, cflags); + __io_fill_cqe_req(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. @@ -2657,7 +2657,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) comp_list); if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe(req, req->result, req->cflags); + __io_fill_cqe_req(req, req->result, req->cflags); if ((req->flags & REQ_F_POLLED) && req->apoll) { struct async_poll *apoll = req->apoll; @@ -2788,7 +2788,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe(req, req->result, io_put_kbuf(req, 0)); + __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); nr_events++; } From 6695490dc85781fe98b782f36f27c13710dbc921 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Mar 2022 02:03:37 +0000 Subject: [PATCH 33/42] io_uring: refactor timeout cancellation cqe posting io_fill_cqe*() is not always the best way to post CQEs just because there is enough of infrastructure on top. Replace a raw call to a variant of it inside of io_timeout_cancel(), which also saves us some bloating and might help with batching later. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46113ec4345764b4aef3b384ce38cceabaeedcbb.1647481208.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b9013fc14f74..15de14d4331b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6471,10 +6471,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (IS_ERR(req)) return PTR_ERR(req); - - req_set_fail(req); - io_fill_cqe_req(req, -ECANCELED, 0); - io_put_req_deferred(req); + io_req_task_queue_fail(req, -ECANCELED); return 0; } From 3b2b78a8eb7cc38d207c5ee516769bc3f44d19ea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Mar 2022 02:03:38 +0000 Subject: [PATCH 34/42] io_uring: extend provided buf return to fails It's never a good idea to put provided buffers without notifying userspace, as that leads to userspace leaks, so add io_put_kbuf() in io_req_complete_failed(). The fail helper is called by all sorts of requests, but it's still safe to do as io_put_kbuf() will return 0 for all requests that don't support, and hence don't expect, provided buffers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a4880106fcf199d5810707fe2d17126fcdf18bc4.1647481208.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 15de14d4331b..a90844bf9b9c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2124,7 +2124,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res) static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); - io_req_complete_post(req, res, 0); + io_req_complete_post(req, res, io_put_kbuf(req, 0)); } static void io_req_complete_fail_submit(struct io_kiocb *req) From b91ef1872869d99cd42e908eb9754b81115c2c05 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 16 Mar 2022 20:24:47 -0600 Subject: [PATCH 35/42] io_uring: fix provided buffer return on failure for kiocb_done() Use io_req_complete_failed() in kiocb_done(). This cleans up the code, but also ensures that a provided buffer is correctly freed on failure. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a4880106fcf199d5810707fe2d17126fcdf18bc4.1647481208.git.asml.silence@gmail.com [axboe: split from previous patch] Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a90844bf9b9c..f039706bce0e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3242,14 +3242,10 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) { + if (io_resubmit_prep(req)) io_req_task_queue_reissue(req); - } else { - req_set_fail(req); - req->result = ret; - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, false); - } + else + io_req_task_queue_fail(req, ret); } } From 0f84747177b962c32243a57cb454193bdba4fe8d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Mar 2022 02:03:39 +0000 Subject: [PATCH 36/42] io_uring: remove extra barrier for non-sqpoll iopoll smp_mb() in io_cqring_ev_posted_iopoll() is only there because of waitqueue_active(). However, a non-SQPOLL IOPOLL ring doesn't wake the CQ and so the barrier there is useless. Kill it, it's usually pretty expensive.
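For background, waitqueue_active() is a bare lockless emptiness check that relies on its caller to order the "event posted" store against it, while wq_has_sleeper() folds that full barrier in. The waker-side ordering can be sketched in user space with C11 atomics; this is only an analogue with invented names, not the kernel primitives themselves.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool event_posted;	/* stands in for the CQ tail update */
static atomic_int nr_sleepers;		/* stands in for the waitqueue state */

static void post_event_and_wake(void)
{
	atomic_store_explicit(&event_posted, true, memory_order_relaxed);
	/*
	 * Full fence: the store above must be ordered before sampling
	 * nr_sleepers, or a waiter that just went to sleep could miss the
	 * event. wq_has_sleeper() bundles this barrier; waitqueue_active()
	 * leaves it to the caller, which is what the smp_mb() was for.
	 */
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&nr_sleepers, memory_order_relaxed))
		puts("wake sleepers");	/* the wake_up_all() equivalent */
}

int main(void)
{
	post_event_and_wake();	/* no sleepers registered: no wakeup issued */
	return 0;
}

The patch keeps the wakeup, and hence the barrier inside wq_has_sleeper(), only for the SQPOLL case, where a task may actually be sleeping on cq_wait.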
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d72e8ef6f7a3f6a72e18fad8409f7d47afc8da7d.1647481208.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f039706bce0e..692dbe7b98e9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1877,11 +1877,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->cq_wait)) + if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); } io_eventfd_signal(ctx); From 66fc25ca6b7ec4124606e0d59c71c6bcf14e05bb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Mar 2022 02:03:40 +0000 Subject: [PATCH 37/42] io_uring: shuffle io_eventfd_signal() bits around A preparation patch, which moves a fast ->io_ev_fd check out of io_eventfd_signal() into ev_posted*(). Compilers are smart enough for it to not change anything, but will need it later. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ec4091ac76d43912b73917e8db651c2dac4b7b01.1647481208.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 692dbe7b98e9..31c625f61fd8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1828,10 +1828,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; - /* Return quickly if ctx->io_ev_fd doesn't exist */ - if (likely(!rcu_dereference_raw(ctx->io_ev_fd))) - return; - rcu_read_lock(); /* * rcu_dereference ctx->io_ev_fd once and use it for both for checking @@ -1851,7 +1847,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) if (!ev_fd->eventfd_async || io_wq_current_is_worker()) eventfd_signal(ev_fd->cq_ev_fd, 1); - out: rcu_read_unlock(); } @@ -1863,7 +1858,7 @@ out: * 1:1 relationship between how many times this function is called (and * hence the eventfd count) and number of CQEs posted to the CQ ring. */ -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) { /* * wake_up_all() may seem excessive, but io_wake_function() and @@ -1872,7 +1867,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) */ if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); - io_eventfd_signal(ctx); + if (unlikely(rcu_dereference_raw(ctx->io_ev_fd))) + io_eventfd_signal(ctx); } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) @@ -1881,7 +1877,8 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); } - io_eventfd_signal(ctx); + if (unlikely(rcu_dereference_raw(ctx->io_ev_fd))) + io_eventfd_signal(ctx); } /* Returns true if there are no backlogged entries after the flush */ From 9333f6b4628c8037a89ed23e1188d4b7dc5d74e4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Mar 2022 02:03:41 +0000 Subject: [PATCH 38/42] io_uring: thin down io_commit_cqring() io_commit_cqring() is currently always under spinlock section, so it's always better to keep it as slim as possible. Move __io_commit_cqring_flush() out of it into ev_posted*(). 
If fast checks do fail and this post-processing is required, we'll reacquire ->completion_lock, which is fine as we don't care about performance of draining and offset timeouts. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ec4e81fd720d3bc7bca8cb9152e080dad1a052f1.1647481208.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 31c625f61fd8..e69531ccd209 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1779,20 +1779,21 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } +static inline void io_commit_cqring(struct io_ring_ctx *ctx) +{ + /* order cqe stores with ring update */ + smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); +} + static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { + spin_lock(&ctx->completion_lock); if (ctx->off_timeout_used) io_flush_timeouts(ctx); if (ctx->drain_active) io_queue_deferred(ctx); -} - -static inline void io_commit_cqring(struct io_ring_ctx *ctx) -{ - if (unlikely(ctx->off_timeout_used || ctx->drain_active)) - __io_commit_cqring_flush(ctx); - /* order cqe stores with ring update */ - smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); } static inline bool io_sqring_full(struct io_ring_ctx *ctx) @@ -1860,6 +1861,9 @@ out: */ static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) { + if (unlikely(ctx->off_timeout_used || ctx->drain_active)) + __io_commit_cqring_flush(ctx); + /* * wake_up_all() may seem excessive, but io_wake_function() and * io_should_wake() handle the termination of the loop and only @@ -1873,6 +1877,9 @@ static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { + if (unlikely(ctx->off_timeout_used || ctx->drain_active)) + __io_commit_cqring_flush(ctx); + if (ctx->flags & IORING_SETUP_SQPOLL) { if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); From 9aa8dfde4869ccdec0a7290b686dbc10e079e163 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Mar 2022 02:03:42 +0000 Subject: [PATCH 39/42] io_uring: fold evfd signalling under a slower path Add ->has_evfd flag, which is true IFF there is an eventfd attached, and use it to hide io_eventfd_signal() into __io_commit_cqring_flush() and combine fast checks in a single if. Also, gcc 11.2 wasn't inlining io_cqring_ev_posted() without this change, so helps with that as well. 
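The shape of the optimisation: collapse several rarely-true conditions into one unlikely() test on the hot path and push all of the slow work behind a single out-of-line helper. A small user-space sketch with simplified, invented types (not the io_uring code itself):

#include <stdbool.h>
#include <stdio.h>

/* simplified stand-in for io_ring_ctx; illustrative only */
struct ring {
	bool off_timeout_used;
	bool drain_active;
	bool has_evfd;		/* set while an eventfd is registered */
};

/* out-of-line slow path, entered only when one of the rare flags is set */
static void commit_flush_slow(struct ring *r)
{
	if (r->off_timeout_used || r->drain_active)
		puts("flush timeouts / deferred requests");
	if (r->has_evfd)
		puts("signal eventfd");
}

static inline void event_posted(struct ring *r)
{
	/* one branch on the fast path covering all the rare cases */
	if (r->off_timeout_used || r->drain_active || r->has_evfd)
		commit_flush_slow(r);
	puts("wake CQ waiters");
}

int main(void)
{
	struct ring r = { 0 };

	event_posted(&r);	/* fast path: slow helper never entered */
	return 0;
}

A registered eventfd simply sets the flag, so the common no-eventfd, no-drain, no-timeout case stays a single branch.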
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f6168471997decded475a063f92915787975a30b.1647481208.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 60 +++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e69531ccd209..68498cebc0ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -347,6 +347,7 @@ struct io_ring_ctx { unsigned int off_timeout_used: 1; unsigned int drain_active: 1; unsigned int drain_disabled: 1; + unsigned int has_evfd: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -1181,6 +1182,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); +static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; @@ -1785,15 +1787,19 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } -static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { - spin_lock(&ctx->completion_lock); - if (ctx->off_timeout_used) - io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); + if (ctx->off_timeout_used || ctx->drain_active) { + spin_lock(&ctx->completion_lock); + if (ctx->off_timeout_used) + io_flush_timeouts(ctx); + if (ctx->drain_active) + io_queue_deferred(ctx); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + } + if (ctx->has_evfd) + io_eventfd_signal(ctx); } static inline bool io_sqring_full(struct io_ring_ctx *ctx) @@ -1852,6 +1858,17 @@ out: rcu_read_unlock(); } +static inline void io_cqring_wake(struct io_ring_ctx *ctx) +{ + /* + * wake_up_all() may seem excessive, but io_wake_function() and + * io_should_wake() handle the termination of the loop and only + * wake as many waiters as we need to. + */ + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); +} + /* * This should only get called when at least one event has been posted. * Some applications rely on the eventfd notification count only changing @@ -1861,31 +1878,21 @@ out: */ static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active)) + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) __io_commit_cqring_flush(ctx); - /* - * wake_up_all() may seem excessive, but io_wake_function() and - * io_should_wake() handle the termination of the loop and only - * wake as many waiters as we need to. 
- */ - if (wq_has_sleeper(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); - if (unlikely(rcu_dereference_raw(ctx->io_ev_fd))) - io_eventfd_signal(ctx); + io_cqring_wake(ctx); } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active)) + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) __io_commit_cqring_flush(ctx); - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (wq_has_sleeper(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); - } - if (unlikely(rcu_dereference_raw(ctx->io_ev_fd))) - io_eventfd_signal(ctx); + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_wake(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -9898,7 +9905,7 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } ev_fd->eventfd_async = eventfd_async; - + ctx->has_evfd = true; rcu_assign_pointer(ctx->io_ev_fd, ev_fd); return 0; } @@ -9918,6 +9925,7 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) { + ctx->has_evfd = false; rcu_assign_pointer(ctx->io_ev_fd, NULL); call_rcu(&ev_fd->rcu, io_eventfd_put); return 0; From dbc7d452e7cf7d3ebc0064e68d30e28d86d3939a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Mar 2022 17:20:10 -0600 Subject: [PATCH 40/42] io_uring: manage provided buffers strictly ordered Workloads using provided buffers benefit from using and returning buffers in the right order, and so does TLBs for that matter. Manage the internal buffer list in a straight list, rather than use the head buffer as the insertion node. Use a hashed list for the buffer group IDs instead of xarray, the overhead is much lower this way. xarray provides internal locking and other trickery that is handy for some uses cases, but io_uring already locks internally for the buffer manipulation and needs none of that. This is good for about a 2% reduction in overhead, combination of the improved management and the fact that the workload has an easier time bundling back provided buffers. 
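Concretely, the replacement is a fixed-size hash: an array of list heads indexed by a hash of the buffer group ID, with each bucket chaining io_buffer_list entries. The following user-space sketch shows the lookup and insert; the multiplicative hash stands in for hash_32() and the types are simplified for illustration.

#include <stdint.h>
#include <stdio.h>

#define BUCKET_BITS	5
#define NR_BUCKETS	(1u << BUCKET_BITS)

/* simplified stand-ins for the kernel structures; illustrative only */
struct buffer_list {
	struct buffer_list *next;	/* bucket chaining */
	uint32_t bgid;			/* buffer group ID */
	/* the per-group free buffers would hang off here */
};

static struct buffer_list *buckets[NR_BUCKETS];

/* stand-in for hash_32(): any decent multiplicative mix works */
static unsigned int hash_bgid(uint32_t bgid)
{
	return (uint32_t)(bgid * 2654435761u) >> (32 - BUCKET_BITS);
}

static struct buffer_list *get_list(uint32_t bgid)
{
	struct buffer_list *bl;

	for (bl = buckets[hash_bgid(bgid)]; bl; bl = bl->next)
		if (bl->bgid == bgid)
			return bl;
	return NULL;
}

static void add_list(struct buffer_list *bl, uint32_t bgid)
{
	unsigned int b = hash_bgid(bgid);

	bl->bgid = bgid;
	bl->next = buckets[b];
	buckets[b] = bl;
}

int main(void)
{
	struct buffer_list bl;

	add_list(&bl, 7);
	printf("found group %u\n", (unsigned int)get_list(7)->bgid);
	return 0;
}

Lookup cost is the length of one bucket, and there is no internal locking to pay for, since ctx->uring_lock already serialises buffer manipulation.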
Signed-off-by: Jens Axboe --- fs/io_uring.c | 154 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 92 insertions(+), 62 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 68498cebc0ef..690bfeaa609a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -264,6 +264,12 @@ struct io_rsrc_data { bool quiesce; }; +struct io_buffer_list { + struct list_head list; + struct list_head buf_list; + __u16 bgid; +}; + struct io_buffer { struct list_head list; __u64 addr; @@ -334,6 +340,8 @@ struct io_ev_fd { struct rcu_head rcu; }; +#define IO_BUFFERS_HASH_BITS 5 + struct io_ring_ctx { /* const or read-mostly hot data */ struct { @@ -386,7 +394,7 @@ struct io_ring_ctx { struct list_head timeout_list; struct list_head ltimeout_list; struct list_head cq_overflow_list; - struct xarray io_buffers; + struct list_head *io_buffers; struct list_head io_buffers_cache; struct list_head apoll_cache; struct xarray personalities; @@ -1361,10 +1369,25 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, return cflags; } +static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) +{ + struct list_head *hash_list; + struct io_buffer_list *bl; + + hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; + list_for_each_entry(bl, hash_list, list) + if (bl->bgid == bgid || bgid == -1U) + return bl; + + return NULL; +} + static void io_kbuf_recycle(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_buffer *head, *buf; + struct io_buffer_list *bl; + struct io_buffer *buf; if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return; @@ -1372,21 +1395,8 @@ static void io_kbuf_recycle(struct io_kiocb *req) lockdep_assert_held(&ctx->uring_lock); buf = req->kbuf; - - head = xa_load(&ctx->io_buffers, buf->bgid); - if (head) { - list_add(&buf->list, &head->list); - } else { - int ret; - - INIT_LIST_HEAD(&buf->list); - - /* if we fail, just leave buffer attached */ - ret = xa_insert(&ctx->io_buffers, buf->bgid, buf, GFP_KERNEL); - if (unlikely(ret < 0)) - return; - } - + bl = io_buffer_get_list(ctx, buf->bgid); + list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; req->kbuf = NULL; } @@ -1501,7 +1511,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int hash_bits; + int i, hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -1528,6 +1538,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* set invalid range, so io_import_fixed() fails meeting it */ ctx->dummy_ubuf->ubuf = -1UL; + ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, + sizeof(struct list_head), GFP_KERNEL); + if (!ctx->io_buffers) + goto err; + for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) + INIT_LIST_HEAD(&ctx->io_buffers[i]); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1539,7 +1556,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->io_buffers_cache); INIT_LIST_HEAD(&ctx->apoll_cache); init_completion(&ctx->ref_comp); - xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); @@ -1568,6 +1584,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); + 
kfree(ctx->io_buffers); kfree(ctx); return NULL; } @@ -3351,30 +3368,36 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) mutex_lock(&ctx->uring_lock); } +static void io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) +{ + struct list_head *list; + + list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; + INIT_LIST_HEAD(&bl->buf_list); + bl->bgid = bgid; + list_add(&bl->list, list); +} + static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, int bgid, unsigned int issue_flags) { struct io_buffer *kbuf = req->kbuf; - struct io_buffer *head; bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; if (req->flags & REQ_F_BUFFER_SELECTED) return kbuf; - io_ring_submit_lock(req->ctx, needs_lock); + io_ring_submit_lock(ctx, needs_lock); - lockdep_assert_held(&req->ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); - head = xa_load(&req->ctx->io_buffers, bgid); - if (head) { - if (!list_empty(&head->list)) { - kbuf = list_last_entry(&head->list, struct io_buffer, - list); - list_del(&kbuf->list); - } else { - kbuf = head; - xa_erase(&req->ctx->io_buffers, bgid); - } + bl = io_buffer_get_list(ctx, bgid); + if (bl && !list_empty(&bl->buf_list)) { + kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&kbuf->list); if (*len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; @@ -4669,8 +4692,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req, return 0; } -static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, - int bgid, unsigned nbufs) +static int __io_remove_buffers(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned nbufs) { unsigned i = 0; @@ -4679,17 +4702,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, return 0; /* the head kbuf is the list itself */ - while (!list_empty(&buf->list)) { + while (!list_empty(&bl->buf_list)) { struct io_buffer *nxt; - nxt = list_first_entry(&buf->list, struct io_buffer, list); + nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); if (++i == nbufs) return i; cond_resched(); } i++; - xa_erase(&ctx->io_buffers, bgid); return i; } @@ -4698,7 +4720,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; - struct io_buffer *head; + struct io_buffer_list *bl; int ret = 0; bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; @@ -4707,9 +4729,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); ret = -ENOENT; - head = xa_load(&ctx->io_buffers, p->bgid); - if (head) - ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); + bl = io_buffer_get_list(ctx, p->bgid); + if (bl) + ret = __io_remove_buffers(ctx, bl, p->nbufs); if (ret < 0) req_set_fail(req); @@ -4798,7 +4820,7 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx) } static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, - struct io_buffer **head) + struct io_buffer_list *bl) { struct io_buffer *buf; u64 addr = pbuf->addr; @@ -4810,30 +4832,24 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, break; buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, list); - list_del(&buf->list); + list_move_tail(&buf->list, &bl->buf_list); buf->addr = addr; buf->len = 
min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; buf->bgid = pbuf->bgid; addr += pbuf->len; bid++; - if (!*head) { - INIT_LIST_HEAD(&buf->list); - *head = buf; - } else { - list_add_tail(&buf->list, &(*head)->list); - } cond_resched(); } - return i ? i : -ENOMEM; + return i ? 0 : -ENOMEM; } static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; - struct io_buffer *head, *list; + struct io_buffer_list *bl; int ret = 0; bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; @@ -4841,14 +4857,18 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); - list = head = xa_load(&ctx->io_buffers, p->bgid); - - ret = io_add_buffers(ctx, p, &head); - if (ret >= 0 && !list) { - ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); - if (ret < 0) - __io_remove_buffers(ctx, head, p->bgid, -1U); + bl = io_buffer_get_list(ctx, p->bgid); + if (unlikely(!bl)) { + bl = kmalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) { + ret = -ENOMEM; + goto err; + } + io_buffer_add_list(ctx, bl, p->bgid); } + + ret = io_add_buffers(ctx, p, bl); +err: if (ret < 0) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ @@ -9936,11 +9956,20 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_destroy_buffers(struct io_ring_ctx *ctx) { - struct io_buffer *buf; - unsigned long index; + int i; - xa_for_each(&ctx->io_buffers, index, buf) - __io_remove_buffers(ctx, buf, index, -1U); + for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { + struct list_head *list = &ctx->io_buffers[i]; + + while (!list_empty(list)) { + struct io_buffer_list *bl; + + bl = list_first_entry(list, struct io_buffer_list, list); + __io_remove_buffers(ctx, bl, -1U); + list_del(&bl->list); + kfree(bl); + } + } while (!list_empty(&ctx->io_buffers_pages)) { struct page *page; @@ -10049,6 +10078,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_free_napi_list(ctx); kfree(ctx->cancel_hash); kfree(ctx->dummy_ubuf); + kfree(ctx->io_buffers); kfree(ctx); } From adf3a9e9f556613197583a1884f0de40a8bb6fb9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Mar 2022 17:26:19 -0600 Subject: [PATCH 41/42] io_uring: don't check unrelated req->open.how in accept request This looks like a victim of too much copy/paste: we should not be looking at req->open.how in accept. The point is to check for CLOEXEC and error out, since direct descriptors are not invalidated on exec and hence any attempt to get a direct descriptor with CLOEXEC is invalid. No harm is done here, as req->open.how.flags overlaps with req->accept.flags, but it's very confusing and might change if either of those command structs is modified.
Fixes: aaa4db12ef7b ("io_uring: accept directly into fixed file table") Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 690bfeaa609a..2d6ab732f940 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5535,8 +5535,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->nofile = rlimit(RLIMIT_NOFILE); accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || - (accept->flags & SOCK_CLOEXEC))) + if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) return -EINVAL; if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; From 5e929367468c8f97cd1ffb0417316cecfebef94b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 18 Mar 2022 11:28:13 -0600 Subject: [PATCH 42/42] io_uring: terminate manual loop iterator loop correctly for non-vecs The fix for not advancing the iterator if we're using fixed buffers is broken in that it can hit a condition where we don't terminate the loop. This results in io-wq looping forever, asking to read (or write) 0 bytes for every subsequent loop. Reported-by: Joel Jaeschke Link: https://github.com/axboe/liburing/issues/549 Fixes: 16c8d2df7ec0 ("io_uring: ensure symmetry in handling iter types in loop_rw_iter()") Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d6ab732f940..5fa736344b67 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3612,13 +3612,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) ret = nr; break; } + ret += nr; if (!iov_iter_is_bvec(iter)) { iov_iter_advance(iter, nr); } else { - req->rw.len -= nr; req->rw.addr += nr; + req->rw.len -= nr; + if (!req->rw.len) + break; } - ret += nr; if (nr != iovec.iov_len) break; }
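The fix is easiest to see in an ordinary read loop: when the iterator is advanced by hand, the remaining length must shrink by the bytes transferred and the loop has to stop once it reaches zero, otherwise a short transfer degenerates into endless zero-byte I/O. A user-space sketch of that corrected shape, using plain read(2) rather than the kernel iterator code:

#include <stdio.h>
#include <unistd.h>

/*
 * Advance the position by hand, roughly the way loop_rw_iter() handles
 * fixed buffers after the fix: shrink the remaining length by the bytes
 * transferred and let len reaching zero terminate the loop, so a short
 * transfer can never turn into endless zero-byte reads.
 */
static ssize_t read_all(int fd, void *buf, size_t len)
{
	char *addr = buf;
	ssize_t total = 0;

	while (len) {
		ssize_t nr = read(fd, addr, len);

		if (nr < 0)
			return total ? total : nr;
		if (nr == 0)		/* EOF */
			break;
		total += nr;
		addr += nr;
		len -= nr;		/* len == 0 ends the loop */
	}
	return total;
}

int main(void)
{
	char buf[64];
	ssize_t n = read_all(0, buf, sizeof(buf) - 1);

	if (n >= 0)
		printf("read %zd bytes\n", n);
	return 0;
}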