From 68ca8fc00277ad04c975c382bd6e2d500e5c7185 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Mar 2022 22:02:19 +0000 Subject: [PATCH 001/111] io_uring: small optimisation of tctx_task_work There should be no completions stashed when we first get into tctx_task_work(), so move completion flushing checks a bit later after we had a chance to execute some task works. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c6765c804f3c438591b9825ab9c43d22039073c4.1647897811.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7625b29153b9..7fefdb9c4ae3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2475,10 +2475,6 @@ static void tctx_task_work(struct callback_head *cb) while (1) { struct io_wq_work_node *node1, *node2; - if (!tctx->task_list.first && - !tctx->prior_task_list.first && uring_locked) - io_submit_flush_completions(ctx); - spin_lock_irq(&tctx->task_lock); node1 = tctx->prior_task_list.first; node2 = tctx->task_list.first; @@ -2492,10 +2488,13 @@ static void tctx_task_work(struct callback_head *cb) if (node1) handle_prev_tw_list(node1, &ctx, &uring_locked); - if (node2) handle_tw_list(node2, &ctx, &uring_locked); cond_resched(); + + if (!tctx->task_list.first && + !tctx->prior_task_list.first && uring_locked) + io_submit_flush_completions(ctx); } ctx_flush_and_put(ctx, &uring_locked); From 60053be859b33f7a381a3f1755db5caffaa3cab8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Mar 2022 22:02:20 +0000 Subject: [PATCH 002/111] io_uring: remove extra ifs around io_commit_cqring Now io_commit_cqring() is simple and it tolerates well being called without a new CQE filled, so kill a bunch of not needed anymore guards. 
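The idea translates to a small user-space sketch (invented names, not the kernel code): committing just publishes the cached tail, and publishing an unchanged value is a harmless store, so the callers' "if (posted)" guards buy nothing.

#include <stdbool.h>
#include <stdio.h>

struct ring {
	unsigned int cached_tail;	/* CQEs queued but not yet visible */
	unsigned int tail;		/* tail the consumer reads */
};

static void commit_cqring(struct ring *r)
{
	/* storing the same value again when nothing was posted is harmless */
	r->tail = r->cached_tail;
}

static bool post_cqe(struct ring *r, bool have_cqe)
{
	if (have_cqe)
		r->cached_tail++;
	return have_cqe;
}

int main(void)
{
	struct ring r = { 0, 0 };
	bool posted = post_cqe(&r, false);

	commit_cqring(&r);	/* called unconditionally, no guard */
	printf("posted=%d tail=%u\n", posted, r.tail);
	return 0;
}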
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/36aed692dff402bba00a444a63a9cd2e97a340ea.1647897811.git.asml.silence@gmail.com [axboe: fold in followup fix] Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7fefdb9c4ae3..f61490aee3d3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1920,8 +1920,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); } - if (posted) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -2361,8 +2360,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req) spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); - if (posted) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -10128,8 +10126,7 @@ static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, } } spin_unlock_irq(&ctx->timeout_lock); - if (canceled != 0) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (canceled != 0) io_cqring_ev_posted(ctx); @@ -10149,11 +10146,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx, NULL, true); - io_poll_remove_all(ctx, NULL, true); - - /* if we failed setting up the ctx, we might not have any rings */ - io_iopoll_try_reap_events(ctx); + /* failed during ring init, it couldn't have issued any requests */ + if (ctx->rings) { + io_kill_timeouts(ctx, NULL, true); + io_poll_remove_all(ctx, NULL, true); + /* if we failed setting up the ctx, we might not have any rings */ + io_iopoll_try_reap_events(ctx); + } INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* @@ -10245,6 +10244,10 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; + /* failed during ring init, it couldn't have issued any requests */ + if (!ctx->rings) + return; + while (1) { enum io_wq_cancel cret; bool ret = false; From 7819a1f6ac0393f1f7861dfece6ffd5ef010f0f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Mar 2022 22:02:21 +0000 Subject: [PATCH 003/111] io_uring: refactor io_req_find_next Move the fast path from io_req_find_next() into callers. It prepares us for further changes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/10bd0e564472dde0c7f8d90ae317d05356cd565a.1647897811.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f61490aee3d3..1859fff53ddf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2370,8 +2370,6 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) - return NULL; /* * If LINK is set, we have dependent requests in this chain. 
If we * didn't fail this request, queue the first one up, moving any other @@ -2598,10 +2596,12 @@ static void io_req_task_queue_reissue(struct io_kiocb *req) static inline void io_queue_next(struct io_kiocb *req) { - struct io_kiocb *nxt = io_req_find_next(req); + if (unlikely(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))) { + struct io_kiocb *nxt = io_req_find_next(req); - if (nxt) - io_req_task_queue(nxt); + if (nxt) + io_req_task_queue(nxt); + } } static void io_free_req(struct io_kiocb *req) @@ -2695,7 +2695,8 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) struct io_kiocb *nxt = NULL; if (req_ref_put_and_test(req)) { - nxt = io_req_find_next(req); + if (unlikely(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))) + nxt = io_req_find_next(req); __io_free_req(req); } return nxt; From a538be5be328229d4da3343d4d6514bb4d5c3d5d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Mar 2022 22:02:22 +0000 Subject: [PATCH 004/111] io_uring: optimise io_free_batch_list We do several req->flags checks in the fast path of io_free_batch_list(). One explicit check of REQ_F_REFCOUNT, and two other hidden in io_queue_next() and io_dismantle_req(). Moreover, there is a io_req_put_rsrc_locked() call in between, so there is no hope req->flags will be preserved in registers. All those flags if not a slow path than definitely a slower path, so put them all under a single flags mask check and save several mem reloads and ifs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0fb493f73f2009aea395c570c2932fecaa4e1244.1647897811.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1859fff53ddf..fddfbf0f9310 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -113,6 +113,9 @@ #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ + IO_REQ_CLEAN_FLAGS) + #define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_uring { @@ -2626,15 +2629,20 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (unlikely(req->flags & REQ_F_REFCOUNT)) { - node = req->comp_list.next; - if (!req_ref_put_and_test(req)) - continue; + if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REFCOUNT) { + node = req->comp_list.next; + if (!req_ref_put_and_test(req)) + continue; + } + io_queue_next(req); + if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) + io_clean_op(req); } + if (!(req->flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); io_req_put_rsrc_locked(req, ctx); - io_queue_next(req); - io_dismantle_req(req); if (req->task != task) { if (task) From b605a7fabb607adbe4ea7cc97b69ae6e0555e7b2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Mar 2022 22:02:23 +0000 Subject: [PATCH 005/111] io_uring: move poll recycling later in compl flushing There is a new (req->flags & REQ_F_POLLED) check in __io_submit_flush_completions() for poll recycling, however io_free_batch_list() is a much better place for it. First, we prefer it after putting the last req ref just to avoid potential problems in the future. Also, it'll enable the recycling for IOPOLL and also will place it closer to all other req->flags bits clean up requests. 
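The recycling itself is a plain free-list; a user-space sketch with invented names (context, apoll, apoll_cache) shows the pattern: entries released in the batched free path go onto a per-context cache and are handed back on the next arming instead of round-tripping through the allocator.

#include <stdio.h>
#include <stdlib.h>

struct apoll {
	struct apoll *next;
};

struct context {
	struct apoll *apoll_cache;	/* free-list of recycled poll entries */
};

static void recycle_apoll(struct context *ctx, struct apoll *apoll)
{
	apoll->next = ctx->apoll_cache;
	ctx->apoll_cache = apoll;
}

static struct apoll *get_apoll(struct context *ctx)
{
	struct apoll *apoll = ctx->apoll_cache;

	if (apoll)
		ctx->apoll_cache = apoll->next;
	else
		apoll = malloc(sizeof(*apoll));
	return apoll;
}

int main(void)
{
	struct context ctx = { NULL };
	struct apoll *a = get_apoll(&ctx);

	recycle_apoll(&ctx, a);			/* batch free path caches it */
	printf("reused: %d\n", get_apoll(&ctx) == a);
	free(a);
	return 0;
}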
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/31dfe1dafda66ba3ce36b301884ec7e162c777d1.1647897811.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fddfbf0f9310..c6746b416816 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2635,6 +2635,15 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (!req_ref_put_and_test(req)) continue; } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + struct async_poll *apoll = req->apoll; + + if (apoll->double_poll) + kfree(apoll->double_poll); + list_add(&apoll->poll.wait.entry, + &ctx->apoll_cache); + req->flags &= ~REQ_F_POLLED; + } io_queue_next(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); @@ -2673,15 +2682,6 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(req, req->result, req->cflags); - if ((req->flags & REQ_F_POLLED) && req->apoll) { - struct async_poll *apoll = req->apoll; - - if (apoll->double_poll) - kfree(apoll->double_poll); - list_add(&apoll->poll.wait.entry, - &ctx->apoll_cache); - req->flags &= ~REQ_F_POLLED; - } } io_commit_cqring(ctx); From 57859f4d93db3eecd5dd680d3b3520eea80c996b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Mar 2022 22:02:24 +0000 Subject: [PATCH 006/111] io_uring: clean up io_queue_next() Move fast check out of io_queue_next(), it makes req->flags checks in __io_submit_flush_completions() a bit clearer and grants us better comtrol, e.g. can remove now not justified unlikely() in __io_submit_flush_completions(). Also, we don't care about having this check in io_free_req() as the function is a slow path and io_req_find_next() handles it correctly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1f9e1cc80adbb11b37017d511df4a2c6141a3f08.1647897811.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c6746b416816..ce726b328b53 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2597,14 +2597,12 @@ static void io_req_task_queue_reissue(struct io_kiocb *req) io_req_task_work_add(req, false); } -static inline void io_queue_next(struct io_kiocb *req) +static void io_queue_next(struct io_kiocb *req) { - if (unlikely(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))) { - struct io_kiocb *nxt = io_req_find_next(req); + struct io_kiocb *nxt = io_req_find_next(req); - if (nxt) - io_req_task_queue(nxt); - } + if (nxt) + io_req_task_queue(nxt); } static void io_free_req(struct io_kiocb *req) @@ -2644,7 +2642,8 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, &ctx->apoll_cache); req->flags &= ~REQ_F_POLLED; } - io_queue_next(req); + if (req->flags & (REQ_F_LINK|REQ_F_HARDLINK)) + io_queue_next(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); } From f81440d33cc61bd80f37e1045f0f5a21d043eed3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 22 Mar 2022 14:07:56 +0000 Subject: [PATCH 007/111] io_uring: split off IOPOLL argument verifiction IOPOLL doesn't use additional arguments like sigsets, but it still needs some basic verification, which is currently done by io_get_ext_arg(). This patch adds a separate function for the IOPOLL path, which is a bit simpler and doesn't do extra. This prepares us for further patches, which would have hurt inlining in the hot path otherwise. 
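For reference, the extended argument the new helper validates is the one user space passes with IORING_ENTER_EXT_ARG; a minimal raw-syscall sketch (assuming 5.11+ uapi headers, no liburing, and a ring fd obtained elsewhere via io_uring_setup()) looks like this, with the IOPOLL path only checking that argsz matches the struct size:

#include <linux/io_uring.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int enter_getevents(int ring_fd, unsigned int min_complete)
{
	struct io_uring_getevents_arg arg;

	memset(&arg, 0, sizeof(arg));	/* no sigmask, no timeout */
	return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}

int main(void)
{
	/* a real ring_fd comes from io_uring_setup(); -1 just demonstrates -EBADF */
	printf("ret=%d\n", enter_getevents(-1, 1));
	return 0;
}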
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/71b23fca412e3374b74be7711cfd42a3d9d5dfe0.1647957378.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ce726b328b53..9a243a1eb8c1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10701,6 +10701,19 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) return 0; } +static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) +{ + if (flags & IORING_ENTER_EXT_ARG) { + struct io_uring_getevents_arg arg; + + if (argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + } + return 0; +} + static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, struct __kernel_timespec __user **ts, const sigset_t __user **sig) @@ -10814,13 +10827,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; - - ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); - if (unlikely(ret)) - goto out; - min_complete = min(min_complete, ctx->cq_entries); /* @@ -10831,8 +10837,17 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) { + ret = io_validate_ext_arg(flags, argp, argsz); + if (unlikely(ret)) + goto out; ret = io_iopoll_check(ctx, min_complete); } else { + const sigset_t __user *sig; + struct __kernel_timespec __user *ts; + + ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + if (unlikely(ret)) + goto out; ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); } } From 773697b610bff9451cf67d4f57b08d68c4b1a832 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 22 Mar 2022 14:07:57 +0000 Subject: [PATCH 008/111] io_uring: pre-calculate syscall iopolling decision Syscall should only iopoll for events when it's a IOPOLL ring and is not SQPOLL. Instead of check both flags every time we can save it in ring flags so it's easier to use. We don't care much about an extra if there, however it will be inconvenient to copy-paste this chunk with checks in future patches. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7fd2f8fc2606305aa06dd8c0ff8f76a66b39c383.1647957378.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9a243a1eb8c1..5fc97791c51b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -356,6 +356,7 @@ struct io_ring_ctx { unsigned int drain_active: 1; unsigned int drain_disabled: 1; unsigned int has_evfd: 1; + unsigned int syscall_iopoll: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -10829,14 +10830,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (flags & IORING_ENTER_GETEVENTS) { min_complete = min(min_complete, ctx->cq_entries); - /* - * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user - * space applications don't need to do io completion events - * polling again, they can rely on io_sq_thread to do polling - * work, which can reduce cpu usage and uring_lock contention. 
- */ - if (ctx->flags & IORING_SETUP_IOPOLL && - !(ctx->flags & IORING_SETUP_SQPOLL)) { + if (ctx->syscall_iopoll) { ret = io_validate_ext_arg(flags, argp, argsz); if (unlikely(ret)) goto out; @@ -11174,6 +11168,17 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; + + /* + * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user + * space applications don't need to do io completion events + * polling again, they can rely on io_sq_thread to do polling + * work, which can reduce cpu usage and uring_lock contention. + */ + if (ctx->flags & IORING_SETUP_IOPOLL && + !(ctx->flags & IORING_SETUP_SQPOLL)) + ctx->syscall_iopoll = 1; + ctx->compat = in_compat_syscall(); if (!capable(CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); From d487b43cd3276e40723641510cbd2d33da4f6800 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 22 Mar 2022 14:07:58 +0000 Subject: [PATCH 009/111] io_uring: optimise mutex locking for submit+iopoll Both submittion and iopolling requires holding uring_lock. IOPOLL can users do them together in a single syscall, however it would still do 2 pairs of lock/unlock. Optimise this case combining locking into one lock/unlock pair, which especially nice for low QD. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/034b6c41658648ad3ad3c9485ac8eb546f010bc4.1647957378.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5fc97791c51b..8d3b98b0c713 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2853,12 +2853,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) unsigned int nr_events = 0; int ret = 0; - /* - * We disallow the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that @@ -2867,7 +2861,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) if (test_bit(0, &ctx->check_cq_overflow)) __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) - goto out; + return 0; do { /* * If a submit got punted to a workqueue, we can have the @@ -2897,8 +2891,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) nr_events += ret; ret = 0; } while (nr_events < min && !need_resched()); -out: - mutex_unlock(&ctx->uring_lock); + return ret; } @@ -10820,21 +10813,33 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; + mutex_lock(&ctx->uring_lock); submitted = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - - if (submitted != to_submit) + if (submitted != to_submit) { + mutex_unlock(&ctx->uring_lock); goto out; + } + if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) + goto iopoll_locked; + mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { - min_complete = min(min_complete, ctx->cq_entries); - if (ctx->syscall_iopoll) { + /* + * We disallow the app entering submit/complete with + * polling, but we still need to lock the ring to + * prevent racing with polled issue that got punted to + * a workqueue. 
+ */ + mutex_lock(&ctx->uring_lock); +iopoll_locked: ret = io_validate_ext_arg(flags, argp, argsz); - if (unlikely(ret)) - goto out; - ret = io_iopoll_check(ctx, min_complete); + if (likely(!ret)) { + min_complete = min(min_complete, ctx->cq_entries); + ret = io_iopoll_check(ctx, min_complete); + } + mutex_unlock(&ctx->uring_lock); } else { const sigset_t __user *sig; struct __kernel_timespec __user *ts; @@ -10842,6 +10847,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); if (unlikely(ret)) goto out; + min_complete = min(min_complete, ctx->cq_entries); ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); } } From f8929630514505ef8c6cf70d8b7a2bbf7e43b225 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 25 Mar 2022 11:52:14 +0000 Subject: [PATCH 010/111] io_uring: cleanup conditional submit locking Refactor io_ring_submit_[un]lock(), make it accept issue_flags and remove manual IO_URING_F_UNLOCKED checks. It also allows us to place lockdep annotations inside instead of sprinkling them in a bunch of places. There is only one user that doesn't fit now, so hand code locking in __io_rsrc_put_work(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e55c2c06767676a801252e8094c9ab09912487a4.1648209006.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 95 ++++++++++++++++++++++----------------------------- 1 file changed, 41 insertions(+), 54 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8d3b98b0c713..1ae3ba02a129 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1223,6 +1223,26 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ + lockdep_assert_held(&ctx->uring_lock); + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. 
+ */ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); +} + static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -1420,10 +1440,7 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) if (req->flags & REQ_F_PARTIAL_IO) return; - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); - - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); @@ -1431,8 +1448,7 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) req->flags &= ~REQ_F_BUFFER_SELECTED; req->kbuf = NULL; - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, issue_flags); } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -3309,24 +3325,6 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, imu); } -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) -{ - if (needs_lock) - mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) -{ - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. - */ - if (needs_lock) - mutex_lock(&ctx->uring_lock); -} - static void io_buffer_add_list(struct io_ring_ctx *ctx, struct io_buffer_list *bl, unsigned int bgid) { @@ -3342,16 +3340,13 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, int bgid, unsigned int issue_flags) { struct io_buffer *kbuf = req->kbuf; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; if (req->flags & REQ_F_BUFFER_SELECTED) return kbuf; - io_ring_submit_lock(ctx, needs_lock); - - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(req->ctx, issue_flags); bl = io_buffer_get_list(ctx, bgid); if (bl && !list_empty(&bl->buf_list)) { @@ -3365,7 +3360,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, kbuf = ERR_PTR(-ENOBUFS); } - io_ring_submit_unlock(req->ctx, needs_lock); + io_ring_submit_unlock(req->ctx, issue_flags); return kbuf; } @@ -4734,11 +4729,8 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, needs_lock); - - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; bl = io_buffer_get_list(ctx, p->bgid); @@ -4749,7 +4741,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return 0; } @@ -4863,11 +4855,8 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, needs_lock); - - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(ctx, 
issue_flags); bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { @@ -4885,7 +4874,7 @@ err: req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return 0; } @@ -6787,7 +6776,6 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; u64 sqe_addr = req->cancel.addr; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_tctx_node *node; int ret; @@ -6796,7 +6784,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) goto done; /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; @@ -6805,7 +6793,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) if (ret != -ENOENT) break; } - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); done: if (ret < 0) req_set_fail(req); @@ -6832,7 +6820,6 @@ static int io_rsrc_update_prep(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_uring_rsrc_update2 up; int ret; @@ -6843,10 +6830,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.resv = 0; up.resv2 = 0; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); @@ -8772,15 +8759,17 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) list_del(&prsrc->list); if (prsrc->tag) { - bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); - io_ring_submit_lock(ctx, lock_ring); spin_lock(&ctx->completion_lock); io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_ring_submit_unlock(ctx, lock_ring); + + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); } rsrc_data->do_put(ctx, prsrc); @@ -8956,12 +8945,11 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index) { struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; bool needs_switch = false; struct io_fixed_file *file_slot; int ret = -EBADF; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); if (file->f_op == &io_uring_fops) goto err; ret = -ENXIO; @@ -9002,7 +8990,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); if (ret) fput(file); return ret; @@ -9012,12 +9000,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { unsigned int offset = req->close.file_slot - 1; struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_fixed_file *file_slot; struct file *file; int ret; - io_ring_submit_lock(ctx, needs_lock); + 
io_ring_submit_lock(ctx, issue_flags); ret = -ENXIO; if (unlikely(!ctx->file_data)) goto out; @@ -9043,7 +9030,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) io_rsrc_node_switch(ctx, ctx->file_data); ret = 0; out: - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return ret; } From 9d170164dbac3f674b5ec2189e2a279eb0b7625f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 25 Mar 2022 11:52:15 +0000 Subject: [PATCH 011/111] io_uring: partially uninline io_put_task() In most cases io_put_task() is called from the submitter task and go through a higly optimised fast path, which has to be inlined. The other branch though is bulkier and we don't care about it as much because it implies atomics and other heavy calls. Extract it into a helper, which is expected not to be inlined. [before] size ./fs/io_uring.o text data bss dec hex filename 89328 13646 8 102982 19246 ./fs/io_uring.o [after] size ./fs/io_uring.o text data bss dec hex filename 89096 13646 8 102750 1915e ./fs/io_uring.o Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dec213db0e0b8605132da81e0a0be687a4d140cb.1648209006.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1ae3ba02a129..4c4b81d691a9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1963,19 +1963,23 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) return ret; } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) +static void __io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; - if (likely(task == current)) { - tctx->cached_refs += nr; - } else { - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); - } + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); +} + +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + task->io_uring->cached_refs += nr; + else + __io_put_task(task, nr); } static void io_task_refs_refill(struct io_uring_task *tctx) From 963c6abbb4e4ee3d00ca7992f751c49b38b07e68 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 25 Mar 2022 11:52:16 +0000 Subject: [PATCH 012/111] io_uring: silence io_for_each_link() warning Some tooling keep complaining about self assignment in io_for_each_link(), the code is correct but still let's workaround it. 
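The warning is easy to reproduce in plain C; a sketch with invented names (for_each_link, kiocb) shows how iterating with the head variable itself expands to a "req = req" self-assignment on entry, and how a dedicated cursor avoids it without changing behaviour.

#include <stdio.h>

struct kiocb {
	struct kiocb *link;
};

#define for_each_link(pos, head) \
	for ((pos) = (head); (pos); (pos) = (pos)->link)

static int get_sequence(struct kiocb *req, int cached_head)
{
	struct kiocb *cur;
	int seq = cached_head;

	/* previously: for_each_link(req, req), i.e. "req = req" on entry */
	for_each_link(cur, req)
		seq--;
	return seq;
}

int main(void)
{
	struct kiocb b = { NULL }, a = { &b };

	printf("seq=%d\n", get_sequence(&a, 10));	/* 10 - 2 linked reqs = 8 */
	return 0;
}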
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f0de77b0b0f8309554ba6fba34327b7813bcc3ff.1648209006.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c4b81d691a9..ef95829c9ae1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6959,9 +6959,10 @@ static int io_req_prep_async(struct io_kiocb *req) static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; + struct io_kiocb *cur; /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(req, req) + io_for_each_link(cur, req) seq--; return seq; } From 775a1f2f99483bda4e43376ebbc6b459c8fbf20a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 25 Mar 2022 11:52:17 +0000 Subject: [PATCH 013/111] io_uring: refactor io_req_add_compl_list() A small refactoring for io_req_add_compl_list() deduplicating some code. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f0a5272b45efe4ffc41cb79b99784e39c699aade.1648209006.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ef95829c9ae1..007dc1c14af4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1692,11 +1692,10 @@ static void io_prep_async_link(struct io_kiocb *req) static inline void io_req_add_compl_list(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; + struct io_submit_state *state = &req->ctx->submit_state; if (!(req->flags & REQ_F_CQE_SKIP)) - ctx->submit_state.flush_cqes = true; + state->flush_cqes = true; wq_list_add_tail(&req->comp_list, &state->compl_reqs); } From b4f20bb4e6d55a971d5f5555a971978a6263d8de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 25 Mar 2022 16:39:57 -0600 Subject: [PATCH 014/111] io_uring: move finish_wait() outside of loop in cqring_wait() We don't need to call this for every loop. This is particularly troublesome if we are task_work intensive, and get woken more often than we desire due to that. Just do it at the end, that's always safe as we initialize the waitqueue list head anyway. This can save a considerable amount of hammering on the waitqueue lock, which is also hot from the request completion side. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 007dc1c14af4..ed3f26259122 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8174,10 +8174,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, timeout); - finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); + finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; From 1f59bc0f18cf46abe27ea18cfa4cb7f1b4166896 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 6 Apr 2022 21:33:56 +0100 Subject: [PATCH 015/111] io_uring: don't scm-account for non af_unix sockets io_uring deals with file reference loops by registering all fixed files in the SCM/GC infrastrucure. However, only a small subset of all file types can keep long-term references to other files and those that don't are not interesting for the garbage collector as they can't be in a reference loop. 
They neither can be directly recycled by GC nor affect loop searching. Let's skip io_uring SCM accounting for loop-less files, i.e. all but af_unix sockets, quite imroving fixed file updates performance and greatly helpnig with memory footprint. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9c44ecf6e89d69130a8c4360cce2183ffc5ddd6f.1649277098.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ed3f26259122..2b2203d1d1fb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1223,6 +1223,18 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +#if defined(CONFIG_UNIX) +static inline bool io_file_need_scm(struct file *filp) +{ + return !!unix_get_socket(filp); +} +#else +static inline bool io_file_need_scm(struct file *filp) +{ + return 0; +} +#endif + static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) { lockdep_assert_held(&ctx->uring_lock); @@ -8426,6 +8438,17 @@ static void io_free_file_tables(struct io_file_table *table) static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { + int i; + + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file = io_file_from_index(ctx, i); + + if (!file || io_file_need_scm(file)) + continue; + io_fixed_file_slot(&ctx->file_table, i)->file_ptr = 0; + fput(file); + } + #if defined(CONFIG_UNIX) if (ctx->ring_sock) { struct sock *sock = ctx->ring_sock->sk; @@ -8434,16 +8457,6 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) kfree_skb(skb); } -#else - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } #endif io_free_file_tables(&ctx->file_table); io_rsrc_data_free(ctx->file_data); @@ -8592,7 +8605,9 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. + * loops in the file referencing. We account only files that can hold other + * files because otherwise they can't form a loop and so are not interesting + * for GC. 
*/ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) { @@ -8618,8 +8633,9 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) for (i = 0; i < nr; i++) { struct file *file = io_file_from_index(ctx, i + offset); - if (!file) + if (!file || !io_file_need_scm(file)) continue; + fpl->fp[nr_files] = get_file(file); unix_inflight(fpl->user, fpl->fp[nr_files]); nr_files++; @@ -8636,7 +8652,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) for (i = 0; i < nr; i++) { struct file *file = io_file_from_index(ctx, i + offset); - if (file) + if (file && io_file_need_scm(file)) fput(file); } } else { @@ -8678,6 +8694,7 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) if (file) fput(file); + io_fixed_file_slot(&ctx->file_table, total)->file_ptr = 0; total++; } @@ -8699,6 +8716,11 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) struct sk_buff *skb; int i; + if (!io_file_need_scm(file)) { + fput(file); + return; + } + __skb_queue_head_init(&list); /* @@ -8893,6 +8915,9 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, struct sk_buff_head *head = &sock->sk_receive_queue; struct sk_buff *skb; + if (!io_file_need_scm(file)) + return 0; + /* * See if we can merge this file into an existing skb SCM_RIGHTS * file set. If there's no room, fall back to allocating a new skb From a03a2a209e826f646dff49202b8b1eb8427c2d08 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Apr 2022 13:40:01 +0100 Subject: [PATCH 016/111] io_uring: uniform SCM accounting Channel all SCM accounting through io_sqe_file_register(), so we do it uniformely for updates and initial registration and can kill duplicated code. Registration might be slightly slower in some case, but first we skip most of SCM accounting now so it's not a problem. Moreover, it's nicer for an empty set registration as we don't even try to allocate skb for them anymore. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6c9afbeb22812777d0c43e52353b63db5b87ed1e.1649334991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 93 ++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 68 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b2203d1d1fb..a9c8bf49552a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8663,48 +8663,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) return 0; } - -/* - * If UNIX sockets are enabled, fd passing can cause a reference cycle which - * causes regular reference counting to break down. We rely on the UNIX - * garbage collection to take care of this problem for us. 
- */ -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ - unsigned left, total; - int ret = 0; - - total = 0; - left = ctx->nr_user_files; - while (left) { - unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); - - ret = __io_sqe_files_scm(ctx, this_files, total); - if (ret) - break; - left -= this_files; - total += this_files; - } - - if (!ret) - return 0; - - while (total < ctx->nr_user_files) { - struct file *file = io_file_from_index(ctx, total); - - if (file) - fput(file); - io_fixed_file_slot(&ctx->file_table, total)->file_ptr = 0; - total++; - } - - return ret; -} -#else -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ - return 0; -} #endif static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) @@ -8825,6 +8783,9 @@ static void io_rsrc_put_work(struct work_struct *work) } } +static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, + int index); + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { @@ -8849,27 +8810,31 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (ret) return ret; - ret = -ENOMEM; - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) - goto out_free; + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { + struct io_fixed_file *file_slot; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; - goto out_fput; + goto fail; } /* allow sparse sets */ if (fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) - goto out_fput; + goto fail; continue; } file = fget(fd); ret = -EBADF; if (unlikely(!file)) - goto out_fput; + goto fail; /* * Don't allow io_uring instances to be registered. If UNIX @@ -8880,30 +8845,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, */ if (file->f_op == &io_uring_fops) { fput(file); - goto out_fput; + goto fail; + } + file_slot = io_fixed_file_slot(&ctx->file_table, i); + io_fixed_file_set(file_slot, file); + ret = io_sqe_file_register(ctx, file, i); + if (ret) { + file_slot->file_ptr = 0; + fput(file); + goto fail; } - io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); - } - - ret = io_sqe_files_scm(ctx); - if (ret) { - __io_sqe_files_unregister(ctx); - return ret; } io_rsrc_node_switch(ctx, NULL); - return ret; -out_fput: - for (i = 0; i < ctx->nr_user_files; i++) { - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } - io_free_file_tables(&ctx->file_table); - ctx->nr_user_files = 0; -out_free: - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; + return 0; +fail: + __io_sqe_files_unregister(ctx); return ret; } From dca58c6a08a92c5c674140047de7f72f9cc843d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Apr 2022 13:40:02 +0100 Subject: [PATCH 017/111] io_uring: refactor __io_sqe_files_scm __io_sqe_files_scm() is now called only from one place passing a single file, so nr argument can be killed and __io_sqe_files_scm() simplified. 
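The shape of the change is the usual batch-to-single-item simplification; in a hedged user-space sketch (invented names, not the io_uring helpers), the accounting helper stops taking (nr, offset) and walking the table, and the caller passes one entry at a time:

#include <stdio.h>

struct file { const char *name; };

static int account_file(struct file *file)
{
	printf("accounting %s\n", file->name);	/* exactly one item, no table walk */
	return 0;
}

static int register_files(struct file *files, int nr)
{
	int i, ret;

	for (i = 0; i < nr; i++) {
		ret = account_file(&files[i]);
		if (ret)
			return ret;	/* partial failure is now the caller's problem */
	}
	return 0;
}

int main(void)
{
	struct file files[] = { { "sock0" }, { "sock1" } };

	return register_files(files, 2);
}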
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/66b492bc66dc8356d45d64076bb31d677d11a7c9.1649334991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a9c8bf49552a..cbc8ddb23181 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8609,12 +8609,12 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, * files because otherwise they can't form a loop and so are not interesting * for GC. */ -static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int offset) { + struct file *file = io_file_from_index(ctx, offset); struct sock *sk = ctx->ring_sock->sk; struct scm_fp_list *fpl; struct sk_buff *skb; - int i, nr_files; fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); if (!fpl) @@ -8628,39 +8628,17 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) skb->sk = sk; - nr_files = 0; fpl->user = get_uid(current_user()); - for (i = 0; i < nr; i++) { - struct file *file = io_file_from_index(ctx, i + offset); - - if (!file || !io_file_need_scm(file)) - continue; - - fpl->fp[nr_files] = get_file(file); - unix_inflight(fpl->user, fpl->fp[nr_files]); - nr_files++; - } - - if (nr_files) { - fpl->max = SCM_MAX_FD; - fpl->count = nr_files; - UNIXCB(skb).fp = fpl; - skb->destructor = unix_destruct_scm; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - skb_queue_head(&sk->sk_receive_queue, skb); - - for (i = 0; i < nr; i++) { - struct file *file = io_file_from_index(ctx, i + offset); - - if (file && io_file_need_scm(file)) - fput(file); - } - } else { - kfree_skb(skb); - free_uid(fpl->user); - kfree(fpl); - } + fpl->fp[0] = get_file(file); + unix_inflight(fpl->user, file); + fpl->max = SCM_MAX_FD; + fpl->count = 1; + UNIXCB(skb).fp = fpl; + skb->destructor = unix_destruct_scm; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + skb_queue_head(&sk->sk_receive_queue, skb); + fput(file); return 0; } #endif @@ -8904,7 +8882,7 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, return 0; } - return __io_sqe_files_scm(ctx, 1, index); + return __io_sqe_files_scm(ctx, index); #else return 0; #endif From e390510af0461e743a8f0031fc235552c40778f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Apr 2022 13:40:03 +0100 Subject: [PATCH 018/111] io_uring: don't pass around fixed index for scm There is an old API nuisance where io_uring's SCM accounting functions traverse fixed file tables and so requires them to be set in advance, which leads to some implicit rules of how io_sqe_file_register() should be used. __io_sqe_files_scm() now works with only one file at a time, pass a file directly and get rid of all fixed table dereferencing inside. Clean io_sqe_file_register() callers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fb32031d892e61a7748c70da7999725d5e798671.1649334991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cbc8ddb23181..f97a0a5c55a0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8609,9 +8609,8 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, * files because otherwise they can't form a loop and so are not interesting * for GC. 
*/ -static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int offset) +static int __io_sqe_files_scm(struct io_ring_ctx *ctx, struct file *file) { - struct file *file = io_file_from_index(ctx, offset); struct sock *sk = ctx->ring_sock->sk; struct scm_fp_list *fpl; struct sk_buff *skb; @@ -8761,8 +8760,7 @@ static void io_rsrc_put_work(struct work_struct *work) } } -static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, - int index); +static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file); static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) @@ -8825,14 +8823,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto fail; } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); - ret = io_sqe_file_register(ctx, file, i); + ret = io_sqe_file_register(ctx, file); if (ret) { - file_slot->file_ptr = 0; fput(file); goto fail; } + file_slot = io_fixed_file_slot(&ctx->file_table, i); + io_fixed_file_set(file_slot, file); } io_rsrc_node_switch(ctx, NULL); @@ -8842,8 +8839,7 @@ fail: return ret; } -static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, - int index) +static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file) { #if defined(CONFIG_UNIX) struct sock *sock = ctx->ring_sock->sk; @@ -8882,7 +8878,7 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, return 0; } - return __io_sqe_files_scm(ctx, index); + return __io_sqe_files_scm(ctx, file); #else return 0; #endif @@ -8942,15 +8938,11 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, needs_switch = true; } - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - ret = io_sqe_file_register(ctx, file, slot_index); - if (ret) { - file_slot->file_ptr = 0; - goto err; + ret = io_sqe_file_register(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); } - - ret = 0; err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); @@ -9061,14 +9053,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); - err = io_sqe_file_register(ctx, file, i); + err = io_sqe_file_register(ctx, file); if (err) { - file_slot->file_ptr = 0; fput(file); break; } + *io_get_tag_slot(data, i) = tag; + io_fixed_file_set(file_slot, file); } } From 73b25d3badbf3642d15fe267ffc5b1a5c0bc07b3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Apr 2022 13:40:04 +0100 Subject: [PATCH 019/111] io_uring: deduplicate SCM accounting Merge io_sqe_file_register() and io_sqe_file_register(). The only real difference left between them is from where we get an skb. 
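For context on why the accounting exists at all: the reference loops come from SCM_RIGHTS descriptor passing over AF_UNIX sockets, the only way one file can pin another. A standalone user-space sketch of that mechanism (standard sockets API, nothing io_uring-specific; fds left in flight like this are what the UNIX GC has to track and reap):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

static int send_fd(int sock, int fd)
{
	char data = 'x';
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union { struct cmsghdr align; char buf[CMSG_SPACE(sizeof(int))]; } u;
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&u, 0, sizeof(u));
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));	/* attach the fd to the message */
	return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}

int main(void)
{
	int sp[2];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sp))
		return 1;
	printf("send_fd: %d\n", send_fd(sp[0], sp[0]));	/* fd now "in flight" */
	close(sp[0]);
	close(sp[1]);
	return 0;
}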
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dddda3039c71fcbec24b3465cbe8c7e7ae7bb0e8.1649334991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 107 +++++++++++++++++++------------------------------- 1 file changed, 40 insertions(+), 67 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f97a0a5c55a0..f644c1e9c89d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8601,7 +8601,6 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, return sqd; } -#if defined(CONFIG_UNIX) /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -8609,38 +8608,59 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, * files because otherwise they can't form a loop and so are not interesting * for GC. */ -static int __io_sqe_files_scm(struct io_ring_ctx *ctx, struct file *file) +static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file) { +#if defined(CONFIG_UNIX) struct sock *sk = ctx->ring_sock->sk; + struct sk_buff_head *head = &sk->sk_receive_queue; struct scm_fp_list *fpl; struct sk_buff *skb; - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; + if (likely(!io_file_need_scm(file))) + return 0; + + /* + * See if we can merge this file into an existing skb SCM_RIGHTS + * file set. If there's no room, fall back to allocating a new skb + * and filling it in. + */ + spin_lock_irq(&head->lock); + skb = skb_peek(head); + if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) + __skb_unlink(skb, head); + else + skb = NULL; + spin_unlock_irq(&head->lock); - skb = alloc_skb(0, GFP_KERNEL); if (!skb) { - kfree(fpl); - return -ENOMEM; + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } + + fpl->user = get_uid(current_user()); + fpl->max = SCM_MAX_FD; + fpl->count = 0; + + UNIXCB(skb).fp = fpl; + skb->sk = sk; + skb->destructor = unix_destruct_scm; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); } - skb->sk = sk; - - fpl->user = get_uid(current_user()); - fpl->fp[0] = get_file(file); + fpl = UNIXCB(skb).fp; + fpl->fp[fpl->count++] = get_file(file); unix_inflight(fpl->user, file); - - fpl->max = SCM_MAX_FD; - fpl->count = 1; - UNIXCB(skb).fp = fpl; - skb->destructor = unix_destruct_scm; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - skb_queue_head(&sk->sk_receive_queue, skb); + skb_queue_head(head, skb); fput(file); +#endif return 0; } -#endif static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { @@ -8760,8 +8780,6 @@ static void io_rsrc_put_work(struct work_struct *work) } } -static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file); - static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { @@ -8839,51 +8857,6 @@ fail: return ret; } -static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head *head = &sock->sk_receive_queue; - struct sk_buff *skb; - - if (!io_file_need_scm(file)) - return 0; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. 
- */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb) { - struct scm_fp_list *fpl = UNIXCB(skb).fp; - - if (fpl->count < SCM_MAX_FD) { - __skb_unlink(skb, head); - spin_unlock_irq(&head->lock); - fpl->fp[fpl->count] = get_file(file); - unix_inflight(fpl->user, fpl->fp[fpl->count]); - fpl->count++; - spin_lock_irq(&head->lock); - __skb_queue_head(head, skb); - } else { - skb = NULL; - } - } - spin_unlock_irq(&head->lock); - - if (skb) { - fput(file); - return 0; - } - - return __io_sqe_files_scm(ctx, file); -#else - return 0; -#endif -} - static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { From 8b3171bdf53c51c8edd609c3087932a24c42087f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Apr 2022 13:40:05 +0100 Subject: [PATCH 020/111] io_uring: rename io_sqe_file_register Rename io_sqe_file_register(), so the name better reflects what the function is doing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d5091518883786969e244d2f0854a47bbdaa5061.1649334991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f644c1e9c89d..921f7c68c735 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8608,7 +8608,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, * files because otherwise they can't form a loop and so are not interesting * for GC. */ -static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file) +static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) { #if defined(CONFIG_UNIX) struct sock *sk = ctx->ring_sock->sk; @@ -8841,7 +8841,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto fail; } - ret = io_sqe_file_register(ctx, file); + ret = io_scm_file_account(ctx, file); if (ret) { fput(file); goto fail; @@ -8911,7 +8911,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, needs_switch = true; } - ret = io_sqe_file_register(ctx, file); + ret = io_scm_file_account(ctx, file); if (!ret) { *io_get_tag_slot(ctx->file_data, slot_index) = 0; io_fixed_file_set(file_slot, file); @@ -9026,7 +9026,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - err = io_sqe_file_register(ctx, file); + err = io_scm_file_account(ctx, file); if (err) { fput(file); break; From cef216fc32d7628206c523994e7e267e7a8dda59 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Apr 2022 15:09:43 +0100 Subject: [PATCH 021/111] io_uring: explicitly keep a CQE in io_kiocb We already have req->{result,user_data,cflags}, which mimic struct io_uring_cqe and are intended to store CQE data. Combine them into a struct io_uring_cqe field. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e1efe65d5005cd6a9ec3440767eb15a9fa9351cf.1649771823.git.asml.silence@gmail.com [axboe: add mirror cqe to cater to fd union] Signed-off-by: Jens Axboe --- fs/io_uring.c | 134 ++++++++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 65 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 921f7c68c735..252c61912ebf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -866,6 +866,16 @@ enum { IORING_RSRC_BUFFER = 1, }; +struct io_cqe { + __u64 user_data; + __s32 res; + /* fd initially, then cflags for completion */ + union { + __u32 flags; + int fd; + }; +}; + /* * NOTE! 
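The new layout is easy to see in isolation; a simplified user-space sketch (standard integer types instead of the kernel's) groups the three loose fields into one embedded struct, with a union because the same slot holds the fd before issue and the completion flags afterwards.

#include <stdint.h>
#include <stdio.h>

struct io_cqe {
	uint64_t user_data;
	int32_t res;
	union {
		uint32_t flags;	/* cflags once the request completes */
		int fd;		/* fd while the request is being set up */
	};
};

struct kiocb {
	struct io_cqe cqe;
	/* ... rest of the request ... */
};

int main(void)
{
	struct kiocb req = { .cqe = { .user_data = 42, .fd = 3 } };

	req.cqe.res = 0;
	req.cqe.flags = 0;	/* the fd is no longer needed at completion */
	printf("user_data=%llu res=%d\n",
	       (unsigned long long)req.cqe.user_data, req.cqe.res);
	return 0;
}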
Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -909,13 +919,7 @@ struct io_kiocb { u16 buf_index; unsigned int flags; - u64 user_data; - u32 result; - /* fd initially, then cflags for completion */ - union { - u32 cflags; - int fd; - }; + struct io_cqe cqe; struct io_ring_ctx *ctx; struct task_struct *task; @@ -1501,7 +1505,7 @@ static inline void req_set_fail(struct io_kiocb *req) static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); - req->result = res; + req->cqe.res = res; } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1733,7 +1737,7 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, + trace_io_uring_queue_async_work(ctx, req, req->cqe.user_data, req->opcode, req->flags, &req->work, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) @@ -2073,8 +2077,8 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { - trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); - return __io_fill_cqe(req->ctx, req->user_data, res, cflags); + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags); + return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); } static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) @@ -2140,8 +2144,8 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, static inline void io_req_complete_state(struct io_kiocb *req, s32 res, u32 cflags) { - req->result = res; - req->cflags = cflags; + req->cqe.res = res; + req->cqe.flags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; } @@ -2173,7 +2177,7 @@ static void io_req_complete_fail_submit(struct io_kiocb *req) */ req->flags &= ~REQ_F_HARDLINK; req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->result); + io_req_complete_failed(req, req->cqe.res); } /* @@ -2186,7 +2190,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ - req->result = 0; + req->cqe.res = 0; } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, @@ -2340,12 +2344,12 @@ static void io_fail_links(struct io_kiocb *req) long res = -ECANCELED; if (link->flags & REQ_F_FAIL) - res = link->result; + res = link->cqe.res; nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req->ctx, req, req->user_data, + trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, req->opcode, link); if (!ignore_cqes) { @@ -2465,7 +2469,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (likely(*uring_locked)) req->io_task_work.func(req, uring_locked); else - __io_req_complete_post(req, req->result, + __io_req_complete_post(req, req->cqe.res, io_put_kbuf_comp(req)); node = next; } while (node); @@ -2595,7 +2599,7 @@ static void io_req_task_cancel(struct io_kiocb *req, bool *locked) /* not needed for normal modes, but SQPOLL depends on it */ io_tw_lock(ctx, locked); - io_req_complete_failed(req, req->result); + io_req_complete_failed(req, req->cqe.res); } static void io_req_task_submit(struct io_kiocb *req, bool *locked) @@ -2612,7 +2616,7 @@ static void io_req_task_submit(struct io_kiocb *req, bool *locked) 
static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - req->result = ret; + req->cqe.res = ret; req->io_task_work.func = io_req_task_cancel; io_req_task_work_add(req, false); } @@ -2712,7 +2716,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) comp_list); if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, req->result, req->cflags); + __io_fill_cqe_req(req, req->cqe.res, req->cqe.flags); } io_commit_cqring(ctx); @@ -2837,7 +2841,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) nr_events++; if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); + __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0)); } if (unlikely(!nr_events)) @@ -2995,21 +2999,21 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) } else { fsnotify_access(req->file); } - if (unlikely(res != req->result)) { + if (unlikely(res != req->cqe.res)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; return true; } req_set_fail(req); - req->result = res; + req->cqe.res = res; } return false; } static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - int res = req->result; + int res = req->cqe.res; if (*locked) { io_req_complete_state(req, res, io_put_kbuf(req, 0)); @@ -3025,7 +3029,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, + __io_req_complete(req, issue_flags, req->cqe.res, io_put_kbuf(req, issue_flags)); } @@ -3035,7 +3039,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) if (__io_complete_rw_common(req, res)) return; - req->result = res; + req->cqe.res = res; req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); } @@ -3046,12 +3050,12 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (unlikely(res != req->result)) { + if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; return; } - req->result = res; + req->cqe.res = res; } /* order with io_iopoll_complete() checking ->iopoll_completed */ @@ -3844,7 +3848,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kfree(iovec); return ret; } - req->result = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&s->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -3860,7 +3864,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(READ, req->file, ppos, req->result); + ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); if (unlikely(ret)) { kfree(iovec); return ret; @@ -3882,7 +3886,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = 0; } else if (ret == -EIOCBQUEUED) { goto out_free; - } else if (ret == req->result || ret <= 0 || !force_nonblock || + } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ goto done; @@ -3972,7 +3976,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kfree(iovec); return ret; } - req->result = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&s->iter); if 
(force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -3992,7 +3996,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(WRITE, req->file, ppos, req->result); + ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); if (unlikely(ret)) goto out_free; @@ -5777,7 +5781,7 @@ static void io_poll_req_insert(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct hlist_head *list; - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; hlist_add_head(&req->hash_node, list); } @@ -5842,7 +5846,7 @@ static void io_poll_remove_entries(struct io_kiocb *req) * * Returns a negative error on failure. >0 when no action require, which is * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->result. + * the request, then the mask is stored in req->cqe.res. */ static int io_poll_check_events(struct io_kiocb *req, bool locked) { @@ -5862,29 +5866,29 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked) if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; - if (!req->result) { + if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; if (unlikely(!io_assign_file(req, flags))) return -EBADF; - req->result = vfs_poll(req->file, &pt) & req->apoll_events; + req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; } /* multishot, just fill an CQE and proceed */ - if (req->result && !(req->apoll_events & EPOLLONESHOT)) { - __poll_t mask = mangle_poll(req->result & req->apoll_events); + if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) { + __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); bool filled; spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->user_data, mask, + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask, IORING_CQE_F_MORE); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (unlikely(!filled)) return -ECANCELED; io_cqring_ev_posted(ctx); - } else if (req->result) { + } else if (req->cqe.res) { return 0; } @@ -5907,16 +5911,16 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) return; if (!ret) { - req->result = mangle_poll(req->result & req->poll.events); + req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); } else { - req->result = ret; + req->cqe.res = ret; req_set_fail(req); } io_poll_remove_entries(req); spin_lock(&ctx->completion_lock); hash_del(&req->hash_node); - __io_req_complete_post(req, req->result, 0); + __io_req_complete_post(req, req->cqe.res, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5944,7 +5948,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) static void __io_poll_execute(struct io_kiocb *req, int mask, int events) { - req->result = mask; + req->cqe.res = mask; /* * This is useful for poll that is armed on behalf of another * request, and where the wakeup path could be on a different @@ -5957,7 +5961,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, int events) else req->io_task_work.func = io_apoll_task_func; - trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); + trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); io_req_task_work_add(req, false); } @@ -6207,7 +6211,7 @@ static int 
io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) if (ret || ipt.error) return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode, + trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, mask, apoll->poll.events); return IO_APOLL_OK; } @@ -6249,7 +6253,7 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; hlist_for_each_entry(req, list, hash_node) { - if (sqe_addr != req->user_data) + if (sqe_addr != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; @@ -6386,7 +6390,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) preq->poll.events |= IO_POLL_UNMASK; } if (req->poll_update.update_user_data) - preq->user_data = req->poll_update.new_user_data; + preq->cqe.user_data = req->poll_update.new_user_data; ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ @@ -6395,7 +6399,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) } req_set_fail(preq); - preq->result = -ECANCELED; + preq->cqe.res = -ECANCELED; locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &locked); out: @@ -6423,7 +6427,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); - req->result = -ETIME; + req->cqe.res = -ETIME; req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req, false); return HRTIMER_NORESTART; @@ -6438,7 +6442,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - found = user_data == req->user_data; + found = user_data == req->cqe.user_data; if (found) break; } @@ -6489,7 +6493,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, bool found = false; list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { - found = user_data == req->user_data; + found = user_data == req->cqe.user_data; if (found) break; } @@ -6715,7 +6719,7 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; - return req->ctx == cd->ctx && req->user_data == cd->user_data; + return req->ctx == cd->ctx && req->cqe.user_data == cd->user_data; } static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, @@ -7016,7 +7020,7 @@ fail: goto queue; } - trace_io_uring_defer(ctx, req, req->user_data, req->opcode); + trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); @@ -7100,14 +7104,14 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) return true; if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->fd, issue_flags); + req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); else - req->file = io_file_get_normal(req, req->fd); + req->file = io_file_get_normal(req, req->cqe.fd); if (req->file) return true; req_set_fail(req); - req->result = -EBADF; + req->cqe.res = -EBADF; return false; } @@ -7396,7 +7400,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); - trace_io_uring_file_get(req->ctx, req, req->user_data, fd); + trace_io_uring_file_get(req->ctx, req, 
req->cqe.user_data, fd); /* we don't allow fixed io_uring files */ if (file && file->f_op == &io_uring_fops) @@ -7411,7 +7415,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) if (prev) { if (!(req->task->flags & PF_EXITING)) - ret = io_try_cancel_userdata(req, prev->user_data); + ret = io_try_cancel_userdata(req, prev->cqe.user_data); io_req_complete_post(req, ret ?: -ETIME, 0); io_put_req(prev); } else { @@ -7602,7 +7606,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); - req->user_data = READ_ONCE(sqe->user_data); + req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->fixed_rsrc_refs = NULL; req->task = current; @@ -7643,7 +7647,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; - req->fd = READ_ONCE(sqe->fd); + req->cqe.fd = READ_ONCE(sqe->fd); /* * Plug now if we have more than 2 IO left after this, and the @@ -7692,7 +7696,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * we can judge a link req is failed or cancelled by if * REQ_F_FAIL is set, but the head is an exception since * it may be set REQ_F_FAIL because of other req's failure - * so let's leverage req->result to distinguish if a head + * so let's leverage req->cqe.res to distinguish if a head * is set REQ_F_FAIL because of its failure or other req's * failure so that we can set the correct ret code for it. * init result here to avoid affecting the normal path. @@ -7711,7 +7715,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, } /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode, + trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, req->flags, true, ctx->flags & IORING_SETUP_SQPOLL); From 90e7c35fb89154439a4a604699faf36adbfa871b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Apr 2022 15:09:44 +0100 Subject: [PATCH 022/111] io_uring: memcpy CQE from req We can do CQE filling a bit more efficiently when req->cqe is fully filled by memcpy()'ing it to the userspace instead of doing it field by field. It's easier on register spilling, removes a couple of extra loads/stores and write combines two u32 memory writes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ee3f514ff28b1fe3347a8eca93a9d91647f2eaad.1649771823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 252c61912ebf..188923c38bfc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2075,6 +2075,28 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, return io_cqring_event_overflow(ctx, user_data, res, cflags); } +static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. 
+ */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(*cqe)); + return true; + } + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags); +} + static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags); @@ -2716,7 +2738,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) comp_list); if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, req->cqe.res, req->cqe.flags); + __io_fill_cqe_req_filled(ctx, req); } io_commit_cqring(ctx); From e126391c092014af7ea6786648c9f1d3fce512a3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Apr 2022 15:09:45 +0100 Subject: [PATCH 023/111] io_uring: shrink final link flush All good users should not set IOSQE_IO_*LINK flags for the last request of a link. io_uring flushes collected links at the end of submission, but it's not the optimal way and so we don't care too much about it. Replace io_queue_sqe() call with io_queue_sqe_fallback() as the former one is inlined and will generate a bunch of extra code. This will also help compilers with the submission path inlining. > size ./fs/io_uring.o text data bss dec hex filename 87265 13734 8 101007 18a8f ./fs/io_uring.o > size ./fs/io_uring.o text data bss dec hex filename 87073 13734 8 100815 189cf ./fs/io_uring.o Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/01fb5e417ef49925d544a0b0bae30409845ed2b4.1649771823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 188923c38bfc..a3a3c80e2faa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7785,8 +7785,8 @@ static void io_submit_state_end(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - if (state->link.head) - io_queue_sqe(state->link.head); + if (unlikely(state->link.head)) + io_queue_sqe_fallback(state->link.head); /* flush only after queuing links as they can generate completions */ io_submit_flush_completions(ctx); if (state->plug_started) From 23a5c43b2fc0cc210dcb36264fe3050468c79b17 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Apr 2022 15:09:46 +0100 Subject: [PATCH 024/111] io_uring: inline io_flush_cached_reqs io_flush_cached_reqs() isn't descriptive and has only one caller, inline it into __io_alloc_req_refill(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ec38abe65a883d9fe6b169793119ce86806655a4.1649771823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a3a3c80e2faa..7b83d6ed13ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2224,21 +2224,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } -/* Returns true IFF there are requests in the cache */ -static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) -{ - struct io_submit_state *state = &ctx->submit_state; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. 
- */ - if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) - io_flush_cached_locked_reqs(ctx, state); - return !!state->free_list.next; -} - /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. @@ -2251,11 +2236,18 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; - struct io_kiocb *req; int ret, i; - if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) - return true; + /* + * If we have more than a batch's worth of requests in our IRQ side + * locked cache, grab the lock and move them over to our submission + * side cache. + */ + if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) { + io_flush_cached_locked_reqs(ctx, &ctx->submit_state); + if (state->free_list.next) + return true; + } ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); @@ -2272,7 +2264,7 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) percpu_ref_get_many(&ctx->refs, ret); for (i = 0; i < ret; i++) { - req = reqs[i]; + struct io_kiocb *req = reqs[i]; io_preinit_req(req, ctx); wq_stack_add_head(&req->comp_list, &state->free_list); From 88ab95be7e408ff9cedc81ad5402ad4bd700baf9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Apr 2022 15:09:47 +0100 Subject: [PATCH 025/111] io_uring: helper for empty req cache checks Add io_req_cache_empty(), which checks if there are requests in the inline req cache or not. It'll be needed in the future, but also nicely cleans up a few spots poking into ->free_list directly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b18662389f3fb483d0bd07906647f65f6037475a.1649771823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7b83d6ed13ba..b385e0f6e92d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2224,6 +2224,11 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } +static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) +{ + return !ctx->submit_state.free_list.next; +} + /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 
@@ -2245,7 +2250,7 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) */ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) { io_flush_cached_locked_reqs(ctx, &ctx->submit_state); - if (state->free_list.next) + if (!io_req_cache_empty(ctx)) return true; } @@ -2274,7 +2279,7 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) { - if (unlikely(!ctx->submit_state.free_list.next)) + if (unlikely(io_req_cache_empty(ctx))) return __io_alloc_req_refill(ctx); return true; } @@ -9809,7 +9814,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, state); - while (state->free_list.next) { + while (!io_req_cache_empty(ctx)) { struct io_wq_work_node *node; struct io_kiocb *req; From fa05457a603e7c8e97cd4cfb62de64bc9547ecb5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Apr 2022 15:09:48 +0100 Subject: [PATCH 026/111] io_uring: add helper to return req to cache list Don't hand code wq_stack_add_head() to ->free_list, which serves for recycling io_kiocb, add a helper doing it for us. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f206f575486a8dd3d52f074ab37ed146b2d215b7.1649771823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b385e0f6e92d..1cdfcdd0a74f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1508,6 +1508,11 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) req->cqe.res = res; } +static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); +} + static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -2238,7 +2243,6 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; int ret, i; @@ -2272,7 +2276,7 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) struct io_kiocb *req = reqs[i]; io_preinit_req(req, ctx); - wq_stack_add_head(&req->comp_list, &state->free_list); + io_req_add_to_cache(req, ctx); } return true; } @@ -2715,7 +2719,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, } task_refs++; node = req->comp_list.next; - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + io_req_add_to_cache(req, ctx); } while (node); if (task) @@ -7872,7 +7876,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) req = io_alloc_req(ctx); sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + io_req_add_to_cache(req, ctx); break; } /* will complete beyond this point, count as submitted */ From 8e6971a819dfd289e40642405a9200076f5dd17f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Apr 2022 15:09:49 +0100 Subject: [PATCH 027/111] io_uring: optimise submission loop invariant Instead of keeping @submitted in io_submit_sqes(), which for each iteration requires comparison with the initial number of SQEs, store the number of SQEs left to submit. We'll need nr only for when we're done with SQE handling. 
note: if we can't allocate a req for the first SQE we always has been returning -EAGAIN to the userspace, save this behaviour by looking into the cache in a slow path. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c3b3df9aeae4c2f7a53fd8386385742e4e261e77.1649771823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1cdfcdd0a74f..3e84963b14ff 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7855,24 +7855,22 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); - int submitted = 0; + unsigned int left; + int ret; if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ - nr = min3(nr, ctx->sq_entries, entries); - io_get_task_refs(nr); + ret = left = min3(nr, ctx->sq_entries, entries); + io_get_task_refs(left); + io_submit_state_start(&ctx->submit_state, left); - io_submit_state_start(&ctx->submit_state, nr); do { const struct io_uring_sqe *sqe; struct io_kiocb *req; - if (unlikely(!io_alloc_req_refill(ctx))) { - if (!submitted) - submitted = -EAGAIN; + if (unlikely(!io_alloc_req_refill(ctx))) break; - } req = io_alloc_req(ctx); sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { @@ -7880,7 +7878,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) break; } /* will complete beyond this point, count as submitted */ - submitted++; + left--; if (io_submit_sqe(ctx, req, sqe)) { /* * Continue submitting even for sqe failure if the @@ -7889,20 +7887,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) break; } - } while (submitted < nr); + } while (left); - if (unlikely(submitted != nr)) { - int ref_used = (submitted == -EAGAIN) ? 0 : submitted; - int unused = nr - ref_used; - - current->io_uring->cached_refs += unused; + if (unlikely(left)) { + ret -= left; + /* try again if it submitted nothing and can't allocate a req */ + if (!ret && io_req_cache_empty(ctx)) + ret = -EAGAIN; + current->io_uring->cached_refs += left; } io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); - - return submitted; + return ret; } static inline bool io_sqd_events_pending(struct io_sq_data *sqd) From 1cd15904b6e836f46e925229da8950785252b315 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Apr 2022 15:09:50 +0100 Subject: [PATCH 028/111] io_uring: optimise submission left counting Considering all inlining io_submit_sqe() is huge and usually ends up calling some other functions. We decrement @left in io_submit_sqes() just before calling io_submit_sqe() and use it later after the call. Considering how huge io_submit_sqe() is, there is not much hope @left will be treated gracefully by compilers. Decrement it after the call, not only it's easier on register spilling and probably saves stack write/read, but also at least for x64 uses CPU flags set by the dec instead of doing (read/write and tests). 
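As a reference point, here is a minimal userspace sketch of the two loop shapes being compared; submit_one() and the numbers are made up, only the placement of the decrement mirrors the change, and the actual codegen effect of course depends on the compiler.

#include <stdbool.h>
#include <stdio.h>

/* stand-in for io_submit_sqe(): pretend every 7th submission fails */
static bool submit_one(unsigned int i)
{
        return (i % 7) == 6;
}

/* old shape: the counter is updated before the large call and tested
 * afterwards, so its value must be kept alive across the call */
static unsigned int submit_old(unsigned int left)
{
        unsigned int i = 0;

        do {
                left--;
                if (submit_one(i++))
                        break;
        } while (left);
        return left;
}

/* new shape: decrement in the loop condition, right where it is tested,
 * so the branch can reuse the flags set by the decrement itself */
static unsigned int submit_new(unsigned int left)
{
        unsigned int i = 0;

        do {
                if (submit_one(i++)) {
                        left--;
                        break;
                }
        } while (--left);
        return left;
}

int main(void)
{
        /* both variants leave the same number of SQEs unsubmitted */
        printf("%u %u\n", submit_old(16), submit_new(16));
        return 0;
}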
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/807f9a276b54ee8ff4e42e2b78721484f1c71743.1649771823.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3e84963b14ff..c3cadb36326b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7877,17 +7877,17 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) io_req_add_to_cache(req, ctx); break; } - /* will complete beyond this point, count as submitted */ - left--; - if (io_submit_sqe(ctx, req, sqe)) { - /* - * Continue submitting even for sqe failure if the - * ring was setup with IORING_SETUP_SUBMIT_ALL - */ - if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) - break; + + /* + * Continue submitting even for sqe failure if the + * ring was setup with IORING_SETUP_SUBMIT_ALL + */ + if (unlikely(io_submit_sqe(ctx, req, sqe)) && + !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { + left--; + break; } - } while (left); + } while (--left); if (unlikely(left)) { ret -= left; From d8da428b7a9a71bc2ee6bf628a8c0f9beb96a195 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Apr 2022 15:09:51 +0100 Subject: [PATCH 029/111] io_uring: optimise io_get_cqe() io_get_cqe() is expensive because of a bunch of loads, masking, etc. However, most of the time we should have enough of entries in the CQ, so we can cache two pointers representing a range of contiguous CQE memory we can use. When the range is exhausted we'll go through a slower path to set up a new range. When there are no CQEs avaliable, pointers will naturally point to the same address. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/487eeef00f3146537b3d9c1a9cef2fc0b9a86f81.1649771823.git.asml.silence@gmail.com [axboe: santinel -> sentinel] Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c3cadb36326b..fc47f8fdae7c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -416,6 +416,13 @@ struct io_ring_ctx { unsigned long check_cq_overflow; struct { + /* + * We cache a range of free CQEs we can use, once exhausted it + * should go through a slower range setup, see __io_get_cqe() + */ + struct io_uring_cqe *cqe_cached; + struct io_uring_cqe *cqe_sentinel; + unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; @@ -1844,21 +1851,38 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - unsigned tail, mask = ctx->cq_entries - 1; + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int free, queued, len; - /* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ - if (__io_cqring_events(ctx) == ctx->cq_entries) + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the 
current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) return NULL; - tail = ctx->cached_cq_tail++; - return &rings->cqes[tail & mask]; + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + return ctx->cqe_cached++; +} + +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +{ + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { + ctx->cached_cq_tail++; + return ctx->cqe_cached++; + } + return __io_get_cqe(ctx); } static void io_eventfd_signal(struct io_ring_ctx *ctx) From f22190570b213dcc84216ac07cfd0eeada010013 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:20 +0100 Subject: [PATCH 030/111] io_uring: clean poll tw PF_EXITING handling When we meet PF_EXITING in io_poll_check_events(), don't overcomplicate the code with io_poll_mark_cancelled() but just return -ECANCELED and the callers will deal with the rest. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f0cc981af82a5b193658f8f44397eeb3bf838b7b.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fc47f8fdae7c..76d51c6e3ab9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5902,7 +5902,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked) /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) - io_poll_mark_cancelled(req); + return -ECANCELED; do { v = atomic_read(&req->poll_refs); From 971cf9c19e97bad172bdb7142a23e6489d1b38f0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:22 +0100 Subject: [PATCH 031/111] io_uring: minor refactoring for some tw handlers Get rid of some useless local variables Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7798327b684b7015f7e4300420142ddfcd317297.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 76d51c6e3ab9..acb7978a31a5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1729,7 +1729,6 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -1749,8 +1748,9 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(ctx, req, req->cqe.user_data, req->opcode, req->flags, - &req->work, io_wq_is_hashed(&req->work)); + trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, + req->opcode, req->flags, &req->work, + io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -2642,18 +2642,14 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; - /* not needed for normal modes, but SQPOLL depends on it */ - io_tw_lock(ctx, locked); + io_tw_lock(req->ctx, locked); io_req_complete_failed(req, req->cqe.res); } static void io_req_task_submit(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; - - io_tw_lock(ctx, 
locked); + io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); From 4e118cd9e9e609c805822ee423aa23ae406e7da0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:23 +0100 Subject: [PATCH 032/111] io_uring: kill io_put_req_deferred() We have several spots where a call to io_fill_cqe_req() is immediately followed by io_put_req_deferred(). Replace them with __io_req_complete_post() and get rid of io_put_req_deferred() and io_fill_cqe_req(). > size ./fs/io_uring.o text data bss dec hex filename 86942 13734 8 100684 1894c ./fs/io_uring.o > size ./fs/io_uring.o text data bss dec hex filename 86438 13654 8 100100 18704 ./fs/io_uring.o Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/10672a538774ac8986bee6468d960527af59169d.1650056133.git.asml.silence@gmail.com [axboe: fold in followup fix] Signed-off-by: Jens Axboe --- fs/io_uring.c | 56 ++++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index acb7978a31a5..df29a9365c2b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1188,10 +1188,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); - +static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); static void io_put_req(struct io_kiocb *req); -static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1216,6 +1214,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); static struct kmem_cache *req_cachep; @@ -1768,8 +1767,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_fill_cqe_req(req, status, 0); - io_put_req_deferred(req); + io_req_tw_post_queue(req, status, 0); } } @@ -2132,12 +2130,6 @@ static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); } -static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) -{ - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); -} - static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { @@ -2371,9 +2363,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&link->timeout.list); - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ - io_fill_cqe_req(link, -ECANCELED, 0); - io_put_req_deferred(link); + io_req_tw_post_queue(link, -ECANCELED, 0); return true; } } @@ -2399,11 +2389,11 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, req->opcode, link); - if (!ignore_cqes) { + if (ignore_cqes) + link->flags |= REQ_F_CQE_SKIP; + else link->flags &= ~REQ_F_CQE_SKIP; - 
io_fill_cqe_req(link, res, 0); - } - io_put_req_deferred(link); + __io_req_complete_post(link, res, 0); link = nxt; } } @@ -2419,9 +2409,7 @@ static bool io_disarm_next(struct io_kiocb *req) req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ - io_fill_cqe_req(link, -ECANCELED, 0); - io_put_req_deferred(link); + io_req_tw_post_queue(link, -ECANCELED, 0); posted = true; } } else if (req->flags & REQ_F_LINK_TIMEOUT) { @@ -2640,6 +2628,19 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) } } +static void io_req_tw_post(struct io_kiocb *req, bool *locked) +{ + io_req_complete_post(req, req->cqe.res, req->cqe.flags); +} + +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) +{ + req->cqe.res = res; + req->cqe.flags = cflags; + req->io_task_work.func = io_req_tw_post; + io_req_task_work_add(req, false); +} + static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { /* not needed for normal modes, but SQPOLL depends on it */ @@ -2690,11 +2691,6 @@ static void io_free_req(struct io_kiocb *req) __io_free_req(req); } -static void io_free_req_work(struct io_kiocb *req, bool *locked) -{ - io_free_req(req); -} - static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -2794,14 +2790,6 @@ static inline void io_put_req(struct io_kiocb *req) io_free_req(req); } -static inline void io_put_req_deferred(struct io_kiocb *req) -{ - if (req_ref_put_and_test(req)) { - req->io_task_work.func = io_free_req_work; - io_req_task_work_add(req, false); - } -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ From f5c6cf2a310d8de3bd02aa8a217f8ca63df6f236 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:24 +0100 Subject: [PATCH 033/111] io_uring: inline io_free_req() Inline io_free_req() into its only user and remove an underscore prefix from __io_free_req(). 
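The end result leans on the standard put-and-test idiom: every owner drops its reference, and only the caller that drops the last one queues the next request and frees. A bare-bones userspace model of that idiom, using plain C11 atomics instead of the kernel's reference helpers (names are made up):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct request {
        atomic_int refs;
        int id;
};

/* returns non-zero when the caller just dropped the last reference */
static int req_put_and_test(struct request *req)
{
        return atomic_fetch_sub(&req->refs, 1) == 1;
}

static void req_free(struct request *req)
{
        printf("freeing req %d\n", req->id);
        free(req);
}

static void req_put(struct request *req)
{
        if (req_put_and_test(req)) {
                /* final put: this is also the point where a linked next
                 * request would be queued before freeing */
                req_free(req);
        }
}

int main(void)
{
        struct request *req = malloc(sizeof(*req));

        if (!req)
                return 1;
        atomic_init(&req->refs, 2);
        req->id = 1;
        req_put(req);   /* another owner still holds a reference */
        req_put(req);   /* last reference dropped: frees the request */
        return 0;
}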
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ed114edef5c256a644f4839bb372df70d8df8e3f.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index df29a9365c2b..e9f0c89280cd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1189,7 +1189,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); -static void io_put_req(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -2328,7 +2327,7 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -static __cold void __io_free_req(struct io_kiocb *req) +static __cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -2685,12 +2684,6 @@ static void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -static void io_free_req(struct io_kiocb *req) -{ - io_queue_next(req); - __io_free_req(req); -} - static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -2779,15 +2772,17 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) if (req_ref_put_and_test(req)) { if (unlikely(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))) nxt = io_req_find_next(req); - __io_free_req(req); + io_free_req(req); } return nxt; } static inline void io_put_req(struct io_kiocb *req) { - if (req_ref_put_and_test(req)) + if (req_ref_put_and_test(req)) { + io_queue_next(req); io_free_req(req); + } } static unsigned io_cqring_events(struct io_ring_ctx *ctx) From cb2d344c7551c250c07c009639eb81d54235ef8b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:25 +0100 Subject: [PATCH 034/111] io_uring: helper for prep+queuing linked timeouts We try to aggresively inline the submission path, so it's a good idea to not pollute it with colder code. One of them is linked timeout preparation + queue, which can be extracted into a function. 
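In general form this is the usual hot/cold split: an inlined wrapper that only tests a flag, with the rarely taken body moved out of line. A self-contained sketch of that shape follows; the names, the flag and the attribute spelling are illustrative and not the io_uring code.

#include <stdio.h>

struct request {
        unsigned int flags;
};

#define REQ_HAS_TIMEOUT (1u << 0)       /* made-up flag */

/* cold body: kept out of line so it does not bloat every caller */
__attribute__((noinline))
static void arm_timeout_slow(struct request *req)
{
        printf("arming linked timeout, flags=%#x\n", req->flags);
}

/* hot path: inlined, normally just a predicted-not-taken test */
static inline void arm_timeout(struct request *req)
{
        if (__builtin_expect(req->flags & REQ_HAS_TIMEOUT, 0))
                arm_timeout_slow(req);
}

int main(void)
{
        struct request plain = { .flags = 0 };
        struct request timed = { .flags = REQ_HAS_TIMEOUT };

        arm_timeout(&plain);    /* common case: falls straight through */
        arm_timeout(&timed);    /* rare case: calls the cold helper */
        return 0;
}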
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ecf74df7ac77389b6d9211211ec4954e91de98ba.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e9f0c89280cd..946906e93d51 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1675,6 +1675,17 @@ static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return __io_prep_linked_timeout(req); } +static noinline void __io_arm_ltimeout(struct io_kiocb *req) +{ + io_queue_linked_timeout(__io_prep_linked_timeout(req)); +} + +static inline void io_arm_ltimeout(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) + __io_arm_ltimeout(req); +} + static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -7301,7 +7312,6 @@ static void io_wq_submit_work(struct io_wq_work *work) const struct io_op_def *def = &io_op_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED; bool needs_poll = false; - struct io_kiocb *timeout; int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_free_work() after returning to io-wq */ @@ -7310,10 +7320,7 @@ static void io_wq_submit_work(struct io_wq_work *work) else req_ref_get(req); - timeout = io_prep_linked_timeout(req); - if (timeout) - io_queue_linked_timeout(timeout); - + io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { @@ -7528,7 +7535,6 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) static inline void __io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout; int ret; ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); @@ -7542,9 +7548,7 @@ static inline void __io_queue_sqe(struct io_kiocb *req) * doesn't support non-blocking read/write attempts */ if (likely(!ret)) { - linked_timeout = io_prep_linked_timeout(req); - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); + io_arm_ltimeout(req); } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { io_queue_sqe_arm_apoll(req); } else { From cbc2e2038845ddc6d1eb5969eff4f02bd5187b47 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:26 +0100 Subject: [PATCH 035/111] io_uring: inline io_queue_sqe() Inline io_queue_sqe() as there is only one caller left, and rename __io_queue_sqe(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d5742683b7a7caceb1c054e91e5b9135b0f3b858.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 946906e93d51..25f5fbcfd0cb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1200,7 +1200,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd); static void io_drop_inflight_file(struct io_kiocb *req); static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); -static void __io_queue_sqe(struct io_kiocb *req); +static void io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); @@ -2663,7 +2663,7 @@ static void io_req_task_submit(struct io_kiocb *req, bool *locked) io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) - __io_queue_sqe(req); + io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); } @@ -7532,7 +7532,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) io_queue_linked_timeout(linked_timeout); } -static inline void __io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { int ret; @@ -7573,15 +7573,6 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) } } -static inline void io_queue_sqe(struct io_kiocb *req) - __must_hold(&req->ctx->uring_lock) -{ - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) - __io_queue_sqe(req); - else - io_queue_sqe_fallback(req); -} - /* * Check SQE restrictions (opcode and flags). * @@ -7782,7 +7773,11 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } - io_queue_sqe(req); + if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) + io_queue_sqe(req); + else + io_queue_sqe_fallback(req); + return 0; } From 77955efbc46220adce9ecb803fb9ff9042bd7236 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:27 +0100 Subject: [PATCH 036/111] io_uring: rename io_queue_async_work() Rename io_queue_async_work(). The name is pretty old but now doesn't reflect well what the function is doing. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5d4b25c54cccf084f9f2fd63bd4e4fa4515e998e.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 25f5fbcfd0cb..5d3f1b21ee76 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1736,7 +1736,7 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) +static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -2683,7 +2683,7 @@ static void io_req_task_queue(struct io_kiocb *req) static void io_req_task_queue_reissue(struct io_kiocb *req) { - req->io_task_work.func = io_queue_async_work; + req->io_task_work.func = io_queue_iowq; io_req_task_work_add(req, false); } @@ -7522,7 +7522,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ - io_queue_async_work(req, NULL); + io_queue_iowq(req, NULL); break; case IO_APOLL_OK: break; @@ -7569,7 +7569,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) if (unlikely(ret)) io_req_complete_failed(req, ret); else - io_queue_async_work(req, NULL); + io_queue_iowq(req, NULL); } } From 7bfa9badc793ba5e8b530dc3a3cb092a1c22685b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:28 +0100 Subject: [PATCH 037/111] io_uring: refactor io_queue_sqe() io_queue_sqe() is a part of the submission path and we try hard to keep it inlined, so shed some extra bytes from it by moving the error checking part into io_queue_sqe_arm_apoll() and renaming it accordingly. note: io_queue_sqe_arm_apoll() is not inlined, thus the patch doesn't change the number of function calls for the apoll path. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9b79edd246336decfaca79b949a15ac69123490d.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5d3f1b21ee76..5a99a2fd8d2d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7508,10 +7508,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static void io_queue_sqe_arm_apoll(struct io_kiocb *req) +static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; + + if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { + io_req_complete_failed(req, ret); + return; + } + + linked_timeout = io_prep_linked_timeout(req); switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: @@ -7547,13 +7554,10 @@ static inline void io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (likely(!ret)) { + if (likely(!ret)) io_arm_ltimeout(req); - } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - io_queue_sqe_arm_apoll(req); - } else { - io_req_complete_failed(req, ret); - } + else + io_queue_async(req, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) From da1a08c5b28176398773de30a056ba7b994e6988 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:29 +0100 Subject: [PATCH 038/111] io_uring: introduce IO_REQ_LINK_FLAGS Add a macro for all link request flags to avoid duplication. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/df38b883e31e7e0ca4e364d25a0743862961b180.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5a99a2fd8d2d..04dedc37e285 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1180,6 +1180,7 @@ static const struct io_op_def io_op_defs[] = { /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); @@ -2160,7 +2161,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, * free_list cache. 
*/ if (req_ref_put_and_test(req)) { - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { @@ -2721,7 +2722,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, &ctx->apoll_cache); req->flags &= ~REQ_F_POLLED; } - if (req->flags & (REQ_F_LINK|REQ_F_HARDLINK)) + if (req->flags & IO_REQ_LINK_FLAGS) io_queue_next(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); @@ -2781,7 +2782,7 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) struct io_kiocb *nxt = NULL; if (req_ref_put_and_test(req)) { - if (unlikely(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))) + if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) nxt = io_req_find_next(req); io_free_req(req); } @@ -7728,7 +7729,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, */ if (!(link->head->flags & REQ_F_FAIL)) req_fail_link_node(link->head, -ECANCELED); - } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { + } else if (!(req->flags & IO_REQ_LINK_FLAGS)) { /* * the current req is a normal req, we should return * error and thus break the submittion loop. @@ -7766,12 +7767,12 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, link->last->link = req; link->last = req; - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + if (req->flags & IO_REQ_LINK_FLAGS) return 0; /* last request of a link, enqueue the link */ link->head = NULL; req = head; - } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + } else if (req->flags & IO_REQ_LINK_FLAGS) { link->head = req; link->last = req; return 0; From df3becde8d9dcaaf17a615009301479f8f9c6323 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:30 +0100 Subject: [PATCH 039/111] io_uring: refactor lazy link fail Remove the lazy link fail logic from io_submit_sqe() and hide it into a helper. It simplifies the code and will be needed in next patches. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6a68aca9cf4492132da1d7c8a09068b74aba3c65.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 85 ++++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 04dedc37e285..cb7c3bc7c1b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7705,7 +7705,44 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, return io_req_prep(req, sqe); } -static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, +static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, + struct io_kiocb *req, int ret) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_link *link = &ctx->submit_state.link; + struct io_kiocb *head = link->head; + + trace_io_uring_req_failed(sqe, ctx, req, ret); + + /* + * Avoid breaking links in the middle as it renders links with SQPOLL + * unusable. Instead of failing eagerly, continue assembling the link if + * applicable and mark the head with REQ_F_FAIL. The link flushing code + * should find the flag and handle the rest. 
+ */ + req_fail_link_node(req, ret); + if (head && !(head->flags & REQ_F_FAIL)) + req_fail_link_node(head, -ECANCELED); + + if (!(req->flags & IO_REQ_LINK_FLAGS)) { + if (head) { + link->last->link = req; + link->head = NULL; + req = head; + } + io_queue_sqe_fallback(req); + return ret; + } + + if (head) + link->last->link = req; + else + link->head = req; + link->last = req; + return 0; +} + +static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { @@ -7713,32 +7750,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, int ret; ret = io_init_req(ctx, req, sqe); - if (unlikely(ret)) { - trace_io_uring_req_failed(sqe, ctx, req, ret); - - /* fail even hard links since we don't submit */ - if (link->head) { - /* - * we can judge a link req is failed or cancelled by if - * REQ_F_FAIL is set, but the head is an exception since - * it may be set REQ_F_FAIL because of other req's failure - * so let's leverage req->cqe.res to distinguish if a head - * is set REQ_F_FAIL because of its failure or other req's - * failure so that we can set the correct ret code for it. - * init result here to avoid affecting the normal path. - */ - if (!(link->head->flags & REQ_F_FAIL)) - req_fail_link_node(link->head, -ECANCELED); - } else if (!(req->flags & IO_REQ_LINK_FLAGS)) { - /* - * the current req is a normal req, we should return - * error and thus break the submittion loop. - */ - io_req_complete_failed(req, ret); - return ret; - } - req_fail_link_node(req, ret); - } + if (unlikely(ret)) + return io_submit_fail_init(sqe, req, ret); /* don't need @sqe from now on */ trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, @@ -7753,25 +7766,19 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * conditions are true (normal request), then just queue it. */ if (link->head) { - struct io_kiocb *head = link->head; + ret = io_req_prep_async(req); + if (unlikely(ret)) + return io_submit_fail_init(sqe, req, ret); - if (!(req->flags & REQ_F_FAIL)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) { - req_fail_link_node(req, ret); - if (!(head->flags & REQ_F_FAIL)) - req_fail_link_node(head, -ECANCELED); - } - } - trace_io_uring_link(ctx, req, head); + trace_io_uring_link(ctx, req, link->head); link->last->link = req; link->last = req; if (req->flags & IO_REQ_LINK_FLAGS) return 0; - /* last request of a link, enqueue the link */ + /* last request of the link, flush it */ + req = link->head; link->head = NULL; - req = head; } else if (req->flags & IO_REQ_LINK_FLAGS) { link->head = req; link->last = req; From 924a07e482baae0ced5fb9922b827c46f0c0972d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:31 +0100 Subject: [PATCH 040/111] io_uring: refactor io_submit_sqe() Remove one extra if for non-linked path of io_submit_sqe(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/03183199d1bf494b4a72eca16d792c8a5945acb4.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cb7c3bc7c1b7..fe9df32be921 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7765,7 +7765,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. 
*/ - if (link->head) { + if (unlikely(link->head)) { ret = io_req_prep_async(req); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); @@ -7779,17 +7779,22 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, /* last request of the link, flush it */ req = link->head; link->head = NULL; - } else if (req->flags & IO_REQ_LINK_FLAGS) { - link->head = req; - link->last = req; + if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) + goto fallback; + + } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | + REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { + if (req->flags & IO_REQ_LINK_FLAGS) { + link->head = req; + link->last = req; + } else { +fallback: + io_queue_sqe_fallback(req); + } return 0; } - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) - io_queue_sqe(req); - else - io_queue_sqe_fallback(req); - + io_queue_sqe(req); return 0; } From 17b147f6c1f21caef554004e19207810cbc08a8e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:32 +0100 Subject: [PATCH 041/111] io_uring: inline io_req_complete_fail_submit() Inline io_req_complete_fail_submit(), there is only one caller and the name doesn't tell us much. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fe5851af01dcd39fc84b71b8539c7cbe4658fb6d.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fe9df32be921..5bb46fe161c1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2223,17 +2223,6 @@ static void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); } -static void io_req_complete_fail_submit(struct io_kiocb *req) -{ - /* - * We don't submit, fail them all, for that replace hardlinks with - * normal links. Extra REQ_F_LINK is tolerated. - */ - req->flags &= ~REQ_F_HARDLINK; - req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->cqe.res); -} - /* * Don't initialise the fields below on every allocation, but do that in * advance and keep them valid across allocations. @@ -7564,8 +7553,14 @@ static inline void io_queue_sqe(struct io_kiocb *req) static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - if (req->flags & REQ_F_FAIL) { - io_req_complete_fail_submit(req); + if (unlikely(req->flags & REQ_F_FAIL)) { + /* + * We don't submit, fail them all, for that replace hardlinks + * with normal links. Extra REQ_F_LINK is tolerated. + */ + req->flags &= ~REQ_F_HARDLINK; + req->flags |= REQ_F_LINK; + io_req_complete_failed(req, req->cqe.res); } else if (unlikely(req->ctx->drain_active)) { io_drain_req(req); } else { From a6d97a8a77cb66cbbf6a5afc6708b1c2f5b00ca2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Apr 2022 22:08:33 +0100 Subject: [PATCH 042/111] io_uring: add data_race annotations We have several racy reads, mark them with data_race() to demonstrate this fact. 
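For readers new to the annotation: data_race() marks a lockless access as an intentional, tolerated race, so KCSAN stays quiet while the code documents that the observed value may be stale. A made-up fragment in the same spirit (my_queue and my_queue_maybe_empty are hypothetical, not from this patch):

/* Opportunistic, lockless emptiness check: a stale answer is fine here
 * because the caller re-checks under q->lock before acting on it. */
static bool my_queue_maybe_empty(struct my_queue *q)
{
	return data_race(q->first == NULL);
}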
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7e56e750d294c70b2a56938bd733386f19f0eb53.1650056133.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5bb46fe161c1..f29ab9055852 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2268,7 +2268,7 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) * locked cache, grab the lock and move them over to our submission * side cache. */ - if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) { + if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { io_flush_cached_locked_reqs(ctx, &ctx->submit_state); if (!io_req_cache_empty(ctx)) return true; @@ -2562,8 +2562,8 @@ static void tctx_task_work(struct callback_head *cb) handle_tw_list(node2, &ctx, &uring_locked); cond_resched(); - if (!tctx->task_list.first && - !tctx->prior_task_list.first && uring_locked) + if (data_race(!tctx->task_list.first) && + data_race(!tctx->prior_task_list.first) && uring_locked) io_submit_flush_completions(ctx); } From 93f052cb39e1f8c7a002e2ecbc7f45f354dcf0c7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Apr 2022 20:51:11 +0100 Subject: [PATCH 043/111] io_uring: use right helpers for file assign locking We have io_ring_submit_[un]lock() functions helping us with conditional ->uring_lock locking, use them in io_file_get_fixed() instead of hand coding. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c9c9ff1e046f6eb68da0a251962a697f8a2275fa.1650311386.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f29ab9055852..1b8597b82f6a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7388,8 +7388,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, struct file *file = NULL; unsigned long file_ptr; - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; @@ -7401,8 +7400,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); io_req_set_rsrc_node(req, ctx, 0); out: - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, issue_flags); return file; } From 772f5e002b9ef65c22956990be9ecb2df58cd9c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Apr 2022 20:51:12 +0100 Subject: [PATCH 044/111] io_uring: refactor io_assign_file error path All io_assign_file() callers do error handling themselves, req_set_fail() in the io_assign_file()'s fail path needlessly bloats the kernel and is not the best abstraction to have. Simplify the error path. 
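After this change the helper only reports whether a file could be assigned, so the error policy lives at each call site; a hypothetical caller shape (not verbatim kernel code):

	if (unlikely(!io_assign_file(req, issue_flags)))
		return -EBADF;	/* each caller picks its own failure handling */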
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/eff77fb1eac2b6a90cca5223813e6a396ffedec0.1650311386.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1b8597b82f6a..75be769ce652 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7134,12 +7134,8 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); else req->file = io_file_get_normal(req, req->cqe.fd); - if (req->file) - return true; - req_set_fail(req); - req->cqe.res = -EBADF; - return false; + return !!req->file; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) From c1bdf8ed1e84ccbcf5e35fac1b311094572e5047 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Apr 2022 20:51:13 +0100 Subject: [PATCH 045/111] io_uring: store rsrc node in req instead of refs req->fixed_rsrc_refs keeps a pointer to rsrc node pcpu references, but it's more natural just to store rsrc node directly. There were some reasons for that in the past but not anymore. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cee1c86ec9023f3e4f6ce8940d58c017ef8782f4.1650311386.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 75be769ce652..9ad9db3223d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -931,7 +931,7 @@ struct io_kiocb { struct io_ring_ctx *ctx; struct task_struct *task; - struct percpu_ref *fixed_rsrc_refs; + struct io_rsrc_node *rsrc_node; /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; @@ -1330,20 +1330,20 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct percpu_ref *ref = req->fixed_rsrc_refs; + struct io_rsrc_node *node = req->rsrc_node; - if (ref) { - if (ref == &ctx->rsrc_node->refs) + if (node) { + if (node == ctx->rsrc_node) ctx->rsrc_cached_refs++; else - percpu_ref_put(ref); + percpu_ref_put(&node->refs); } } static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) { - if (req->fixed_rsrc_refs) - percpu_ref_put(req->fixed_rsrc_refs); + if (req->rsrc_node) + percpu_ref_put(&req->rsrc_node->refs); } static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) @@ -1366,8 +1366,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned int issue_flags) { - if (!req->fixed_rsrc_refs) { - req->fixed_rsrc_refs = &ctx->rsrc_node->refs; + if (!req->rsrc_node) { + req->rsrc_node = ctx->rsrc_node; if (!(issue_flags & IO_URING_F_UNLOCKED)) { lockdep_assert_held(&ctx->uring_lock); @@ -1375,7 +1375,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, if (unlikely(ctx->rsrc_cached_refs < 0)) io_rsrc_refs_refill(ctx); } else { - percpu_ref_get(req->fixed_rsrc_refs); + percpu_ref_get(&req->rsrc_node->refs); } } } @@ -7623,7 +7623,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags = sqe_flags = READ_ONCE(sqe->flags); req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; - req->fixed_rsrc_refs = NULL; + req->rsrc_node = NULL; req->task = current; if (unlikely(opcode >= IORING_OP_LAST)) { From 25a15d3c668bf9cd23780fb0191b7f7f5054d879 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Apr 2022 20:51:14 +0100 
Subject: [PATCH 046/111] io_uring: add a helper for putting rsrc nodes Add a simple helper to encapsulating dropping rsrc nodes references, it's cleaner and will help if we'd change rsrc refcounting or play with percpu_ref_put() [no]inlining. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/63fdd953ac75898734cd50e8f69e95e6664f46fe.1650311386.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9ad9db3223d2..7841a196c868 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1326,6 +1326,11 @@ static inline void io_req_set_refcount(struct io_kiocb *req) #define IO_RSRC_REF_BATCH 100 +static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +{ + percpu_ref_put_many(&node->refs, nr); +} + static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) @@ -1336,21 +1341,21 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req, if (node == ctx->rsrc_node) ctx->rsrc_cached_refs++; else - percpu_ref_put(&node->refs); + io_rsrc_put_node(node, 1); } } static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) { if (req->rsrc_node) - percpu_ref_put(&req->rsrc_node->refs); + io_rsrc_put_node(req->rsrc_node, 1); } static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { if (ctx->rsrc_cached_refs) { - percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); + io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); ctx->rsrc_cached_refs = 0; } } From 7ac1edc4a9bbbfc43c4a1fa7c2884029df3363bf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Apr 2022 20:51:15 +0100 Subject: [PATCH 047/111] io_uring: kill ctx arg from io_req_put_rsrc The ctx argument of io_req_put_rsrc() is not used, kill it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bb51bf3ff02775b03e6ea21bc79c25d7870d1644.1650311386.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7841a196c868..fb7704d7c6eb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1345,7 +1345,7 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req, } } -static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) +static inline void io_req_put_rsrc(struct io_kiocb *req) { if (req->rsrc_node) io_rsrc_put_node(req->rsrc_node, 1); @@ -2174,7 +2174,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, req->link = NULL; } } - io_req_put_rsrc(req, ctx); + io_req_put_rsrc(req); /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid @@ -2337,7 +2337,7 @@ static __cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_req_put_rsrc(req, ctx); + io_req_put_rsrc(req); io_dismantle_req(req); io_put_task(req->task, 1); From 5e45690a1cb8cb591ccf4a517cdd6ad6cb369ac4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Apr 2022 16:15:27 -0600 Subject: [PATCH 048/111] io_uring: store SCM state in io_fixed_file->file_ptr A previous commit removed SCM accounting for non-unix sockets, as those are the only ones that can cause a fixed file reference. While that is true, it also means we're now dereferencing the file as part of the workqueue driven __io_sqe_files_unregister() after the process has exited. 
This isn't safe for SCM files, as unix gc may have already reaped them when the process exited. KASAN complains about this: [ 12.307040] Freed by task 0: [ 12.307592] kasan_save_stack+0x28/0x4c [ 12.308318] kasan_set_track+0x28/0x38 [ 12.309049] kasan_set_free_info+0x24/0x44 [ 12.309890] ____kasan_slab_free+0x108/0x11c [ 12.310739] __kasan_slab_free+0x14/0x1c [ 12.311482] slab_free_freelist_hook+0xd4/0x164 [ 12.312382] kmem_cache_free+0x100/0x1dc [ 12.313178] file_free_rcu+0x58/0x74 [ 12.313864] rcu_core+0x59c/0x7c0 [ 12.314675] rcu_core_si+0xc/0x14 [ 12.315496] _stext+0x30c/0x414 [ 12.316287] [ 12.316687] Last potentially related work creation: [ 12.317885] kasan_save_stack+0x28/0x4c [ 12.318845] __kasan_record_aux_stack+0x9c/0xb0 [ 12.319976] kasan_record_aux_stack_noalloc+0x10/0x18 [ 12.321268] call_rcu+0x50/0x35c [ 12.322082] __fput+0x2fc/0x324 [ 12.322873] ____fput+0xc/0x14 [ 12.323644] task_work_run+0xac/0x10c [ 12.324561] do_notify_resume+0x37c/0xe74 [ 12.325420] el0_svc+0x5c/0x68 [ 12.326050] el0t_64_sync_handler+0xb0/0x12c [ 12.326918] el0t_64_sync+0x164/0x168 [ 12.327657] [ 12.327976] Second to last potentially related work creation: [ 12.329134] kasan_save_stack+0x28/0x4c [ 12.329864] __kasan_record_aux_stack+0x9c/0xb0 [ 12.330735] kasan_record_aux_stack+0x10/0x18 [ 12.331576] task_work_add+0x34/0xf0 [ 12.332284] fput_many+0x11c/0x134 [ 12.332960] fput+0x10/0x94 [ 12.333524] __scm_destroy+0x80/0x84 [ 12.334213] unix_destruct_scm+0xc4/0x144 [ 12.334948] skb_release_head_state+0x5c/0x6c [ 12.335696] skb_release_all+0x14/0x38 [ 12.336339] __kfree_skb+0x14/0x28 [ 12.336928] kfree_skb_reason+0xf4/0x108 [ 12.337604] unix_gc+0x1e8/0x42c [ 12.338154] unix_release_sock+0x25c/0x2dc [ 12.338895] unix_release+0x58/0x78 [ 12.339531] __sock_release+0x68/0xec [ 12.340170] sock_close+0x14/0x20 [ 12.340729] __fput+0x18c/0x324 [ 12.341254] ____fput+0xc/0x14 [ 12.341763] task_work_run+0xac/0x10c [ 12.342367] do_notify_resume+0x37c/0xe74 [ 12.343086] el0_svc+0x5c/0x68 [ 12.343510] el0t_64_sync_handler+0xb0/0x12c [ 12.344086] el0t_64_sync+0x164/0x168 We have an extra bit we can use in file_ptr on 64-bit, use that to store whether this file is SCM'ed or not, avoiding the need to look at the file contents itself. This does mean that 32-bit will be stuck with SCM for all registered files, just like 64-bit did before the referenced commit. Fixes: 1f59bc0f18cf ("io_uring: don't scm-account for non af_unix sockets") Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fb7704d7c6eb..4c8932d3cdc2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -223,6 +223,23 @@ struct io_overflow_cqe { struct list_head list; }; +/* + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we + * can't safely always dereference the file when the task has exited and ring + * cleanup is done. If a file is tracked and part of SCM, then unix gc on + * process exit may reap it before __io_sqe_files_unregister() is run. 
+ */ +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#if defined(CONFIG_64BIT) +#define FFS_SCM 0x4UL +#else +#define IO_URING_SCM_ALL +#define FFS_SCM 0x0UL +#endif +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) + struct io_fixed_file { /* file * with additional FFS_* flags */ unsigned long file_ptr; @@ -1236,12 +1253,16 @@ EXPORT_SYMBOL(io_uring_get_socket); #if defined(CONFIG_UNIX) static inline bool io_file_need_scm(struct file *filp) { +#if defined(IO_URING_SCM_ALL) + return true; +#else return !!unix_get_socket(filp); +#endif } #else static inline bool io_file_need_scm(struct file *filp) { - return 0; + return false; } #endif @@ -1651,10 +1672,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) - static inline bool io_req_ffs_set(struct io_kiocb *req) { return req->flags & REQ_F_FIXED_FILE; @@ -3197,6 +3214,8 @@ static unsigned int io_file_get_flags(struct file *file) res |= FFS_ISREG; if (__io_file_supports_nowait(file, mode)) res |= FFS_NOWAIT; + if (io_file_need_scm(file)) + res |= FFS_SCM; return res; } @@ -8478,14 +8497,17 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { int i; +#if !defined(IO_URING_SCM_ALL) for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(ctx, i); - if (!file || io_file_need_scm(file)) + if (!file) + continue; + if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) continue; - io_fixed_file_slot(&ctx->file_table, i)->file_ptr = 0; fput(file); } +#endif #if defined(CONFIG_UNIX) if (ctx->ring_sock) { From 3645c2000a7694022c39c545676c12fb9190855a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 20 Apr 2022 13:40:54 +0100 Subject: [PATCH 049/111] io_uring: move timeout locking in io_timeout_cancel() Move ->timeout_lock grabbing inside of io_timeout_cancel(), so we can do io_req_task_queue_fail() outside of the lock. It's much nicer than relying on triple nested locking. 
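The resulting shape, sketched for clarity (simplified from the diff below): the spinlock now covers only the lookup, and the request is failed with no timeout_lock held.

	spin_lock_irq(&ctx->timeout_lock);
	req = io_timeout_extract(ctx, user_data);	/* may return ERR_PTR() */
	spin_unlock_irq(&ctx->timeout_lock);

	if (IS_ERR(req))
		return PTR_ERR(req);
	io_req_task_queue_fail(req, -ECANCELED);	/* runs outside timeout_lock */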
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cde758c2897930d31e205ed8f476d4ec879a8849.1650458197.git.asml.silence@gmail.com [axboe: drop now wrong timeout_lock annotation] Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c8932d3cdc2..bf06e306d15b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6509,9 +6509,12 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) __must_hold(&ctx->completion_lock) - __must_hold(&ctx->timeout_lock) { - struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_kiocb *req; + + spin_lock_irq(&ctx->timeout_lock); + req = io_timeout_extract(ctx, user_data); + spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -6630,9 +6633,7 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, tr->addr); - spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); @@ -6818,10 +6819,7 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) ret = io_poll_cancel(ctx, sqe_addr, false); if (ret != -ENOENT) goto out; - - spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock_irq(&ctx->timeout_lock); out: spin_unlock(&ctx->completion_lock); return ret; From 81ec803b4ecda446aa965e1c0d8eb4241847ed9c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 20 Apr 2022 13:40:55 +0100 Subject: [PATCH 050/111] io_uring: refactor io_disarm_next() locking Split timeout handling into removal + failing, so we can reduce spinlocking time and remove another instance of triple nested locking. 
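Conceptually the linked-timeout disarm becomes two phases, restated here with comments (the exact code is in the hunk below): removal happens under timeout_lock, failing the timeout happens after the lock is dropped.

	spin_lock_irq(&ctx->timeout_lock);
	link = io_disarm_linked_timeout(req);	/* phase 1: detach only */
	spin_unlock_irq(&ctx->timeout_lock);
	if (link) {
		posted = true;
		io_req_tw_post_queue(link, -ECANCELED, 0);	/* phase 2: fail it */
	}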
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0f00d115f9d4c5749028f19623708ad3695512d6.1650458197.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bf06e306d15b..d3bf21470873 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2372,7 +2372,7 @@ static inline void io_remove_next_linked(struct io_kiocb *req) nxt->link = NULL; } -static bool io_kill_linked_timeout(struct io_kiocb *req) +static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -2385,11 +2385,10 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&link->timeout.list); - io_req_tw_post_queue(link, -ECANCELED, 0); - return true; + return link; } } - return false; + return NULL; } static void io_fail_links(struct io_kiocb *req) @@ -2423,11 +2422,11 @@ static void io_fail_links(struct io_kiocb *req) static bool io_disarm_next(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { + struct io_kiocb *link = NULL; bool posted = false; if (req->flags & REQ_F_ARM_LTIMEOUT) { - struct io_kiocb *link = req->link; - + link = req->link; req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); @@ -2438,8 +2437,12 @@ static bool io_disarm_next(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); - posted = io_kill_linked_timeout(req); + link = io_disarm_linked_timeout(req); spin_unlock_irq(&ctx->timeout_lock); + if (link) { + posted = true; + io_req_tw_post_queue(link, -ECANCELED, 0); + } } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { From 98d3dcc8be97a11f7f3b674acb25f81673bb43c3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2022 10:43:58 -0600 Subject: [PATCH 051/111] io_uring: remove dead 'poll_only' argument to io_poll_cancel() It's only called from one location, and it always passes in 'false'. Kill the argument, and just pass in 'false' to io_poll_find(). 
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220418164402.75259-2-axboe@kernel.dk --- fs/io_uring.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d3bf21470873..be625f5440e8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6326,11 +6326,10 @@ static bool io_poll_disarm(struct io_kiocb *req) return true; } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, - bool poll_only) +static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); + struct io_kiocb *req = io_poll_find(ctx, sqe_addr, false); if (!req) return -ENOENT; @@ -6819,7 +6818,7 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) return 0; spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, sqe_addr, false); + ret = io_poll_cancel(ctx, sqe_addr); if (ret != -ENOENT) goto out; ret = io_timeout_cancel(ctx, sqe_addr); From b21432b4d580d064b6dee6d66c9a6b0071b499ce Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2022 10:43:59 -0600 Subject: [PATCH 052/111] io_uring: pass in struct io_cancel_data consistently In preparation for being able to not only key cancel off the user_data, pass in the io_cancel_data struct for the various functions that deal with request cancelation. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220418164402.75259-3-axboe@kernel.dk --- fs/io_uring.c | 76 +++++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index be625f5440e8..b72323c3fe9b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -988,6 +988,11 @@ struct io_defer_entry { u32 seq; }; +struct io_cancel_data { + struct io_ring_ctx *ctx; + u64 data; +}; + struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -6298,16 +6303,16 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, return found; } -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, - bool poll_only) +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, + struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { struct hlist_head *list; struct io_kiocb *req; - list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; hlist_for_each_entry(req, list, hash_node) { - if (sqe_addr != req->cqe.user_data) + if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; @@ -6326,10 +6331,10 @@ static bool io_poll_disarm(struct io_kiocb *req) return true; } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) +static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req = io_poll_find(ctx, sqe_addr, false); + struct io_kiocb *req = io_poll_find(ctx, false, cd); if (!req) return -ENOENT; @@ -6421,13 +6426,14 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) { + struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; int ret2, ret = 0; bool locked; spin_lock(&ctx->completion_lock); - preq = io_poll_find(ctx, req->poll_update.old_user_data, true); + preq = io_poll_find(ctx, true, 
&cd); if (!preq || !io_poll_disarm(preq)) { spin_unlock(&ctx->completion_lock); ret = preq ? -EALREADY : -ENOENT; @@ -6487,7 +6493,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, - __u64 user_data) + struct io_cancel_data *cd) __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; @@ -6495,7 +6501,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - found = user_data == req->cqe.user_data; + found = cd->data == req->cqe.user_data; if (found) break; } @@ -6509,13 +6515,13 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, return req; } -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { struct io_kiocb *req; spin_lock_irq(&ctx->timeout_lock); - req = io_timeout_extract(ctx, user_data); + req = io_timeout_extract(ctx, cd); spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) @@ -6569,7 +6575,8 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { - struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_cancel_data cd = { .data = user_data, }; + struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_timeout_data *data; if (IS_ERR(req)) @@ -6634,8 +6641,10 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) int ret; if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + struct io_cancel_data cd = { .data = tr->addr, }; + spin_lock(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, tr->addr); + ret = io_timeout_cancel(ctx, &cd); spin_unlock(&ctx->completion_lock); } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); @@ -6763,30 +6772,24 @@ add: return 0; } -struct io_cancel_data { - struct io_ring_ctx *ctx; - u64 user_data; -}; - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; - return req->ctx == cd->ctx && req->cqe.user_data == cd->user_data; + return req->ctx == cd->ctx && req->cqe.user_data == cd->data; } -static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, - struct io_ring_ctx *ctx) +static int io_async_cancel_one(struct io_uring_task *tctx, + struct io_cancel_data *cd) { - struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; enum io_wq_cancel cancel_ret; int ret = 0; if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -6802,14 +6805,14 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, return ret; } -static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) +static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) { struct io_ring_ctx *ctx = req->ctx; int ret; WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); + ret = io_async_cancel_one(req->task->io_uring, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. 
@@ -6818,10 +6821,10 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) return 0; spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, sqe_addr); + ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) goto out; - ret = io_timeout_cancel(ctx, sqe_addr); + ret = io_timeout_cancel(ctx, cd); out: spin_unlock(&ctx->completion_lock); return ret; @@ -6845,11 +6848,14 @@ static int io_async_cancel_prep(struct io_kiocb *req, static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - u64 sqe_addr = req->cancel.addr; + struct io_cancel_data cd = { + .ctx = ctx, + .data = req->cancel.addr, + }; struct io_tctx_node *node; int ret; - ret = io_try_cancel_userdata(req, sqe_addr); + ret = io_try_cancel(req, &cd); if (ret != -ENOENT) goto done; @@ -6859,7 +6865,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; - ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); + ret = io_async_cancel_one(tctx, &cd); if (ret != -ENOENT) break; } @@ -7455,8 +7461,14 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) int ret = -ENOENT; if (prev) { - if (!(req->task->flags & PF_EXITING)) - ret = io_try_cancel_userdata(req, prev->cqe.user_data); + if (!(req->task->flags & PF_EXITING)) { + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = prev->cqe.user_data, + }; + + ret = io_try_cancel(req, &cd); + } io_req_complete_post(req, ret ?: -ETIME, 0); io_put_req(prev); } else { From 8e29da69feade64ec7fe9e1a2824b967c5183a21 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2022 10:44:00 -0600 Subject: [PATCH 053/111] io_uring: add support for IORING_ASYNC_CANCEL_ALL The current cancelation will lookup and cancel the first request it finds based on the key passed in. Add a flag that allows to cancel any request that matches they key. It completes with the number of requests found and canceled, or res < 0 if an error occured. 
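A minimal userspace sketch of driving the new flag (illustrative only: it assumes headers that already carry the IORING_ASYNC_CANCEL_ALL definition added below, fills the cancel SQE fields by hand rather than relying on a particular liburing prep helper, and the CANCEL_TAG value is arbitrary):

#include <errno.h>
#include <stdbool.h>
#include <string.h>
#include <liburing.h>

#define CANCEL_TAG 0xc0ffeeULL

/* Cancel every pending request whose user_data equals 'key'. The cancel
 * op's CQE res is the number of requests found and canceled, or a
 * negative errno (e.g. -ENOENT if nothing matched). */
static int cancel_all_by_key(struct io_uring *ring, __u64 key)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	bool done = false;
	int ret, res;

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->addr = key;			/* match on user_data */
	sqe->cancel_flags = IORING_ASYNC_CANCEL_ALL;
	sqe->user_data = CANCEL_TAG;

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;

	/* Completions for the canceled requests (res == -ECANCELED) may
	 * arrive before the cancel op's own CQE; skip until our tag shows up. */
	do {
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		res = cqe->res;
		done = (cqe->user_data == CANCEL_TAG);
		io_uring_cqe_seen(ring, cqe);
	} while (!done);
	return res;
}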
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220418164402.75259-4-axboe@kernel.dk --- fs/io-wq.h | 1 + fs/io_uring.c | 91 +++++++++++++++++++++++++++-------- include/uapi/linux/io_uring.h | 7 +++ 3 files changed, 78 insertions(+), 21 deletions(-) diff --git a/fs/io-wq.h b/fs/io-wq.h index dbecd27656c7..ba6eee76d028 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) struct io_wq_work { struct io_wq_work_node list; unsigned flags; + int cancel_seq; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index b72323c3fe9b..b43cdf1a4555 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -403,6 +403,7 @@ struct io_ring_ctx { */ struct io_rsrc_node *rsrc_node; int rsrc_cached_refs; + atomic_t cancel_seq; struct io_file_table file_table; unsigned nr_user_files; unsigned nr_user_bufs; @@ -585,6 +586,7 @@ struct io_sync { struct io_cancel { struct file *file; u64 addr; + u32 flags; }; struct io_timeout { @@ -991,6 +993,8 @@ struct io_defer_entry { struct io_cancel_data { struct io_ring_ctx *ctx; u64 data; + u32 flags; + int seq; }; struct io_op_def { @@ -1726,6 +1730,7 @@ static void io_prep_async_work(struct io_kiocb *req) req->work.list.next = NULL; req->work.flags = 0; + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -6159,6 +6164,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, int v; INIT_HLIST_NODE(&req->hash_node); + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; @@ -6316,6 +6322,11 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } return req; } return NULL; @@ -6501,9 +6512,15 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - found = cd->data == req->cqe.user_data; - if (found) - break; + if (cd->data != req->cqe.user_data) + continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } + found = true; + break; } if (!found) return ERR_PTR(-ENOENT); @@ -6777,7 +6794,16 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; - return req->ctx == cd->ctx && req->cqe.user_data == cd->data; + if (req->ctx != cd->ctx) + return false; + if (req->cqe.user_data != cd->data) + return false; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + return false; + req->work.cancel_seq = cd->seq; + } + return true; } static int io_async_cancel_one(struct io_uring_task *tctx, @@ -6789,7 +6815,8 @@ static int io_async_cancel_one(struct io_uring_task *tctx, if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, false); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, + cd->flags & IORING_ASYNC_CANCEL_ALL); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -6837,27 +6864,33 @@ static int io_async_cancel_prep(struct io_kiocb *req, return -EINVAL; if 
(unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || - sqe->splice_fd_in) + if (sqe->ioprio || sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); + req->cancel.flags = READ_ONCE(sqe->cancel_flags); + if (req->cancel.flags & ~IORING_ASYNC_CANCEL_ALL) + return -EINVAL; + return 0; } -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, + unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; - struct io_cancel_data cd = { - .ctx = ctx, - .data = req->cancel.addr, - }; + bool cancel_all = cd->flags & IORING_ASYNC_CANCEL_ALL; + struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; - int ret; + int ret, nr = 0; - ret = io_try_cancel(req, &cd); - if (ret != -ENOENT) - goto done; + do { + ret = io_try_cancel(req, cd); + if (ret == -ENOENT) + break; + if (!cancel_all) + return ret; + nr++; + } while (1); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, issue_flags); @@ -6865,12 +6898,28 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; - ret = io_async_cancel_one(tctx, &cd); - if (ret != -ENOENT) - break; + ret = io_async_cancel_one(tctx, cd); + if (ret != -ENOENT) { + if (!cancel_all) + break; + nr++; + } } io_ring_submit_unlock(ctx, issue_flags); -done: + return cancel_all ? nr : ret; +} + +static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = req->cancel.addr, + .flags = req->cancel.flags, + .seq = atomic_inc_return(&req->ctx->cancel_seq), + }; + int ret; + + ret = __io_async_cancel(&cd, req, issue_flags); if (ret < 0) req_set_fail(req); io_req_complete_post(req, ret, 0); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1845cf7c80ba..476e58a2837f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -187,6 +187,13 @@ enum { #define IORING_POLL_UPDATE_EVENTS (1U << 1) #define IORING_POLL_UPDATE_USER_DATA (1U << 2) +/* + * ASYNC_CANCEL flags. + * + * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key + */ +#define IORING_ASYNC_CANCEL_ALL (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ From 4bf94615b8886305199ed5755cb72fea88258d15 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2022 10:44:01 -0600 Subject: [PATCH 054/111] io_uring: allow IORING_OP_ASYNC_CANCEL with 'fd' key Currently sqe->addr must contain the user_data of the request being canceled. Introduce the IORING_ASYNC_CANCEL_FD flag, which tells the kernel that we're keying off the file fd instead for cancelation. This allows canceling any request that a) uses a file, and b) was assigned the file based on the value being passed in. 
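With the fd key, the only difference from the sketch after the ASYNC_CANCEL_ALL patch is how the SQE is filled; a hedged fragment (field names taken from the prep code above; 'sqe', 'fd' and CANCEL_TAG are assumed to exist):

	/* Cancel all pending requests issued against 'fd'; in this mode
	 * sqe->addr is ignored and the kernel keys off sqe->fd instead. */
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->fd = fd;
	sqe->cancel_flags = IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_ALL;
	sqe->user_data = CANCEL_TAG;

Dropping IORING_ASYNC_CANCEL_ALL from the flags cancels only the first request found for that fd, mirroring the single-shot user_data behaviour.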
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220418164402.75259-5-axboe@kernel.dk --- fs/io_uring.c | 67 +++++++++++++++++++++++++++++++---- include/uapi/linux/io_uring.h | 3 ++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b43cdf1a4555..cf0d5437b77d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -587,6 +587,7 @@ struct io_cancel { struct file *file; u64 addr; u32 flags; + s32 fd; }; struct io_timeout { @@ -992,7 +993,10 @@ struct io_defer_entry { struct io_cancel_data { struct io_ring_ctx *ctx; - u64 data; + union { + u64 data; + struct file *file; + }; u32 flags; int seq; }; @@ -6332,6 +6336,29 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, return NULL; } +static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + int i; + + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry(req, list, hash_node) { + if (req->file != cd->file) + continue; + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + return req; + } + } + return NULL; +} + static bool io_poll_disarm(struct io_kiocb *req) __must_hold(&ctx->completion_lock) { @@ -6345,8 +6372,12 @@ static bool io_poll_disarm(struct io_kiocb *req) static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req = io_poll_find(ctx, false, cd); + struct io_kiocb *req; + if (cd->flags & IORING_ASYNC_CANCEL_FD) + req = io_poll_file_find(ctx, cd); + else + req = io_poll_find(ctx, false, cd); if (!req) return -ENOENT; io_poll_cancel_req(req); @@ -6796,8 +6827,13 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) if (req->ctx != cd->ctx) return false; - if (req->cqe.user_data != cd->data) - return false; + if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (req->file != cd->file) + return false; + } else { + if (req->cqe.user_data != cd->data) + return false; + } if (cd->flags & IORING_ASYNC_CANCEL_ALL) { if (cd->seq == req->work.cancel_seq) return false; @@ -6851,7 +6887,8 @@ static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) goto out; - ret = io_timeout_cancel(ctx, cd); + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) + ret = io_timeout_cancel(ctx, cd); out: spin_unlock(&ctx->completion_lock); return ret; @@ -6862,15 +6899,17 @@ static int io_async_cancel_prep(struct io_kiocb *req, { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); req->cancel.flags = READ_ONCE(sqe->cancel_flags); - if (req->cancel.flags & ~IORING_ASYNC_CANCEL_ALL) + if (req->cancel.flags & ~(IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_FD)) return -EINVAL; + if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) + req->cancel.fd = READ_ONCE(sqe->fd); return 0; } @@ -6919,7 +6958,21 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) }; int ret; + if (cd.flags & IORING_ASYNC_CANCEL_FD) { + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->cancel.fd, + issue_flags); + else + req->file = 
io_file_get_normal(req, req->cancel.fd); + if (!req->file) { + ret = -EBADF; + goto done; + } + cd.file = req->file; + } + ret = __io_async_cancel(&cd, req, issue_flags); +done: if (ret < 0) req_set_fail(req); io_req_complete_post(req, ret, 0); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 476e58a2837f..cc7fe82a1798 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -191,8 +191,11 @@ enum { * ASYNC_CANCEL flags. * * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key + * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the + * request 'user_data' */ #define IORING_ASYNC_CANCEL_ALL (1U << 0) +#define IORING_ASYNC_CANCEL_FD (1U << 1) /* * IO completion data structure (Completion Queue Entry) From 970f256edb8c1259c8ed48d52b38215135396126 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2022 10:44:02 -0600 Subject: [PATCH 055/111] io_uring: add support for IORING_ASYNC_CANCEL_ANY Rather than match on a specific key, be it user_data or file, allow canceling any request that we can lookup. Works like IORING_ASYNC_CANCEL_ALL in that it cancels multiple requests, but it doesn't key off user_data or the file. Can't be set with IORING_ASYNC_CANCEL_FD, as that's a key selector. Only one may be used at the time. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220418164402.75259-6-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io_uring.c | 39 ++++++++++++++++++++++------------- include/uapi/linux/io_uring.h | 2 ++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cf0d5437b77d..03134ec070a3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6348,7 +6348,8 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, list = &ctx->cancel_hash[i]; hlist_for_each_entry(req, list, hash_node) { - if (req->file != cd->file) + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + req->file != cd->file) continue; if (cd->seq == req->work.cancel_seq) continue; @@ -6374,7 +6375,7 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; - if (cd->flags & IORING_ASYNC_CANCEL_FD) + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd); else req = io_poll_find(ctx, false, cd); @@ -6543,9 +6544,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - if (cd->data != req->cqe.user_data) + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + cd->data != req->cqe.user_data) continue; - if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { if (cd->seq == req->work.cancel_seq) continue; req->work.cancel_seq = cd->seq; @@ -6827,14 +6829,16 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) if (req->ctx != cd->ctx) return false; - if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (cd->flags & IORING_ASYNC_CANCEL_ANY) { + ; + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { if (req->file != cd->file) return false; } else { if (req->cqe.user_data != cd->data) return false; } - if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { if (cd->seq == req->work.cancel_seq) return false; req->work.cancel_seq = cd->seq; @@ -6847,12 +6851,13 @@ static int io_async_cancel_one(struct io_uring_task *tctx, { enum io_wq_cancel cancel_ret; int ret = 0; + bool 
all; if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, - cd->flags & IORING_ASYNC_CANCEL_ALL); + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -6894,6 +6899,9 @@ out: return ret; } +#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ + IORING_ASYNC_CANCEL_ANY) + static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -6906,10 +6914,13 @@ static int io_async_cancel_prep(struct io_kiocb *req, req->cancel.addr = READ_ONCE(sqe->addr); req->cancel.flags = READ_ONCE(sqe->cancel_flags); - if (req->cancel.flags & ~(IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_FD)) + if (req->cancel.flags & ~CANCEL_FLAGS) return -EINVAL; - if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) + if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { + if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; req->cancel.fd = READ_ONCE(sqe->fd); + } return 0; } @@ -6917,7 +6928,7 @@ static int io_async_cancel_prep(struct io_kiocb *req, static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, unsigned int issue_flags) { - bool cancel_all = cd->flags & IORING_ASYNC_CANCEL_ALL; + bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; int ret, nr = 0; @@ -6926,7 +6937,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, ret = io_try_cancel(req, cd); if (ret == -ENOENT) break; - if (!cancel_all) + if (!all) return ret; nr++; } while (1); @@ -6939,13 +6950,13 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, ret = io_async_cancel_one(tctx, cd); if (ret != -ENOENT) { - if (!cancel_all) + if (!all) break; nr++; } } io_ring_submit_unlock(ctx, issue_flags); - return cancel_all ? nr : ret; + return all ? nr : ret; } static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cc7fe82a1798..980d82eb196e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -193,9 +193,11 @@ enum { * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the * request 'user_data' + * IORING_ASYNC_CANCEL_ANY Match any request */ #define IORING_ASYNC_CANCEL_ALL (1U << 0) #define IORING_ASYNC_CANCEL_FD (1U << 1) +#define IORING_ASYNC_CANCEL_ANY (1U << 2) /* * IO completion data structure (Completion Queue Entry) From 4c3c09439c08b03d9503df0ca4c7619c5842892e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Apr 2022 19:21:36 -0600 Subject: [PATCH 056/111] io_uring: support MSG_WAITALL for IORING_OP_SEND(MSG) Like commit 7ba89d2af17a for recv/recvmsg, support MSG_WAITALL for the send side. If this flag is set and we do a short send, retry for a stream of seqpacket socket. 
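From userspace this is simply MSG_WAITALL in the send SQE's msg_flags; a small liburing-flavoured fragment (hedged: 'ring', 'sockfd', 'buf' and 'len' are assumed to exist, and io_uring_prep_send() should be checked against the liburing version in use):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	/* With MSG_WAITALL the kernel retries short sends on a stream or
	 * seqpacket socket; the CQE res is the total number of bytes sent. */
	io_uring_prep_send(sqe, sockfd, buf, len, MSG_WAITALL);
	sqe->user_data = 42;
	io_uring_submit(&ring);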
Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 03134ec070a3..e18d8891b98b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5235,6 +5235,13 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) } #if defined(CONFIG_NET) +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg) { @@ -5293,12 +5300,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif + sr->done_io = 0; return 0; } static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; + struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned flags; int min_ret = 0; @@ -5330,12 +5339,21 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } req_set_fail(req); } /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5376,8 +5394,19 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } req_set_fail(req); } + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5509,13 +5538,6 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static bool io_net_retry(struct socket *sock, int flags) -{ - if (!(flags & MSG_WAITALL)) - return false; - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; -} - static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; From 10c873334febaeea9aa0c25c10b5ac0951b77a5f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Apr 2022 19:27:41 -0600 Subject: [PATCH 057/111] io_uring: allow re-poll if we made progress We currently check REQ_F_POLLED before arming async poll for a notification to retry. If it's set, then we don't allow poll and will punt to io-wq instead. This is done to prevent a situation where a buggy driver will repeatedly return that there's space/data available yet we get -EAGAIN. However, if we already transferred data, then it should be safe to rely on poll again. Gate the check on whether or not REQ_F_PARTIAL_IO is also set. 
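Spelled out, the new gate reads like a small truth table (an annotated restatement of the hunk below, not new logic):

	/*
	 * REQ_F_POLLED clear                       -> first pass, poll may be armed
	 * REQ_F_POLLED set, REQ_F_PARTIAL_IO clear -> polled before but no data ever
	 *                                             moved: give up, a buggy driver
	 *                                             could loop on -EAGAIN forever
	 * REQ_F_POLLED set, REQ_F_PARTIAL_IO set   -> progress was made, re-polling
	 *                                             is considered safe
	 */
	if ((req->flags & (REQ_F_POLLED | REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
		return IO_APOLL_ABORTED;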
Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e18d8891b98b..97ed7f419dae 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6266,7 +6266,9 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; - if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) + if (!file_can_poll(req->file)) + return IO_APOLL_ABORTED; + if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) return IO_APOLL_ABORTED; if (def->pollin) { @@ -6281,8 +6283,10 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; - if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { + if (req->flags & REQ_F_POLLED) { + apoll = req->apoll; + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, poll.wait.entry); list_del_init(&apoll->poll.wait.entry); From 47894438e916ca3fe44db33c2d4a1670fd296d6f Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 21 Apr 2022 02:13:40 -0700 Subject: [PATCH 058/111] io_uring: add trace support for CQE overflow Add trace function for overflowing CQ ring. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220421091345.2115755-2-dylany@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 42 ++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index cddf5b6fbeb4..42534ec2ab9d 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -530,7 +530,7 @@ TRACE_EVENT(io_uring_req_failed, ), TP_printk("ring %p, req %p, user_data 0x%llx, " - "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " + "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "len=%u, rw_flags=0x%x, buf_index=%d, " "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", __entry->ctx, __entry->req, __entry->user_data, @@ -543,6 +543,46 @@ TRACE_EVENT(io_uring_req_failed, (unsigned long long) __entry->pad2, __entry->error) ); + +/* + * io_uring_cqe_overflow - a CQE overflowed + * + * @ctx: pointer to a ring context structure + * @user_data: user data associated with the request + * @res: CQE result + * @cflags: CQE flags + * @ocqe: pointer to the overflow cqe (if available) + * + */ +TRACE_EVENT(io_uring_cqe_overflow, + + TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags, + void *ocqe), + + TP_ARGS(ctx, user_data, res, cflags, ocqe), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( unsigned long long, user_data ) + __field( s32, res ) + __field( u32, cflags ) + __field( void *, ocqe ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->user_data = user_data; + __entry->res = res; + __entry->cflags = cflags; + __entry->ocqe = ocqe; + ), + + TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, " + "overflow_cqe %p", + __entry->ctx, __entry->user_data, __entry->res, + __entry->cflags, __entry->ocqe) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ From 08dcd0288f6ecb97bc94260856e6b44cca4149e2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 21 Apr 2022 02:13:41 -0700 Subject: [PATCH 059/111] io_uring: trace cqe overflows Trace cqe overflows in io_uring. 
Print ocqe before the check, so if it is NULL it indicates that it has been dropped. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220421091345.2115755-3-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 97ed7f419dae..e92b1b9d4b5f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2107,6 +2107,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, struct io_overflow_cqe *ocqe; ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { /* * If we're in ring overflow flush mode, or in task cancel mode, From 3e813c9026720c1291ef3c91caa2e26f88e28367 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 21 Apr 2022 02:13:42 -0700 Subject: [PATCH 060/111] io_uring: rework io_uring_enter to simplify return value io_uring_enter returns the count submitted preferrably over an error code. In some code paths this check is not required, so reorganise the code so that the check is only done as needed. This is also a prep for returning error codes only in waiting scenarios. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220421091345.2115755-4-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e92b1b9d4b5f..3b13eb72736b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10872,7 +10872,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, size_t, argsz) { struct io_ring_ctx *ctx; - int submitted = 0; struct fd f; long ret; @@ -10935,15 +10934,15 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ret) goto out; } - submitted = to_submit; + ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); - submitted = io_submit_sqes(ctx, to_submit); - if (submitted != to_submit) { + ret = io_submit_sqes(ctx, to_submit); + if (ret != to_submit) { mutex_unlock(&ctx->uring_lock); goto out; } @@ -10952,6 +10951,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { + int ret2; if (ctx->syscall_iopoll) { /* * We disallow the app entering submit/complete with @@ -10961,22 +10961,29 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ mutex_lock(&ctx->uring_lock); iopoll_locked: - ret = io_validate_ext_arg(flags, argp, argsz); - if (likely(!ret)) { - min_complete = min(min_complete, ctx->cq_entries); - ret = io_iopoll_check(ctx, min_complete); + ret2 = io_validate_ext_arg(flags, argp, argsz); + if (likely(!ret2)) { + min_complete = min(min_complete, + ctx->cq_entries); + ret2 = io_iopoll_check(ctx, min_complete); } mutex_unlock(&ctx->uring_lock); } else { const sigset_t __user *sig; struct __kernel_timespec __user *ts; - ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); - if (unlikely(ret)) - goto out; - min_complete = min(min_complete, ctx->cq_entries); - ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); + ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + if (likely(!ret2)) { + min_complete = min(min_complete, + ctx->cq_entries); + ret2 = io_cqring_wait(ctx, min_complete, sig, + argsz, ts); + } } + + if (!ret) + ret = ret2; + } out: @@ -10984,7 +10991,7 @@ out: out_fput: if (!(flags & 
IORING_ENTER_REGISTERED_RING)) fdput(f); - return submitted ? submitted : ret; + return ret; } #ifdef CONFIG_PROC_FS From 10988a0a67ba918c5b3ac7de47efa7470264a291 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 21 Apr 2022 02:13:43 -0700 Subject: [PATCH 061/111] io_uring: use constants for cq_overflow bitfield Prepare to use this bitfield for more flags by using constants instead of magic value 0 Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220421091345.2115755-5-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3b13eb72736b..04bcf08dfe2c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -431,7 +431,7 @@ struct io_ring_ctx { struct wait_queue_head sqo_sq_wait; struct list_head sqd_list; - unsigned long check_cq_overflow; + unsigned long check_cq; struct { /* @@ -903,6 +903,10 @@ struct io_cqe { }; }; +enum { + IO_CHECK_CQ_OVERFLOW_BIT, +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -2024,7 +2028,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { - clear_bit(0, &ctx->check_cq_overflow); + clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); WRITE_ONCE(ctx->rings->sq_flags, ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); } @@ -2040,7 +2044,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) { bool ret = true; - if (test_bit(0, &ctx->check_cq_overflow)) { + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); @@ -2118,7 +2122,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return false; } if (list_empty(&ctx->cq_overflow_list)) { - set_bit(0, &ctx->check_cq_overflow); + set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); WRITE_ONCE(ctx->rings->sq_flags, ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); @@ -2960,7 +2964,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (test_bit(0, &ctx->check_cq_overflow)) + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) return 0; @@ -8300,7 +8304,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ - if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) + if (io_should_wake(iowq) || + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } @@ -8328,7 +8333,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (ret || io_should_wake(iowq)) return ret; /* let the caller flush overflows, retry */ - if (test_bit(0, &ctx->check_cq_overflow)) + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) return 1; if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) @@ -10123,7 +10128,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * Users may get EPOLLIN meanwhile seeing nothing in cqring, this * pushs them to do the flush. 
*/ - if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) + if (io_cqring_events(ctx) || + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) mask |= EPOLLIN | EPOLLRDNORM; return mask; From 155bc9505dbd6613585abbf0be6466f1c21536c4 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 21 Apr 2022 02:13:44 -0700 Subject: [PATCH 062/111] io_uring: return an error when cqe is dropped Right now io_uring will not actively inform userspace if a CQE is dropped. This is extremely rare, requiring both a CQ ring overflow and a GFP_ATOMIC kmalloc failure. However, the consequences can be severe: an application could, for example, end up in an undefined state, possibly waiting for a CQE that never arrives. Return an error code (EBADR) in these cases. Since this is expected to be incredibly rare, avoid affecting the hot code paths as much as possible: the error is only returned lazily, and only when no other CQEs are available. Once the error is returned, reset the error condition, assuming the user is either ok with it or will clean up appropriately. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220421091345.2115755-6-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 04bcf08dfe2c..e57d47a23682 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -905,6 +905,7 @@ struct io_cqe { enum { IO_CHECK_CQ_OVERFLOW_BIT, + IO_CHECK_CQ_DROPPED_BIT, }; /* @@ -2119,6 +2120,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * on the floor. */ io_account_cq_overflow(ctx); + set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } if (list_empty(&ctx->cq_overflow_list)) { @@ -2958,16 +2960,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; int ret = 0; + unsigned long check_cq; /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) + check_cq = READ_ONCE(ctx->check_cq); + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) return 0; + + /* + * Similarly do not spin if we have not informed the user of any + * dropped CQE. + */ + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) + return -EBADR; + do { /* * If a submit got punted to a workqueue, we can have the @@ -8327,15 +8339,18 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, ktime_t timeout) { int ret; + unsigned long check_cq; /* make sure we run task_work before checking for signals */ ret = io_run_task_work_sig(); if (ret || io_should_wake(iowq)) return ret; + check_cq = READ_ONCE(ctx->check_cq); /* let the caller flush overflows, retry */ - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) return 1; - + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) + return -EBADR; if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) return -ETIME; return 1; @@ -10987,9 +11002,18 @@ iopoll_locked: } } - if (!ret) + if (!ret) { ret = ret2; + /* + * EBADR indicates that one or more CQE were dropped. + * Once the user has been informed we can clear the bit + * as they are obviously ok with those drops.
+ */ + if (unlikely(ret2 == -EBADR)) + clear_bit(IO_CHECK_CQ_DROPPED_BIT, + &ctx->check_cq); + } } out: From 69cc1b6fa565993b62210f314614be166d902a54 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Apr 2022 16:43:45 -0600 Subject: [PATCH 063/111] io_uring: fix compile warning for 32-bit builds If IO_URING_SCM_ALL isn't set, as it would not be on 32-bit builds, then we trigger a warning: fs/io_uring.c: In function '__io_sqe_files_unregister': fs/io_uring.c:8992:13: warning: unused variable 'i' [-Wunused-variable] 8992 | int i; | ^ Move the ifdef up to include the 'i' variable declaration. Reported-by: Stephen Rothwell Fixes: 5e45690a1cb8 ("io_uring: store SCM state in io_fixed_file->file_ptr") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e57d47a23682..b40b640924db 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8667,9 +8667,9 @@ static void io_free_file_tables(struct io_file_table *table) static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { +#if !defined(IO_URING_SCM_ALL) int i; -#if !defined(IO_URING_SCM_ALL) for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(ctx, i); From e788be95a57a9bebe446878ce9bf2750f6fe4974 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Apr 2022 17:25:16 -0600 Subject: [PATCH 064/111] task_work: allow TWA_SIGNAL without a rescheduling IPI Some use cases don't always need an IPI when sending a TWA_SIGNAL notification. Add TWA_SIGNAL_NO_IPI, which is just like TWA_SIGNAL, except it doesn't send an IPI to the target task. It merely sets TIF_NOTIFY_SIGNAL and wakes up the task. This can be useful in avoiding a forceful transition to the kernel if the task is running in userspace. Depending on the task_work in question, it may be quite fine waiting for the next reschedule or kernel enter anyway, or the use case may even have other mechanisms for hinting to the task that a transition may be useful. This can drive more cooperative scheduling of task_work. Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/821f42b6-7d91-8074-8212-d34998097de4@kernel.dk Signed-off-by: Jens Axboe --- include/linux/sched/signal.h | 13 +++++++++++-- include/linux/task_work.h | 1 + kernel/task_work.c | 25 +++++++++++++++++++------ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3c8b34876744..66b689f6cfcb 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -355,14 +355,23 @@ static inline void clear_notify_signal(void) smp_mb__after_atomic(); } +/* + * Returns 'true' if kick_process() is needed to force a transition from + * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. + */ +static inline bool __set_notify_signal(struct task_struct *task) +{ + return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE); +} + /* * Called to break out of interruptible wait loops, and enter the * exit_to_user_mode_loop(). 
*/ static inline void set_notify_signal(struct task_struct *task) { - if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && - !wake_up_state(task, TASK_INTERRUPTIBLE)) + if (__set_notify_signal(task)) kick_process(task); } diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 897494b597ba..795ef5a68429 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -17,6 +17,7 @@ enum task_work_notify_mode { TWA_NONE, TWA_RESUME, TWA_SIGNAL, + TWA_SIGNAL_NO_IPI, }; static inline bool task_work_pending(struct task_struct *task) diff --git a/kernel/task_work.c b/kernel/task_work.c index c59e1a49bc40..dff75bcde151 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the - * it will interrupt the targeted task and run the task_work. @TWA_RESUME - * work is run only when the task exits the kernel and returns to user mode, - * or before entering guest mode. Fails if the @task is exiting/exited and thus - * it can't process this @work. Otherwise @work->func() will be called when the - * @task goes through one of the aforementioned transitions, or exits. + * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. + * + * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted + * task and run the task_work, regardless of whether the task is currently + * running in the kernel or userspace. + * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a + * reschedule IPI to force the targeted task to reschedule and run task_work. + * This can be advantageous if there's no strict requirement that the + * task_work be run as soon as possible, just whenever the task enters the + * kernel anyway. + * @TWA_RESUME work is run only when the task exits the kernel and returns to + * user mode, or before entering guest mode. + * + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task goes through one of + * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism @@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL: set_notify_signal(task); break; + case TWA_SIGNAL_NO_IPI: + __set_notify_signal(task); + break; default: WARN_ON_ONCE(1); break; From 3a4b89a25ce59a87fcfff1f030cba8d544fd402c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Apr 2022 19:49:00 -0600 Subject: [PATCH 065/111] io_uring: serialize ctx->rings->sq_flags with atomic_or/and Rather than require ctx->completion_lock for ensuring that we don't clobber the flags, use the atomic bitop helpers instead. This removes the need to grab the completion_lock, in preparation for needing to set or clear sq_flags when we don't know the status of this lock. 
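In short, the conversion replaces a locked read-modify-write of sq_flags with single atomic bitops. A condensed sketch of the pattern, taken from the diff that follows (illustrative only, not part of the patch text):

        /* before: every update had to serialize on ctx->completion_lock */
        spin_lock(&ctx->completion_lock);
        WRITE_ONCE(ctx->rings->sq_flags,
                   ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
        spin_unlock(&ctx->completion_lock);

        /* after: sq_flags is an atomic_t, so set/clear need no lock */
        atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
        atomic_andnot(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);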
Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20220426014904.60384-3-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b40b640924db..e92029bd3936 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -169,7 +169,7 @@ struct io_rings { * The application needs a full memory barrier before checking * for IORING_SQ_NEED_WAKEUP after updating the sq tail. */ - u32 sq_flags; + atomic_t sq_flags; /* * Runtime CQ flags * @@ -2030,8 +2030,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); + atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } io_commit_cqring(ctx); @@ -2125,8 +2124,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, } if (list_empty(&ctx->cq_overflow_list)) { set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); + atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } ocqe->cqe.user_data = user_data; @@ -8108,23 +8106,6 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) return READ_ONCE(sqd->state); } -static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) -{ - /* Tell userspace we may need a wakeup call */ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - -static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) -{ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) { unsigned int to_submit; @@ -8240,8 +8221,8 @@ static int io_sq_thread(void *data) bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - io_ring_set_wakeup_flag(ctx); - + atomic_or(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); if ((ctx->flags & IORING_SETUP_IOPOLL) && !wq_list_empty(&ctx->iopoll_list)) { needs_sched = false; @@ -8266,7 +8247,8 @@ static int io_sq_thread(void *data) mutex_lock(&sqd->lock); } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_clear_wakeup_flag(ctx); + atomic_andnot(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); } finish_wait(&sqd->wait, &wait); @@ -8276,7 +8258,7 @@ static int io_sq_thread(void *data) io_uring_cancel_generic(true, sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_set_wakeup_flag(ctx); + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); mutex_unlock(&sqd->lock); @@ -12029,6 +12011,8 @@ static int __init io_uring_init(void) BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; From 6cf5862e3c2caafd11f39a373b98b00c5a04b83a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Apr 2022 19:49:01 -0600 Subject: [PATCH 066/111] io-wq: use __set_notify_signal() to wake workers The only difference between set_notify_signal() and 
__set_notify_signal() is that the former checks if it needs to deliver an IPI to force a reschedule. As the io-wq workers never leave the kernel, and IPI is never needed, they simply need a wakeup. Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20220426014904.60384-4-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 32aeb2c581c5..824623bcf1a5 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, static bool io_wq_worker_wake(struct io_worker *worker, void *data) { - set_notify_signal(worker->task); + __set_notify_signal(worker->task); wake_up_process(worker->task); return false; } @@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, { if (work && match->fn(work, match->data)) { work->flags |= IO_WQ_WORK_CANCEL; - set_notify_signal(worker->task); + __set_notify_signal(worker->task); return true; } From 9f010507bbc1be19dbeedc1a254209fea44adc14 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Apr 2022 19:49:02 -0600 Subject: [PATCH 067/111] io_uring: set task_work notify method at init time While doing so, switch SQPOLL to TWA_SIGNAL_NO_IPI as well, as that just does a task wakeup and then we can remove the special wakeup we have in task_work_add. Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20220426014904.60384-5-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e92029bd3936..3c669d8f5e57 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -366,6 +366,7 @@ struct io_ring_ctx { struct io_rings *rings; unsigned int flags; + enum task_work_notify_mode notify_method; unsigned int compat: 1; unsigned int drain_next: 1; unsigned int restricted: 1; @@ -2621,8 +2622,8 @@ static void tctx_task_work(struct callback_head *cb) static void io_req_task_work_add(struct io_kiocb *req, bool priority) { struct task_struct *tsk = req->task; + struct io_ring_ctx *ctx = req->ctx; struct io_uring_task *tctx = tsk->io_uring; - enum task_work_notify_mode notify; struct io_wq_work_node *node; unsigned long flags; bool running; @@ -2645,18 +2646,8 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) if (running) return; - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. For - * all other cases, use TWA_SIGNAL unconditionally to ensure we're - * processing task_work. There's no reliable way to tell if TWA_RESUME - * will do the job. - */ - notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { - if (notify == TWA_NONE) - wake_up_process(tsk); + if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method))) return; - } spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; @@ -11335,6 +11326,14 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (!capable(CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); + /* + * For SQPOLL, we just need a wakeup, always. + */ + if (ctx->flags & IORING_SETUP_SQPOLL) + ctx->notify_method = TWA_SIGNAL_NO_IPI; + else + ctx->notify_method = TWA_SIGNAL; + /* * This is just grabbed for accounting purposes. 
When a process exits, * the mm is exited and dropped before the files, hence we need to hang From e1169f06d5bbdbc2b22ae4e3083a4bf75ae5ecee Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Apr 2022 19:49:03 -0600 Subject: [PATCH 068/111] io_uring: use TWA_SIGNAL_NO_IPI if IORING_SETUP_COOP_TASKRUN is used If this is set, io_uring will never use an IPI to deliver a task_work notification. This can be used in the common case where a single task or thread communicates with the ring, and doesn't rely on io_uring_cqe_peek(). This provides a noticeable win in performance, both from eliminating the IPI itself, but also from avoiding interrupting the submitting task unnecessarily. Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20220426014904.60384-6-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++++++---- include/uapi/linux/io_uring.h | 8 ++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3c669d8f5e57..0b9ae3615911 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -11327,12 +11327,20 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->user = get_uid(current_user()); /* - * For SQPOLL, we just need a wakeup, always. + * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if + * COOP_TASKRUN is set, then IPIs are never needed by the app. */ - if (ctx->flags & IORING_SETUP_SQPOLL) + ret = -EINVAL; + if (ctx->flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (ctx->flags & IORING_SETUP_COOP_TASKRUN) + goto err; ctx->notify_method = TWA_SIGNAL_NO_IPI; - else + } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else { ctx->notify_method = TWA_SIGNAL; + } /* * This is just grabbed for accounting purposes. When a process exits, @@ -11431,7 +11439,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | + IORING_SETUP_COOP_TASKRUN)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 980d82eb196e..a84f29d657c3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -102,6 +102,14 @@ enum { #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ #define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ +/* + * Cooperative task running. When requests complete, they often require + * forcing the submitter to transition to the kernel to complete. If this + * flag is set, work will be done when the task transitions anyway, rather + * than force an inter-processor interrupt reschedule. This avoids interrupting + * a task running in userspace, and saves an IPI. 
+ */ +#define IORING_SETUP_COOP_TASKRUN (1U << 8) enum { IORING_OP_NOP, From ef060ea9e4fd3b763e7060a3af0a258d2d5d7c0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Apr 2022 19:49:04 -0600 Subject: [PATCH 069/111] io_uring: add IORING_SETUP_TASKRUN_FLAG If IORING_SETUP_COOP_TASKRUN is set to use cooperative scheduling for running task_work, then IORING_SETUP_TASKRUN_FLAG can be set so the application can tell if task_work is pending in the kernel for this ring. This allows use cases like io_uring_peek_cqe() to still function appropriately, or for the task to know when it would be useful to call io_uring_wait_cqe() to run pending events. Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20220426014904.60384-7-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++++++--- include/uapi/linux/io_uring.h | 7 +++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0b9ae3615911..72cb2d50125c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2506,6 +2506,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (*locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -2646,6 +2648,9 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) if (running) return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method))) return; @@ -11333,12 +11338,15 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ret = -EINVAL; if (ctx->flags & IORING_SETUP_SQPOLL) { /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & IORING_SETUP_COOP_TASKRUN) + if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG)) goto err; ctx->notify_method = TWA_SIGNAL_NO_IPI; } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { ctx->notify_method = TWA_SIGNAL_NO_IPI; } else { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + goto err; ctx->notify_method = TWA_SIGNAL; } @@ -11440,10 +11448,10 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN)) + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG)) return -EINVAL; - return io_uring_create(entries, &p, params); + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a84f29d657c3..fad63564678a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -110,6 +110,12 @@ enum { * a task running in userspace, and saves an IPI. */ #define IORING_SETUP_COOP_TASKRUN (1U << 8) +/* + * If COOP_TASKRUN is set, get notified if task work is available for + * running and a kernel transition would be needed to run it. This sets + * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. 
+ */ +#define IORING_SETUP_TASKRUN_FLAG (1U << 9) enum { IORING_OP_NOP, @@ -256,6 +262,7 @@ struct io_sqring_offsets { */ #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ #define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */ +#define IORING_SQ_TASKRUN (1U << 2) /* task should enter the kernel */ struct io_cqring_offsets { __u32 head; From f2e030dd7aaea5a937a2547dc980fab418fbc5e7 Mon Sep 17 00:00:00 2001 From: Almog Khaikin Date: Tue, 26 Apr 2022 19:34:03 +0300 Subject: [PATCH 070/111] io_uring: replace smp_mb() with smp_mb__after_atomic() in io_sq_thread() The IORING_SQ_NEED_WAKEUP flag is now set using atomic_or(), which implies a full barrier on some architectures but is not required to do so. Use the more appropriate smp_mb__after_atomic(), which avoids the extra barrier on those architectures. Signed-off-by: Almog Khaikin Link: https://lore.kernel.org/r/20220426163403.112692-1-almogkh@gmail.com Fixes: 8018823e6987 ("io_uring: serialize ctx->rings->sq_flags with atomic_or/and") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 72cb2d50125c..1e7466079af7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8229,7 +8229,7 @@ static int io_sq_thread(void *data) * Ensure the store of the wakeup flag is not * reordered with the load of the SQ tail */ - smp_mb(); + smp_mb__after_atomic(); if (io_sqring_entries(ctx)) { needs_sched = false; From 73911426aaaadbae54fa72359b33a7b6a56947db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 Apr 2022 11:34:56 -0600 Subject: [PATCH 071/111] io_uring: check IOPOLL/ioprio support upfront Don't punt these checks to the op prep handlers; instead, add the support flags to io_op_defs so that we can check them while setting up the request. This reduces the text size by 500 bytes on aarch64, and makes this less fragile by having the check in one spot and by requiring opcodes to opt in to IOPOLL or ioprio support.
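Condensed from the diff below, each io_op_defs[] entry declares what it supports, and io_init_req() performs the check once for every request (illustrative sketch only, with unrelated fields elided):

        /* per-opcode opt-in bits in the op definition table */
        [IORING_OP_READV] = {
                ...
                .ioprio         = 1,    /* honours sqe->ioprio */
                .iopoll         = 1,    /* allowed with IORING_SETUP_IOPOLL */
        },

        /* single upfront check in io_init_req() */
        if (!io_op_defs[opcode].ioprio && sqe->ioprio)
                return -EINVAL;
        if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;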
Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 153 +++++++++++++++++++------------------------------- 1 file changed, 58 insertions(+), 95 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1e7466079af7..efe4e92ad8ad 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1028,12 +1028,19 @@ struct io_op_def { unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; /* size of async data needed, if any */ unsigned short async_size; }; static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = {}, + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + }, [IORING_OP_READV] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -1042,6 +1049,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITEV] = { @@ -1052,6 +1061,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FSYNC] = { @@ -1064,6 +1075,8 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE_FIXED] = { @@ -1073,6 +1086,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_POLL_ADD] = { @@ -1137,6 +1152,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_CLOSE] = {}, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_STATX] = { .audit_skip = 1, @@ -1148,6 +1164,8 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE] = { @@ -1157,6 +1175,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FADVISE] = { @@ -1191,9 +1211,11 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_TEE] = { .needs_file = 1, @@ -1211,6 +1233,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_LINKAT] = {}, [IORING_OP_MSG_RING] = { .needs_file = 1, + .iopoll = 1, }, }; @@ -4139,9 +4162,7 @@ static int io_renameat_prep(struct io_kiocb *req, struct io_rename *ren = &req->rename; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4190,10 +4211,7 @@ static int io_unlinkat_prep(struct io_kiocb *req, struct io_unlink *un = &req->unlink; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4239,10 +4257,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, 
struct io_mkdir *mkd = &req->mkdir; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4282,10 +4297,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, struct io_symlink *sl = &req->symlink; const char __user *oldpath, *newpath; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4331,9 +4343,7 @@ static int io_linkat_prep(struct io_kiocb *req, struct io_hardlink *lnk = &req->hardlink; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4380,9 +4390,7 @@ static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; @@ -4422,9 +4430,6 @@ static int __io_splice_prep(struct io_kiocb *req, struct io_splice *sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sp->len = READ_ONCE(sqe->len); sp->flags = READ_ONCE(sqe->splice_flags); if (unlikely(sp->flags & ~valid_flags)) @@ -4523,11 +4528,6 @@ done: */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - __io_req_complete(req, issue_flags, 0, 0); return 0; } @@ -4535,8 +4535,8 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || - sqe->splice_fd_in || sqe->buf_index || sqe->personality)) + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || + sqe->buf_index || sqe->personality)) return -EINVAL; req->msg.user_data = READ_ONCE(sqe->off); @@ -4577,12 +4577,7 @@ done: static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -4615,10 +4610,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return 
-EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4649,9 +4641,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4783,7 +4773,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || sqe->splice_fd_in) return -EINVAL; @@ -4850,7 +4840,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4980,9 +4970,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -5026,9 +5014,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; req->madvise.addr = READ_ONCE(sqe->addr); @@ -5061,9 +5047,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; req->fadvise.offset = READ_ONCE(sqe->off); @@ -5099,9 +5083,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { const char __user *path; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5146,10 +5128,7 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5215,12 +5194,7 @@ err: static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -5298,7 +5272,7 @@ 
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -5531,7 +5505,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -5688,9 +5662,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index) + if (sqe->len || sqe->buf_index) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -5756,10 +5728,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -6442,9 +6411,7 @@ static int io_poll_update_prep(struct io_kiocb *req, struct io_poll_update *upd = &req->poll_update; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -6474,9 +6441,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe struct io_poll_iocb *poll = &req->poll; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) + if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) @@ -6683,11 +6648,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req, { struct io_timeout_rem *tr = &req->timeout_rem; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) + if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; tr->ltimeout = false; @@ -6757,10 +6720,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned flags; u32 off = READ_ONCE(sqe->off); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || - sqe->splice_fd_in) + if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; @@ -6942,11 +6902,9 @@ out: static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); 
@@ -7032,7 +6990,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); @@ -7845,6 +7803,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } } + if (!io_op_defs[opcode].ioprio && sqe->ioprio) + return -EINVAL; + if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; From 0455d4ccec548b0fb51db39a4d3350a7a80a0222 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 Apr 2022 12:11:33 -0600 Subject: [PATCH 072/111] io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg If IORING_RECVSEND_POLL_FIRST is set for recv/recvmsg or send/sendmsg, then we arm poll first rather than attempt a receive or send upfront. This can be useful if we expect there to be no data (or space) available for the request, as we can then avoid wasting time on the initial issue attempt. Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 +++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 10 ++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index efe4e92ad8ad..6db9ab8d4d15 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -637,6 +637,7 @@ struct io_sr_msg { int bgid; size_t len; size_t done_io; + unsigned int flags; }; struct io_open { @@ -5272,11 +5273,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(sqe->addr2 || sqe->file_index)) + if (unlikely(sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5311,6 +5315,10 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + flags = req->sr_msg.msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; @@ -5353,6 +5361,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) int min_ret = 0; int ret; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -5505,11 +5517,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(sqe->addr2 || sqe->file_index)) + if (unlikely(sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->bgid = READ_ONCE(sqe->buf_group); sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) @@ -5546,6 +5561,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return 
io_setup_async_msg(req, kmsg); + if (req->flags & REQ_F_BUFFER_SELECT) { kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) @@ -5603,6 +5622,10 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index fad63564678a..06621a278cb6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -213,6 +213,16 @@ enum { #define IORING_ASYNC_CANCEL_FD (1U << 1) #define IORING_ASYNC_CANCEL_ANY (1U << 2) +/* + * send/sendmsg and recv/recvmsg flags (sqe->addr2) + * + * IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send + * or receive and arm poll if that yields an + * -EAGAIN result, arm poll upfront and skip + * the initial transfer attempt. + */ +#define IORING_RECVSEND_POLL_FIRST (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ From 0a352aaa9473006f6bc5b0b2d707827d1c1afe80 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Apr 2022 15:22:44 -0600 Subject: [PATCH 073/111] io_uring: use 'sr' vs 'req->sr_msg' consistently For all of send/sendmsg and recv/recvmsg we have the local 'sr' variable, yet some cases still use req->sr_msg which sr points to. Use 'sr' consistently. Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6db9ab8d4d15..2d7190893a67 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5319,7 +5319,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg); - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5378,7 +5378,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5570,19 +5570,18 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (IS_ERR(kbuf)) return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - kmsg->fast_iov[0].iov_len = req->sr_msg.len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, - 1, req->sr_msg.len); + kmsg->fast_iov[0].iov_len = sr->len; + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, + sr->len); } - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, - kmsg->uaddr, flags); + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) return io_setup_async_msg(req, kmsg); @@ -5648,7 +5647,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) msg.msg_iocb = NULL; msg.msg_flags = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) From 9396ed850f2e0ea286de372968987157aeeca617 Mon Sep 17 00:00:00 2001 From: Jens 
Axboe Date: Thu, 28 Apr 2022 13:36:50 -0600 Subject: [PATCH 074/111] io_uring: kill io_recv_buffer_select() wrapper It's just a thin wrapper around io_buffer_select(), get rid of it. Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d7190893a67..3dbf096f0425 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5495,14 +5495,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, return __io_recvmsg_copy_hdr(req, iomsg); } -static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - unsigned int issue_flags) -{ - struct io_sr_msg *sr = &req->sr_msg; - - return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); -} - static int io_recvmsg_prep_async(struct io_kiocb *req) { int ret; @@ -5566,7 +5558,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) return io_setup_async_msg(req, kmsg); if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); + kbuf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); @@ -5630,7 +5622,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); + kbuf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); From c54d52c2d6131d112176b26aa97dc5e1af930d6e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Apr 2022 13:57:23 -0600 Subject: [PATCH 075/111] io_uring: make io_buffer_select() return the user address directly There's no point in having callers provide a kbuf, we're just returning the address anyway. 
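For a caller such as io_recv(), the change boils down to the following before/after, condensed from the diff below (illustrative sketch only):

        /* before: callers got an io_buffer and unpacked the address */
        kbuf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);
        buf = u64_to_user_ptr(kbuf->addr);

        /* after: the helper hands back the user address (or an ERR_PTR) */
        buf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
        if (IS_ERR(buf))
                return PTR_ERR(buf);
        sr->buf = buf;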
Signed-off-by: Jens Axboe --- fs/io_uring.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3dbf096f0425..2b87c89d2375 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3458,15 +3458,15 @@ static void io_buffer_add_list(struct io_ring_ctx *ctx, list_add(&bl->list, list); } -static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, - int bgid, unsigned int issue_flags) +static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + int bgid, unsigned int issue_flags) { struct io_buffer *kbuf = req->kbuf; struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; if (req->flags & REQ_F_BUFFER_SELECTED) - return kbuf; + return u64_to_user_ptr(kbuf->addr); io_ring_submit_lock(req->ctx, issue_flags); @@ -3478,25 +3478,18 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; - } else { - kbuf = ERR_PTR(-ENOBUFS); + io_ring_submit_unlock(req->ctx, issue_flags); + return u64_to_user_ptr(kbuf->addr); } io_ring_submit_unlock(req->ctx, issue_flags); - return kbuf; + return ERR_PTR(-ENOBUFS); } static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags) { - struct io_buffer *kbuf; - u16 bgid; - - bgid = req->buf_index; - kbuf = io_buffer_select(req, len, bgid, issue_flags); - if (IS_ERR(kbuf)) - return kbuf; - return u64_to_user_ptr(kbuf->addr); + return io_buffer_select(req, len, req->buf_index, issue_flags); } #ifdef CONFIG_COMPAT @@ -5535,7 +5528,6 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) struct io_async_msghdr iomsg, *kmsg; struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; - struct io_buffer *kbuf; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -5558,10 +5550,12 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) return io_setup_async_msg(req, kmsg); if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + void __user *buf; + + buf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags); + if (IS_ERR(buf)) + return PTR_ERR(buf); + kmsg->fast_iov[0].iov_base = buf; kmsg->fast_iov[0].iov_len = sr->len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, sr->len); @@ -5603,10 +5597,8 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { - struct io_buffer *kbuf; struct io_sr_msg *sr = &req->sr_msg; struct msghdr msg; - void __user *buf = sr->buf; struct socket *sock; struct iovec iov; unsigned flags; @@ -5622,13 +5614,15 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - buf = u64_to_user_ptr(kbuf->addr); + void __user *buf; + + buf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags); + if (IS_ERR(buf)) + return PTR_ERR(buf); + sr->buf = buf; } - ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) goto out_free; From e5b003495e934321f78ba1f95e48da4a8c3a3a35 Mon Sep 17 
00:00:00 2001 From: Jens Axboe Date: Thu, 28 Apr 2022 14:02:49 -0600 Subject: [PATCH 076/111] io_uring: kill io_rw_buffer_select() wrapper After the recent changes, this is direct call to io_buffer_select() anyway. With this change, there are no wrappers left for provided buffer selection. Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b87c89d2375..e13316969e9e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3486,12 +3486,6 @@ static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ERR_PTR(-ENOBUFS); } -static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) -{ - return io_buffer_select(req, len, req->buf_index, issue_flags); -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) @@ -3499,7 +3493,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, struct compat_iovec __user *uiov; compat_ssize_t clen; void __user *buf; - ssize_t len; + size_t len; uiov = u64_to_user_ptr(req->rw.addr); if (!access_ok(uiov, sizeof(*uiov))) @@ -3510,7 +3504,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, return -EINVAL; len = clen; - buf = io_rw_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &len, req->buf_index, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3532,7 +3526,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, len = iov[0].iov_len; if (len < 0) return -EINVAL; - buf = io_rw_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &len, req->buf_index, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3588,7 +3582,8 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (req->flags & REQ_F_BUFFER_SELECT) { - buf = io_rw_buffer_select(req, &sqe_len, issue_flags); + buf = io_buffer_select(req, &sqe_len, req->buf_index, + issue_flags); if (IS_ERR(buf)) return ERR_CAST(buf); req->rw.len = sqe_len; From bb68d504f7c4183178b00c1af68fca870728e7e0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Apr 2022 09:25:12 -0600 Subject: [PATCH 077/111] io_uring: ignore ->buf_index if REQ_F_BUFFER_SELECT isn't set There's no point in validity checking buf_index if the request doesn't have REQ_F_BUFFER_SELECT set, as we will never use it for that case. Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e13316969e9e..bd0ee2f7df2c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3573,10 +3573,6 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, return NULL; } - /* buffer index only valid with fixed read/write, or buffer select */ - if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) - return ERR_PTR(-EINVAL); - buf = u64_to_user_ptr(req->rw.addr); sqe_len = req->rw.len; From 4e9067025259d1227c7f4f18a02c166c93e49290 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Apr 2022 19:09:43 -0600 Subject: [PATCH 078/111] io_uring: always use req->buf_index for the provided buffer group The read/write opcodes use it already, but the recv/recvmsg do not. 
If we switch them over and read and validate this at init time while we're checking if the opcode supports it anyway, then we can do it in one spot and we don't have to pass in a separate group ID for io_buffer_select(). Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bd0ee2f7df2c..b5efabc917c6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -634,7 +634,6 @@ struct io_sr_msg { void __user *buf; }; int msg_flags; - int bgid; size_t len; size_t done_io; unsigned int flags; @@ -3299,6 +3298,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->rw.flags = READ_ONCE(sqe->rw_flags); + /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -3459,7 +3459,7 @@ static void io_buffer_add_list(struct io_ring_ctx *ctx, } static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - int bgid, unsigned int issue_flags) + unsigned int issue_flags) { struct io_buffer *kbuf = req->kbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3470,7 +3470,7 @@ static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, io_ring_submit_lock(req->ctx, issue_flags); - bl = io_buffer_get_list(ctx, bgid); + bl = io_buffer_get_list(ctx, req->buf_index); if (bl && !list_empty(&bl->buf_list)) { kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); @@ -3504,7 +3504,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, return -EINVAL; len = clen; - buf = io_buffer_select(req, &len, req->buf_index, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3526,7 +3526,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, len = iov[0].iov_len; if (len < 0) return -EINVAL; - buf = io_buffer_select(req, &len, req->buf_index, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3578,8 +3578,7 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (req->flags & REQ_F_BUFFER_SELECT) { - buf = io_buffer_select(req, &sqe_len, req->buf_index, - issue_flags); + buf = io_buffer_select(req, &sqe_len, issue_flags); if (IS_ERR(buf)) return ERR_CAST(buf); req->rw.len = sqe_len; @@ -5501,7 +5500,6 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->flags = READ_ONCE(sqe->addr2); if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) return -EINVAL; - sr->bgid = READ_ONCE(sqe->buf_group); sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5543,7 +5541,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (req->flags & REQ_F_BUFFER_SELECT) { void __user *buf; - buf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags); + buf = io_buffer_select(req, &sr->len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); kmsg->fast_iov[0].iov_base = buf; @@ -5607,7 +5605,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (req->flags & REQ_F_BUFFER_SELECT) { void __user *buf; - buf = io_buffer_select(req, &sr->len, sr->bgid, issue_flags); + buf = io_buffer_select(req, &sr->len, issue_flags); 
if (IS_ERR(buf)) return PTR_ERR(buf); sr->buf = buf; @@ -7777,9 +7775,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return -EINVAL; - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[opcode].buffer_select) - return -EOPNOTSUPP; + if (sqe_flags & IOSQE_BUFFER_SELECT) { + if (!io_op_defs[opcode].buffer_select) + return -EOPNOTSUPP; + req->buf_index = READ_ONCE(sqe->buf_group); + } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { From 9cfc7e94e42be9c73072606b84d4574a0a2ec270 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 May 2022 10:52:44 -0600 Subject: [PATCH 079/111] io_uring: get rid of hashed provided buffer groups Use a plain array for any group ID that's less than 64, and punt anything beyond that to an xarray. 64 fits in a page even for 4KB page sizes and with the planned additions. This makes the expected group usage faster by avoiding a hash and lookup to find our list, and it uses less memory upfront by not allocating any memory for provided buffers unless it's actually being used. Suggested-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 97 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b5efabc917c6..0672ce0bd832 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -282,7 +282,6 @@ struct io_rsrc_data { }; struct io_buffer_list { - struct list_head list; struct list_head buf_list; __u16 bgid; }; @@ -357,7 +356,7 @@ struct io_ev_fd { struct rcu_head rcu; }; -#define IO_BUFFERS_HASH_BITS 5 +#define BGID_ARRAY 64 struct io_ring_ctx { /* const or read-mostly hot data */ @@ -414,7 +413,8 @@ struct io_ring_ctx { struct list_head timeout_list; struct list_head ltimeout_list; struct list_head cq_overflow_list; - struct list_head *io_buffers; + struct io_buffer_list *io_bl; + struct xarray io_bl_xa; struct list_head io_buffers_cache; struct list_head apoll_cache; struct xarray personalities; @@ -1507,15 +1507,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { - struct list_head *hash_list; - struct io_buffer_list *bl; + if (ctx->io_bl && bgid < BGID_ARRAY) + return &ctx->io_bl[bgid]; - hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - list_for_each_entry(bl, hash_list, list) - if (bl->bgid == bgid || bgid == -1U) - return bl; - - return NULL; + return xa_load(&ctx->io_bl_xa, bgid); } static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) @@ -1621,12 +1616,14 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int i, hash_bits; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; + xa_init(&ctx->io_bl_xa); + /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread. 
@@ -1648,13 +1645,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* set invalid range, so io_import_fixed() fails meeting it */ ctx->dummy_ubuf->ubuf = -1UL; - ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, - sizeof(struct list_head), GFP_KERNEL); - if (!ctx->io_buffers) - goto err; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) - INIT_LIST_HEAD(&ctx->io_buffers[i]); - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1690,7 +1680,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } @@ -3447,15 +3438,14 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, imu); } -static void io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) { - struct list_head *list; - - list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - INIT_LIST_HEAD(&bl->buf_list); bl->bgid = bgid; - list_add(&bl->list, list); + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, @@ -4921,6 +4911,23 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 0 : -ENOMEM; } +static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +{ + int i; + + ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), + GFP_KERNEL); + if (!ctx->io_bl) + return -ENOMEM; + + for (i = 0; i < BGID_ARRAY; i++) { + INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); + ctx->io_bl[i].bgid = i; + } + + return 0; +} + static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; @@ -4930,6 +4937,12 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); + if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { + ret = io_init_bl_list(ctx); + if (ret) + goto err; + } + bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { bl = kmalloc(sizeof(*bl), GFP_KERNEL); @@ -4937,7 +4950,11 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = -ENOMEM; goto err; } - io_buffer_add_list(ctx, bl, p->bgid); + ret = io_buffer_add_list(ctx, bl, p->bgid); + if (ret) { + kfree(bl); + goto err; + } } ret = io_add_buffers(ctx, p, bl); @@ -9931,19 +9948,19 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_destroy_buffers(struct io_ring_ctx *ctx) { + struct io_buffer_list *bl; + unsigned long index; int i; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { - struct list_head *list = &ctx->io_buffers[i]; + for (i = 0; i < BGID_ARRAY; i++) { + if (!ctx->io_bl) + break; + __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); + } - while (!list_empty(list)) { - struct io_buffer_list *bl; - - bl = list_first_entry(list, struct io_buffer_list, list); - __io_remove_buffers(ctx, bl, -1U); - list_del(&bl->list); - kfree(bl); - } + xa_for_each(&ctx->io_bl_xa, index, bl) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + __io_remove_buffers(ctx, bl, -1U); } while (!list_empty(&ctx->io_buffers_pages)) { @@ -10052,7 +10069,8 @@ static __cold void 
io_ring_ctx_free(struct io_ring_ctx *ctx) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); kfree(ctx->dummy_ubuf); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); } @@ -11980,6 +11998,7 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); From b66e65f41426ec82b92ad4d9a752802bf9e2e383 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Apr 2022 12:16:40 -0600 Subject: [PATCH 080/111] io_uring: never call io_buffer_select() for a buffer re-select Callers already have room to store the addr and length information, clean it up by having the caller just assign the previously provided data. Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0672ce0bd832..26b69ff5c4c3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3455,9 +3455,6 @@ static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; - if (req->flags & REQ_F_BUFFER_SELECTED) - return u64_to_user_ptr(kbuf->addr); - io_ring_submit_lock(req->ctx, issue_flags); bl = io_buffer_get_list(ctx, req->buf_index); @@ -3497,8 +3494,9 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, buf = io_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = (compat_size_t) len; + req->rw.len = iov[0].iov_len = (compat_size_t) len; return 0; } #endif @@ -3519,8 +3517,9 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, buf = io_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = len; + req->rw.len = iov[0].iov_len = len; return 0; } @@ -3528,10 +3527,8 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { if (req->flags & REQ_F_BUFFER_SELECTED) { - struct io_buffer *kbuf = req->kbuf; - - iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov[0].iov_len = kbuf->len; + iov[0].iov_base = u64_to_user_ptr(req->rw.addr); + iov[0].iov_len = req->rw.len; return 0; } if (req->rw.len != 1) @@ -3545,6 +3542,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, issue_flags); } +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & REQ_F_BUFFER_SELECTED); +} + static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) @@ -3567,10 +3571,11 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, sqe_len = req->rw.len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (req->flags & REQ_F_BUFFER_SELECT) { + if (io_do_buffer_select(req)) { buf = io_buffer_select(req, &sqe_len, issue_flags); if (IS_ERR(buf)) return ERR_CAST(buf); + req->rw.addr = (unsigned long) buf; req->rw.len = sqe_len; } @@ -5555,7 +5560,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg); - if 
(req->flags & REQ_F_BUFFER_SELECT) { + if (io_do_buffer_select(req)) { void __user *buf; buf = io_buffer_select(req, &sr->len, issue_flags); @@ -5619,7 +5624,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - if (req->flags & REQ_F_BUFFER_SELECT) { + if (io_do_buffer_select(req)) { void __user *buf; buf = io_buffer_select(req, &sr->len, issue_flags); From 149c69b04a901c8b611b643af8f4dd6b104e7379 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Apr 2022 12:22:02 -0600 Subject: [PATCH 081/111] io_uring: abstract out provided buffer list selection In preparation for providing another way to select a buffer, move the existing logic into a helper. Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 26b69ff5c4c3..3d5a4e64d6fe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3448,29 +3448,41 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } +static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_buffer *kbuf; + + if (list_empty(&bl->buf_list)) + return ERR_PTR(-ENOBUFS); + + kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&kbuf->list); + if (*len > kbuf->len) + *len = kbuf->len; + req->flags |= REQ_F_BUFFER_SELECTED; + req->kbuf = kbuf; + io_ring_submit_unlock(req->ctx, issue_flags); + return u64_to_user_ptr(kbuf->addr); +} + static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags) { - struct io_buffer *kbuf = req->kbuf; struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; io_ring_submit_lock(req->ctx, issue_flags); bl = io_buffer_get_list(ctx, req->buf_index); - if (bl && !list_empty(&bl->buf_list)) { - kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&kbuf->list); - if (*len > kbuf->len) - *len = kbuf->len; - req->flags |= REQ_F_BUFFER_SELECTED; - req->kbuf = kbuf; + if (unlikely(!bl)) { io_ring_submit_unlock(req->ctx, issue_flags); - return u64_to_user_ptr(kbuf->addr); + return ERR_PTR(-ENOBUFS); } - io_ring_submit_unlock(req->ctx, issue_flags); - return ERR_PTR(-ENOBUFS); + /* selection helpers drop the submit lock again, if needed */ + return io_provided_buffer_select(req, len, bl, issue_flags); } #ifdef CONFIG_COMPAT From a4f8d94cfb7c69f996b6a52b1fcbec2f2504dd3f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Apr 2022 14:35:17 -0600 Subject: [PATCH 082/111] io_uring: move provided and fixed buffers into the same io_kiocb area These are mutually exclusive - if you use provided buffers, then you cannot use fixed buffers and vice versa. Move them into the same spot in the io_kiocb, which is also advantageous for provided buffers as they get near the submit side hot cacheline. 
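For context on what these provided-buffer patches are reworking, here is a rough userspace sketch (not part of the series) of the classic provided-buffer flow that io_buffer_select() serves on the kernel side. It assumes liburing, an already set-up ring and a readable socket, with error handling trimmed.

/* hypothetical illustration only: provide a group of buffers, then let the
 * kernel pick one of them for a recv that names no buffer itself */
#include <liburing.h>
#include <stdlib.h>

#define BGID   7        /* buffer group ID, goes into sqe->buf_group */
#define NBUFS  8
#define BSIZE  4096

static void provide_and_recv(struct io_uring *ring, int fd)
{
    char *pool = malloc(NBUFS * BSIZE);
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;

    /* hand NBUFS buffers (IDs 0..NBUFS-1) to the kernel for group BGID */
    sqe = io_uring_get_sqe(ring);
    io_uring_prep_provide_buffers(sqe, pool, BSIZE, NBUFS, BGID, 0);
    io_uring_submit(ring);
    io_uring_wait_cqe(ring, &cqe);
    io_uring_cqe_seen(ring, cqe);

    /* recv without naming a buffer; the kernel selects one from BGID */
    sqe = io_uring_get_sqe(ring);
    io_uring_prep_recv(sqe, fd, NULL, BSIZE, 0);
    sqe->flags |= IOSQE_BUFFER_SELECT;
    sqe->buf_group = BGID;
    io_uring_submit(ring);
    io_uring_wait_cqe(ring, &cqe);

    if (cqe->res >= 0 && (cqe->flags & IORING_CQE_F_BUFFER)) {
        unsigned bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
        char *data = pool + bid * BSIZE;   /* cqe->res bytes landed here */
        (void)data;
    }
    io_uring_cqe_seen(ring, cqe);
}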
Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3d5a4e64d6fe..a260a0fd7b17 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -958,8 +958,14 @@ struct io_kiocb { struct task_struct *task; struct io_rsrc_node *rsrc_node; - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; + + union { + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; + + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; + }; union { /* used by request caches, completion batching and iopoll */ @@ -976,8 +982,6 @@ struct io_kiocb { struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ struct io_kiocb *link; /* custom credentials, valid IFF REQ_F_CREDS is set */ From 660cbfa2340af1d25db4f7c4e93b8c1722bc72aa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 May 2022 11:57:06 -0600 Subject: [PATCH 083/111] io_uring: move provided buffer state closer to submit state The timeout and other items that follow are less hot, so let's move the provided buffer state above that. Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a260a0fd7b17..87d65dedee07 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -410,12 +410,14 @@ struct io_ring_ctx { struct io_mapped_ubuf **user_bufs; struct io_submit_state submit_state; - struct list_head timeout_list; - struct list_head ltimeout_list; - struct list_head cq_overflow_list; + struct io_buffer_list *io_bl; struct xarray io_bl_xa; struct list_head io_buffers_cache; + + struct list_head timeout_list; + struct list_head ltimeout_list; + struct list_head cq_overflow_list; struct list_head apoll_cache; struct xarray personalities; u32 pers_next; From 1dbd023eb083249026d51f41cee48a7b199b3d4e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 May 2022 12:08:35 -0600 Subject: [PATCH 084/111] io_uring: eliminate the need to track provided buffer ID separately We have io_kiocb->buf_index which is used for either fixed buffers, or for provided buffers. For the latter, it's used to hold the buffer group ID for buffer selection. Post selection, req->kbuf->bid is used to get the buffer ID. Store the buffer ID, when selected, in req->buf_index. If we do end up recycling the buffer, reset it back to the buffer group ID. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 87d65dedee07..52641594a86b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -951,6 +951,11 @@ struct io_kiocb { u8 opcode; /* polled IO has completed */ u8 iopoll_completed; + /* + * Can be either a fixed buffer index, or used with provided buffers. + * For the latter, before issue it points to the buffer group ID, + * and after selection it points to the buffer ID itself. 
+ */ u16 buf_index; unsigned int flags; @@ -1456,14 +1461,11 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { - struct io_buffer *kbuf = req->kbuf; - unsigned int cflags; - - cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); req->flags &= ~REQ_F_BUFFER_SELECTED; - list_add(&kbuf->list, list); + list_add(&req->kbuf->list, list); req->kbuf = NULL; - return cflags; + + return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); } static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) @@ -1537,6 +1539,7 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; + req->buf_index = buf->bgid; req->kbuf = NULL; io_ring_submit_unlock(ctx, issue_flags); @@ -3469,6 +3472,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; + req->buf_index = kbuf->bid; io_ring_submit_unlock(req->ctx, issue_flags); return u64_to_user_ptr(kbuf->addr); } From 7ccba24d3bc084d891def1a6fea504e4cb327a8c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 May 2022 12:31:56 -0600 Subject: [PATCH 085/111] io_uring: don't clear req->kbuf when buffer selection is done It's not needed as the REQ_F_BUFFER_SELECTED flag tracks the state of whether or not kbuf is valid, so just drop it. Suggested-by: Dylan Yudaken Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 52641594a86b..22699cb359e9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1463,7 +1463,6 @@ static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { req->flags &= ~REQ_F_BUFFER_SELECTED; list_add(&req->kbuf->list, list); - req->kbuf = NULL; return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); } @@ -1540,7 +1539,6 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; req->buf_index = buf->bgid; - req->kbuf = NULL; io_ring_submit_unlock(ctx, issue_flags); } From d78bd8adfcbc55b9dc01e9034a55b2a61a2124dc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 7 May 2022 09:56:13 -0600 Subject: [PATCH 086/111] io_uring: track fixed files with a bitmap In preparation for adding a basic allocator for direct descriptors, add helpers that set/clear whether a file slot is used. 
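As a rough standalone illustration of the bookkeeping being added (not kernel code): one bit per fixed-file slot, set when the slot holds a file and cleared when it is emptied. The next patch builds an allocator on top of this by scanning for a zero bit.

/* hypothetical sketch of per-slot bitmap tracking; the kernel uses
 * bitmap_zalloc()/__set_bit()/__clear_bit() for the same job */
#include <limits.h>

#define NR_SLOTS       1024
#define BITS_PER_WORD  (sizeof(unsigned long) * CHAR_BIT)

static unsigned long slot_bitmap[NR_SLOTS / BITS_PER_WORD];

static void slot_set(unsigned int bit)
{
    slot_bitmap[bit / BITS_PER_WORD] |= 1UL << (bit % BITS_PER_WORD);
}

static void slot_clear(unsigned int bit)
{
    slot_bitmap[bit / BITS_PER_WORD] &= ~(1UL << (bit % BITS_PER_WORD));
}

static int slot_find_free(void)
{
    unsigned int i;

    for (i = 0; i < NR_SLOTS; i++) {
        if (!(slot_bitmap[i / BITS_PER_WORD] & (1UL << (i % BITS_PER_WORD))))
            return i;
    }
    return -1;      /* table full; the kernel returns -ENFILE here */
}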
Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 22699cb359e9..f8a685cc0363 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -257,6 +257,7 @@ struct io_rsrc_put { struct io_file_table { struct io_fixed_file *files; + unsigned long *bitmap; }; struct io_rsrc_node { @@ -7572,6 +7573,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); io_req_set_rsrc_node(req, ctx, 0); + WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); out: io_ring_submit_unlock(ctx, issue_flags); return file; @@ -8638,13 +8640,35 @@ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL_ACCOUNT); - return !!table->files; + if (unlikely(!table->files)) + return false; + + table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); + if (unlikely(!table->bitmap)) { + kvfree(table->files); + return false; + } + + return true; } static void io_free_file_tables(struct io_file_table *table) { kvfree(table->files); + bitmap_free(table->bitmap); table->files = NULL; + table->bitmap = NULL; +} + +static inline void io_file_bitmap_set(struct io_file_table *table, int bit) +{ + WARN_ON_ONCE(test_bit(bit, table->bitmap)); + __set_bit(bit, table->bitmap); +} + +static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) +{ + __clear_bit(bit, table->bitmap); } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -8659,6 +8683,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) continue; if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) continue; + io_file_bitmap_clear(&ctx->file_table, i); fput(file); } #endif @@ -9062,6 +9087,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } file_slot = io_fixed_file_slot(&ctx->file_table, i); io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } io_rsrc_node_switch(ctx, NULL); @@ -9122,6 +9148,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, if (ret) goto err; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); needs_switch = true; } @@ -9129,6 +9156,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, if (!ret) { *io_get_tag_slot(ctx->file_data, slot_index) = 0; io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); } err: if (needs_switch) @@ -9170,6 +9198,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) goto out; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); io_rsrc_node_switch(ctx, ctx->file_data); ret = 0; out: @@ -9219,6 +9248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (err) break; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, i); needs_switch = true; } if (fd != -1) { @@ -9247,6 +9277,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, } *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } } From b70b8e3331d8134ab993368a6e0eb18c1acb1b1d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 7 May 2022 14:20:40 -0600 Subject: [PATCH 087/111] io_uring: add basic fixed file allocator Applications currently always 
pick where they want fixed files to go. In preparation for allowing these types of commands with multishot support, add a basic allocator in the fixed file table. Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index f8a685cc0363..8c40411a7e78 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -258,6 +258,7 @@ struct io_rsrc_put { struct io_file_table { struct io_fixed_file *files; unsigned long *bitmap; + unsigned int alloc_hint; }; struct io_rsrc_node { @@ -4696,6 +4697,31 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } +static int __maybe_unused io_file_bitmap_get(struct io_ring_ctx *ctx) +{ + struct io_file_table *table = &ctx->file_table; + unsigned long nr = ctx->nr_user_files; + int ret; + + if (table->alloc_hint >= nr) + table->alloc_hint = 0; + + do { + ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); + if (ret != nr) { + table->alloc_hint = ret + 1; + return ret; + } + if (!table->alloc_hint) + break; + + nr = table->alloc_hint; + table->alloc_hint = 0; + } while (1); + + return -ENFILE; +} + static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; @@ -8664,11 +8690,14 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit) { WARN_ON_ONCE(test_bit(bit, table->bitmap)); __set_bit(bit, table->bitmap); + if (bit == table->alloc_hint) + table->alloc_hint++; } static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { __clear_bit(bit, table->bitmap); + table->alloc_hint = bit; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) From 1339f24b336db5ded9811f3fe7b948e0de207785 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 7 May 2022 14:18:44 -0600 Subject: [PATCH 088/111] io_uring: allow allocated fixed files for openat/openat2 If the application passes in IORING_FILE_INDEX_ALLOC as the file_slot, then that's a hint to allocate a fixed file descriptor rather than have one be passed in directly. This can be useful for having io_uring manage the direct descriptor space. Normal open direct requests will complete with 0 for success, and < 0 in case of error. If io_uring is asked to allocated the direct descriptor, then the direct descriptor is returned in case of success. 
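A hypothetical liburing-style sketch of how an application might use the new allocation mode. It assumes a ring whose fixed-file table has already been registered (a sparse registration flag is added later in the series) and trims error handling.

/* hypothetical: open a file straight into the fixed-file table and let
 * io_uring pick the slot, which comes back in cqe->res */
#include <liburing.h>
#include <fcntl.h>

static int open_direct_alloc(struct io_uring *ring, const char *path)
{
    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
    struct io_uring_cqe *cqe;
    int slot;

    io_uring_prep_openat(sqe, AT_FDCWD, path, O_RDONLY, 0);
    /* a specific fixed slot is normally passed as slot + 1; the ALLOC
     * sentinel asks the kernel to find a free slot instead */
    sqe->file_index = IORING_FILE_INDEX_ALLOC;

    io_uring_submit(ring);
    io_uring_wait_cqe(ring, &cqe);
    slot = cqe->res;            /* >= 0: the allocated direct descriptor */
    io_uring_cqe_seen(ring, cqe);
    return slot;
}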
Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 9 +++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8c40411a7e78..f448264a1067 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4697,7 +4697,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } -static int __maybe_unused io_file_bitmap_get(struct io_ring_ctx *ctx) +static int io_file_bitmap_get(struct io_ring_ctx *ctx) { struct io_file_table *table = &ctx->file_table; unsigned long nr = ctx->nr_user_files; @@ -4722,6 +4722,36 @@ static int __maybe_unused io_file_bitmap_get(struct io_ring_ctx *ctx) return -ENFILE; } +static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + if (alloc_slot) { + io_ring_submit_lock(ctx, issue_flags); + ret = io_file_bitmap_get(ctx); + if (unlikely(ret < 0)) { + io_ring_submit_unlock(ctx, issue_flags); + return ret; + } + + file_slot = ret; + } else { + file_slot--; + } + + ret = io_install_fixed_file(req, file, issue_flags, file_slot); + if (alloc_slot) { + io_ring_submit_unlock(ctx, issue_flags); + if (!ret) + return file_slot; + } + + return ret; +} + static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; @@ -4777,8 +4807,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) fd_install(ret, file); else - ret = io_install_fixed_file(req, file, issue_flags, - req->open.file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + req->open.file_slot); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 06621a278cb6..b7f02a55032a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -63,6 +63,15 @@ struct io_uring_sqe { __u64 __pad2[2]; }; +/* + * If sqe->file_index is set to this for opcodes that instantiate a new + * direct descriptor (like openat/openat2/accept), then io_uring will allocate + * an available direct descriptor instead of having the application pass one + * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE + * if the space is full. + */ +#define IORING_FILE_INDEX_ALLOC (~0U) + enum { IOSQE_FIXED_FILE_BIT, IOSQE_IO_DRAIN_BIT, From c30c3e00cbd96f0cce478efba41dbe78dad8c774 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 7 May 2022 10:08:31 -0600 Subject: [PATCH 089/111] io_uring: allow allocated fixed files for accept If the application passes in IORING_FILE_INDEX_ALLOC as the file_slot, then that's a hint to allocate a fixed file descriptor rather than have one be passed in directly. This can be useful for having io_uring manage the direct descriptor space, and also allows multi-shot support to work with fixed files. Normal accept direct requests will complete with 0 for success, and < 0 in case of error. If io_uring is asked to allocated the direct descriptor, then the direct descriptor is returned in case of success. 
Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f448264a1067..dc75c43bc9b2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5805,8 +5805,8 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) fd_install(fd, file); ret = fd; } else { - ret = io_install_fixed_file(req, file, issue_flags, - accept->file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + accept->file_slot); } __io_req_complete(req, issue_flags, ret, 0); return 0; From 09893e15f1e9910711c152cd126ccfa85581ba83 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 May 2022 09:11:01 -0600 Subject: [PATCH 090/111] io_uring: bump max direct descriptor count to 1M We currently limit these to 32K, but since we're now backing the table space with vmalloc when needed, there's no reason why we can't make it bigger. The total space is limited by RLIMIT_NOFILE as well. Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dc75c43bc9b2..b75a49d3831b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -94,7 +94,7 @@ #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 15) +#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) From a8da73a32b6e9271a613e5a0e90a8c35f40abeb8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 May 2022 09:29:14 -0600 Subject: [PATCH 091/111] io_uring: add flag for allocating a fully sparse direct descriptor space Currently, to set up a fully sparse descriptor space upfront, the app needs to allocate an array of the full size and memset it to -1 and then pass that in. Make this a bit easier by allowing a flag that simply does this internally rather than needing to copy each slot separately. This works with IORING_REGISTER_FILES2 as the flag is set in struct io_uring_rsrc_register, and is only allowed when the type is IORING_RSRC_FILE as this doesn't make sense for registered buffers.
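A hypothetical sketch of registering a fully sparse table with the new flag, using the raw register syscall and the io_uring_rsrc_register layout shown above. It assumes uapi headers new enough to carry IORING_RSRC_REGISTER_SPARSE; newer liburing wraps the same call in a helper.

/* hypothetical: create nr_slots empty direct-descriptor slots without
 * building and copying an array of -1 file descriptors */
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_sparse_files(int ring_fd, unsigned int nr_slots)
{
    struct io_uring_rsrc_register rr;

    memset(&rr, 0, sizeof(rr));
    rr.nr = nr_slots;                        /* e.g. 1024 empty slots */
    rr.flags = IORING_RSRC_REGISTER_SPARSE;  /* no fd array, no tags */

    return syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES2,
                   &rr, sizeof(rr));
}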
Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++++++--- include/uapi/linux/io_uring.h | 8 +++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b75a49d3831b..362189819898 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9111,12 +9111,12 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct io_fixed_file *file_slot; - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto fail; } /* allow sparse sets */ - if (fd == -1) { + if (!fds || fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) goto fail; @@ -11759,14 +11759,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; - if (!rr.nr || rr.resv || rr.resv2) + if (!rr.nr || rr.resv2) + return -EINVAL; + if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE) + break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } @@ -11935,6 +11941,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b7f02a55032a..36ec43dc7bf9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -396,9 +396,15 @@ struct io_uring_files_update { __aligned_u64 /* __s32 * */ fds; }; +/* + * Register a fully sparse file space, rather than pass in an array of all + * -1 file descriptors. + */ +#define IORING_RSRC_REGISTER_SPARSE (1U << 0) + struct io_uring_rsrc_register { __u32 nr; - __u32 resv; + __u32 flags; __u64 resv2; __aligned_u64 data; __aligned_u64 tags; From e0deb6a025ae8c850dc8685be39fb27b06c88736 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 13 May 2022 11:24:56 +0100 Subject: [PATCH 092/111] io_uring: avoid io-wq -EAGAIN looping for !IOPOLL If an opcode handler semi-reliably returns -EAGAIN, io_wq_submit_work() might continue busily hammer the same handler over and over again, which is not ideal. The -EAGAIN handling in question was put there only for IOPOLL, so restrict it to IOPOLL mode only where there is no other recourse than to retry as we cannot wait. Fixes: def596e9557c9 ("io_uring: support for IO polling") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f168b4f24181942f3614dd8ff648221736f572e6.1652433740.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 362189819898..f5f38329c823 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7573,6 +7573,8 @@ fail: * wait for request slots on the block side. 
*/ if (!needs_poll) { + if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + break; cond_resched(); continue; } From 1b1d7b4bf1d9948c8dba5ee550459ce7c65ac019 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 12 May 2022 02:18:34 -0700 Subject: [PATCH 093/111] io_uring: only wake when the correct events are set The check for waking up a request compares the poll_t bits; however, these always contain some common flags, so this always wakes up. For files with single wait queues, such as sockets, this can cause the request to be sent to the async worker unnecessarily. Further, if it is non-blocking, it will complete the request with EAGAIN, which is not desired. Here exclude these common events, making sure not to exclude POLLERR, which might be important. Fixes: d7718a9d25a6 ("io_uring: use poll driven retry for files that support it") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220512091834.728610-3-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f5f38329c823..c4bd0faf0b27 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6146,6 +6146,7 @@ static void io_poll_cancel_req(struct io_kiocb *req) #define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) #define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) +#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | POLLPRI) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) @@ -6180,7 +6181,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, } /* for instances that support it check for an event match first */ - if (mask && !(mask & poll->events)) + if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) return 0; if (io_poll_get_ownership(req)) { @@ -6337,7 +6338,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; + __poll_t mask = IO_ASYNC_POLL_COMMON | POLLERR; int ret; if (!def->pollin && !def->pollout) From 390ed29b5e425ba00da2b6113b74a14949f71b02 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sat, 14 May 2022 22:20:43 +0800 Subject: [PATCH 094/111] io_uring: add IORING_ACCEPT_MULTISHOT for accept Add an accept flag, IORING_ACCEPT_MULTISHOT, for accept to support multishot mode. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220514142046.58072-2-haoxu.linux@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 36ec43dc7bf9..15f821af9242 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -232,6 +232,11 @@ enum { */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) +/* + * accept flags stored in sqe->ioprio + */ +#define IORING_ACCEPT_MULTISHOT (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ From 227685ebfaba0bc7e2ddc47cef4556050b6d7a8f Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sat, 14 May 2022 22:20:44 +0800 Subject: [PATCH 095/111] io_uring: add REQ_F_APOLL_MULTISHOT for requests Add a flag to indicate multishot mode for fast poll. Currently only accept uses it, but more operations may leverage it in the future. Also add a mask, IO_APOLL_MULTI_POLLED, which stands for REQ_F_APOLL_MULTISHOT | REQ_F_POLLED, to make the code shorter and cleaner.
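A hypothetical liburing-style sketch of how an application would arm a multishot accept once the kernel-side support in the following patches is in place; per the uapi addition above, the flag lives in sqe->ioprio. It assumes a set-up ring and a listening socket, with error handling trimmed.

/* hypothetical illustration: one SQE, many accepted connections */
#include <liburing.h>

static void arm_multishot_accept(struct io_uring *ring, int listen_fd)
{
    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

    io_uring_prep_accept(sqe, listen_fd, NULL, NULL, 0);
    /* accept flags are carried in sqe->ioprio */
    sqe->ioprio |= IORING_ACCEPT_MULTISHOT;
    io_uring_submit(ring);
}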
Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220514142046.58072-3-haoxu.linux@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index c4bd0faf0b27..e1e550de5956 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -116,6 +116,8 @@ #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ IO_REQ_CLEAN_FLAGS) +#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) + #define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_uring { @@ -812,6 +814,7 @@ enum { REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, REQ_F_PARTIAL_IO_BIT, + REQ_F_APOLL_MULTISHOT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -876,6 +879,8 @@ enum { REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), /* request has already done partial IO */ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + /* fast poll multishot mode */ + REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), }; struct async_poll { From dbc2564cfe0faff439dc46adb8c009589054ea46 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sat, 14 May 2022 22:20:45 +0800 Subject: [PATCH 096/111] io_uring: let fast poll support multishot For operations like accept, multishot is a useful feature, since we can reduce a number of accept sqe. Let's integrate it to fast poll, it may be good for other operations in the future. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220514142046.58072-4-haoxu.linux@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e1e550de5956..a83405f657e0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6011,6 +6011,7 @@ static void io_poll_remove_entries(struct io_kiocb *req) rcu_read_unlock(); } +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags); /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. @@ -6019,10 +6020,10 @@ static void io_poll_remove_entries(struct io_kiocb *req) * either spurious wakeup or multishot CQE is served. 0 when it's done with * the request, then the mask is stored in req->cqe.res. 
*/ -static int io_poll_check_events(struct io_kiocb *req, bool locked) +static int io_poll_check_events(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - int v; + int v, ret; /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) @@ -6046,23 +6047,37 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked) req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; } - /* multishot, just fill an CQE and proceed */ - if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) { - __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); + if ((unlikely(!req->cqe.res))) + continue; + if (req->apoll_events & EPOLLONESHOT) + return 0; + + /* multishot, just fill a CQE and proceed */ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __poll_t mask = mangle_poll(req->cqe.res & + req->apoll_events); bool filled; spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask, - IORING_CQE_F_MORE); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (unlikely(!filled)) - return -ECANCELED; - io_cqring_ev_posted(ctx); - } else if (req->cqe.res) { - return 0; + if (filled) { + io_cqring_ev_posted(ctx); + continue; + } + return -ECANCELED; } + io_tw_lock(req->ctx, locked); + if (unlikely(req->task->flags & PF_EXITING)) + return -EFAULT; + ret = io_issue_sqe(req, + IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + if (ret) + return ret; + /* * Release all references, retry if someone tried to restart * task_work while we were executing it. @@ -6077,7 +6092,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; @@ -6102,7 +6117,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; @@ -6343,7 +6358,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = IO_ASYNC_POLL_COMMON | POLLERR; + __poll_t mask = POLLPRI | POLLERR; int ret; if (!def->pollin && !def->pollout) @@ -6352,6 +6367,8 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_ABORTED; if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) return IO_APOLL_ABORTED; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) + mask |= EPOLLONESHOT; if (def->pollin) { mask |= POLLIN | POLLRDNORM; From 4e86a2c980137f7be1ea600af5f1f5c8342ecc09 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sat, 14 May 2022 22:20:46 +0800 Subject: [PATCH 097/111] io_uring: implement multishot mode for accept Refactor io_accept() to support multishot mode. 
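As a rough sketch of the completion side (assuming the multishot accept was armed as in the earlier example): each accepted connection arrives as its own CQE, and IORING_CQE_F_MORE indicates the request is still active; when the flag is absent, the accept has terminated and must be re-armed if more connections are wanted.

/* hypothetical illustration of draining multishot accept completions */
#include <liburing.h>

static void reap_accepts(struct io_uring *ring, int *still_armed)
{
    struct io_uring_cqe *cqe;

    while (io_uring_peek_cqe(ring, &cqe) == 0) {
        if (cqe->res >= 0) {
            int connfd = cqe->res;   /* new connection (fd or fixed slot) */
            (void)connfd;
        }
        /* no F_MORE means this was the terminal CQE for the request */
        if (!(cqe->flags & IORING_CQE_F_MORE))
            *still_armed = 0;
        io_uring_cqe_seen(ring, cqe);
    }
}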
theoretical analysis: 1) when connections come in fast - singleshot: add accept sqe(userspace) --> accept inline ^ | |-----------------| - multishot: add accept sqe(userspace) --> accept inline ^ | |--*--| we do accept repeatedly in * place until get EAGAIN 2) when connections come in at a low pressure similar thing like 1), we reduce a lot of userspace-kernel context switch and useless vfs_poll() tests: Did some tests, which goes in this way: server client(multiple) accept connect read write write read close close Basically, raise up a number of clients(on same machine with server) to connect to the server, and then write some data to it, the server will write those data back to the client after it receives them, and then close the connection after write return. Then the client will read the data and then close the connection. Here I test 10000 clients connect one server, data size 128 bytes. And each client has a go routine for it, so they come to the server in short time. test 20 times before/after this patchset, time spent:(unit cycle, which is the return value of clock()) before: 1930136+1940725+1907981+1947601+1923812+1928226+1911087+1905897+1941075 +1934374+1906614+1912504+1949110+1908790+1909951+1941672+1969525+1934984 +1934226+1914385)/20.0 = 1927633.75 after: 1858905+1917104+1895455+1963963+1892706+1889208+1874175+1904753+1874112 +1874985+1882706+1884642+1864694+1906508+1916150+1924250+1869060+1889506 +1871324+1940803)/20.0 = 1894750.45 (1927633.75 - 1894750.45) / 1927633.75 = 1.65% Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220514142046.58072-5-haoxu.linux@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a83405f657e0..d4752e192ef9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1148,6 +1148,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -5762,6 +5763,7 @@ out_free: static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; + unsigned flags; if (sqe->len || sqe->buf_index) return -EINVAL; @@ -5770,19 +5772,30 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + flags = READ_ONCE(sqe->ioprio); + if (flags & ~IORING_ACCEPT_MULTISHOT) + return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) - return -EINVAL; + if (accept->file_slot) { + if (accept->flags & SOCK_CLOEXEC) + return -EINVAL; + if (flags & IORING_ACCEPT_MULTISHOT && + accept->file_slot != IORING_FILE_INDEX_ALLOC) + return -EINVAL; + } if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + if (flags & IORING_ACCEPT_MULTISHOT) + req->flags |= REQ_F_APOLL_MULTISHOT; return 0; } static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { + struct io_ring_ctx *ctx = req->ctx; struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; @@ -5790,6 +5803,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; +retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) @@ -5801,8 +5815,17 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; + if (ret == -EAGAIN && force_nonblock) { + /* + * if it's multishot and polled, we don't need to + * return EAGAIN to arm the poll infra since it + * has already been done + */ + if ((req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) + ret = 0; + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); @@ -5813,8 +5836,27 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) ret = io_fixed_fd_install(req, issue_flags, file, accept->file_slot); } - __io_req_complete(req, issue_flags, ret, 0); - return 0; + + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __io_req_complete(req, issue_flags, ret, 0); + return 0; + } + if (ret >= 0) { + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + goto retry; + } + ret = -ECANCELED; + } + + return ret; } static int io_connect_prep_async(struct io_kiocb *req) From e7637a492b9f1ae6b7cfcecf0aed5e4c76efa3bd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 15 May 2022 11:19:43 -0600 Subject: [PATCH 098/111] io_uring: fix locking state for empty buffer group io_provided_buffer_select() must drop the submit lock, if needed, even in the error handling case. Failure to do so will leave us with the ctx->uring_lock held, causing spew like: ==================================== WARNING: iou-wrk-366/368 still has locks held! 5.18.0-rc6-00294-gdf8dc7004331 #994 Not tainted ------------------------------------ 1 lock held by iou-wrk-366/368: #0: ffff0000c72598a8 (&ctx->uring_lock){+.+.}-{3:3}, at: io_ring_submit_lock+0x20/0x48 stack backtrace: CPU: 4 PID: 368 Comm: iou-wrk-366 Not tainted 5.18.0-rc6-00294-gdf8dc7004331 #994 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace.part.0+0xa4/0xd4 show_stack+0x14/0x5c dump_stack_lvl+0x88/0xb0 dump_stack+0x14/0x2c debug_check_no_locks_held+0x84/0x90 try_to_freeze.isra.0+0x18/0x44 get_signal+0x94/0x6ec io_wqe_worker+0x1d8/0x2b4 ret_from_fork+0x10/0x20 and triggering later hangs off get_signal() because we attempt to re-grab the lock. 
Reported-by: syzbot+987d7bb19195ae45208c@syzkaller.appspotmail.com Fixes: 149c69b04a90 ("io_uring: abstract out provided buffer list selection") Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d4752e192ef9..2322583f9275 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3467,20 +3467,23 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) { - struct io_buffer *kbuf; + void __user *ret = ERR_PTR(-ENOBUFS); - if (list_empty(&bl->buf_list)) - return ERR_PTR(-ENOBUFS); + if (!list_empty(&bl->buf_list)) { + struct io_buffer *kbuf; + + kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&kbuf->list); + if (*len > kbuf->len) + *len = kbuf->len; + req->flags |= REQ_F_BUFFER_SELECTED; + req->kbuf = kbuf; + req->buf_index = kbuf->bid; + ret = u64_to_user_ptr(kbuf->addr); + } - kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&kbuf->list); - if (*len > kbuf->len) - *len = kbuf->len; - req->flags |= REQ_F_BUFFER_SELECTED; - req->kbuf = kbuf; - req->buf_index = kbuf->bid; io_ring_submit_unlock(req->ctx, issue_flags); - return u64_to_user_ptr(kbuf->addr); + return ret; } static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, From 3d200242a6c968af321913b635fc4014b238cba4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 May 2022 21:47:55 -0600 Subject: [PATCH 099/111] io_uring: add buffer selection support to IORING_OP_NOP Obviously not really useful since it's not transferring data, but it is helpful in benchmarking overhead of provided buffers. Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2322583f9275..dc129c74ed1e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1058,6 +1058,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, + .buffer_select = 1, }, [IORING_OP_READV] = { .needs_file = 1, @@ -4538,7 +4539,17 @@ done: */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - __io_req_complete(req, issue_flags, 0, 0); + void __user *buf; + + if (req->flags & REQ_F_BUFFER_SELECT) { + size_t len = 1; + + buf = io_buffer_select(req, &len, issue_flags); + if (IS_ERR(buf)) + return PTR_ERR(buf); + } + + __io_req_complete(req, issue_flags, 0, io_put_kbuf(req, issue_flags)); return 0; } From d8c2237d0aa9c04b867ce1e281e2a30a86a68e3b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Apr 2022 13:02:27 -0600 Subject: [PATCH 100/111] io_uring: add io_pin_pages() helper Abstract this out from io_sqe_buffer_register() so we can use it elsewhere too without duplicating this code. No intended functional changes in this patch. 
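A hypothetical liburing sketch exercising the IORING_OP_NOP buffer-select support added earlier in the series; since NOP transfers no data, this isolates the overhead of provided-buffer selection. It assumes buffers were provided for the group beforehand, as in the earlier provided-buffer example.

/* hypothetical benchmarking aid: a NOP that consumes a provided buffer */
#include <liburing.h>

static void nop_with_buffer(struct io_uring *ring, unsigned short bgid)
{
    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
    struct io_uring_cqe *cqe;

    io_uring_prep_nop(sqe);
    sqe->flags |= IOSQE_BUFFER_SELECT;   /* pick (and consume) a buffer */
    sqe->buf_group = bgid;

    io_uring_submit(ring);
    io_uring_wait_cqe(ring, &cqe);
    /* the buffer ID is reported as usual via IORING_CQE_F_BUFFER in cqe->flags */
    io_uring_cqe_seen(ring, cqe);
}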
Signed-off-by: Jens Axboe --- fs/io_uring.c | 79 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dc129c74ed1e..384cdbd40941 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9845,30 +9845,18 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) +static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, + int *npages) { - struct io_mapped_ubuf *imu = NULL; + unsigned long start, end, nr_pages; struct vm_area_struct **vmas = NULL; struct page **pages = NULL; - unsigned long off, start, end, ubuf; - size_t size; - int ret, pret, nr_pages, i; + int i, pret, ret = -ENOMEM; - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } - - ubuf = (unsigned long) iov->iov_base; - end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - *pimu = NULL; - ret = -ENOMEM; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto done; @@ -9878,10 +9866,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, if (!vmas) goto done; - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - ret = 0; mmap_read_lock(current->mm); pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, @@ -9899,6 +9883,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, break; } } + *npages = nr_pages; } else { ret = pret < 0 ? pret : -EFAULT; } @@ -9912,14 +9897,53 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, unpin_user_pages(pages, pret); goto done; } + ret = 0; +done: + kvfree(vmas); + if (ret < 0) { + kvfree(pages); + pages = ERR_PTR(ret); + } + return pages; +} - ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, pret); +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage) +{ + struct io_mapped_ubuf *imu = NULL; + struct page **pages = NULL; + unsigned long off; + size_t size; + int ret, nr_pages, i; + + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } + + *pimu = NULL; + ret = -ENOMEM; + + pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, + &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; goto done; } - off = ubuf & ~PAGE_MASK; + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + if (!imu) + goto done; + + ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); + if (ret) { + unpin_user_pages(pages, nr_pages); + goto done; + } + + off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; for (i = 0; i < nr_pages; i++) { size_t vec_len; @@ -9932,8 +9956,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size -= vec_len; } /* store original address for later verification */ - imu->ubuf = ubuf; - imu->ubuf_end = ubuf + iov->iov_len; + imu->ubuf = (unsigned long) iov->iov_base; + imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; *pimu = imu; ret = 0; @@ -9941,7 +9965,6 @@ done: if (ret) kvfree(imu); kvfree(pages); - kvfree(vmas); return ret; } From 
c7fb19428d67dd0a2a78a4f237af01d39c78dc5a Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 30 Apr 2022 14:38:53 -0600
Subject: [PATCH 101/111] io_uring: add support for ring mapped supplied buffers

Provided buffers allow an application to supply io_uring with buffers that can then be grabbed for a read/receive request, when the data source is ready to deliver data. The existing scheme relies on using IORING_OP_PROVIDE_BUFFERS to do that, but it can be difficult to use in real world applications. It's pretty efficient if the application is able to supply back batches of provided buffers when they have been consumed and the application is ready to recycle them, but if fragmentation occurs in the buffer space, it can become difficult to supply enough buffers at a time. This hurts efficiency.

Add a register op, IORING_REGISTER_PBUF_RING, which allows an application to set up a shared queue for each buffer group of provided buffers. The application can then supply buffers simply by adding them to this ring, and the kernel can consume them just as easily. The application updates the shared ring tail as it adds buffers, while the head remains private to the kernel, which advances it as buffers are consumed. Provided buffers set up with IORING_REGISTER_PBUF_RING cannot use IORING_OP_{PROVIDE,REMOVE}_BUFFERS for adding or removing entries to or from the ring; they must use the mapped ring. Mapped provided buffer rings can co-exist with normal provided buffers, just not within the same group ID.

To gauge the overhead of the existing scheme and evaluate the mapped ring approach, a simple NOP benchmark was written. It uses a ring of 128 entries, and submits/completes 32 at a time. 'Replenish' is how many buffers are provided back at a time after they have been consumed:

Test                 Replenish          NOPs/sec
================================================================
No provided buffers  NA                 ~30M
Provided buffers     32                 ~16M
Provided buffers     1                  ~10M
Ring buffers         32                 ~27M
Ring buffers         1                  ~27M

The ring mapped buffers perform almost as well as not using provided buffers at all, and they don't care if you provide 1 or more back at the same time. This means applications can just replenish as they go, rather than needing to batch and compact, further reducing overhead in the application. The NOP benchmark above doesn't need to do any compaction, so that overhead isn't even reflected in the above test.

Co-developed-by: Dylan Yudaken
Signed-off-by: Jens Axboe
--- fs/io_uring.c | 234 ++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 36 ++++++ 2 files changed, 258 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 384cdbd40941..78192a9e7684 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -285,9 +285,26 @@ struct io_rsrc_data { bool quiesce; }; +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) struct io_buffer_list { - struct list_head buf_list; + /* + * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, + * then these are classic provided buffers and ->buf_list is used.
+ */ + union { + struct list_head buf_list; + struct { + struct page **buf_pages; + struct io_uring_buf_ring *buf_ring; + }; + }; __u16 bgid; + + /* below is for ring provided buffers */ + __u16 buf_nr_pages; + __u16 nr_entries; + __u32 head; + __u32 mask; }; struct io_buffer { @@ -804,6 +821,7 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, + REQ_F_BUFFER_RING_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_CREDS_BIT, @@ -855,6 +873,8 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* buffer selected from ring, needs commit */ + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -979,6 +999,12 @@ struct io_kiocb { /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; + + /* + * stores buffer ID for ring provided buffers, valid IFF + * REQ_F_BUFFER_RING is set. + */ + struct io_buffer_list *buf_list; }; union { @@ -1470,8 +1496,14 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { - req->flags &= ~REQ_F_BUFFER_SELECTED; - list_add(&req->kbuf->list, list); + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) + req->buf_list->head++; + req->flags &= ~REQ_F_BUFFER_RING; + } else { + list_add(&req->kbuf->list, list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + } return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); } @@ -1480,7 +1512,7 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { lockdep_assert_held(&req->ctx->completion_lock); - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } @@ -1490,7 +1522,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, { unsigned int cflags; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; /* @@ -1505,7 +1537,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, * We migrate buffers from the comp_list to the issue cache list * when we need one. */ - if (issue_flags & IO_URING_F_UNLOCKED) { + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); @@ -1535,11 +1570,23 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) struct io_buffer_list *bl; struct io_buffer *buf; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; /* don't recycle if we already did IO to this buffer */ if (req->flags & REQ_F_PARTIAL_IO) return; + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. 
+ */ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + return; + } io_ring_submit_lock(ctx, issue_flags); @@ -3487,6 +3534,53 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, return ret; } +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct io_uring_buf *buf; + __u32 head = bl->head; + + if (unlikely(smp_load_acquire(&br->tail) == head)) { + io_ring_submit_unlock(req->ctx, issue_flags); + return ERR_PTR(-ENOBUFS); + } + + head &= bl->mask; + if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + buf = &br->bufs[head]; + } else { + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; + buf = page_address(bl->buf_pages[index]); + buf += off; + } + if (*len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + req->buf_index = buf->bid; + + if (!(issue_flags & IO_URING_F_UNLOCKED)) + return u64_to_user_ptr(buf->addr); + + /* + * If we came in unlocked, we have no choice but to + * consume the buffer here. This does mean it'll be + * pinned until the IO completes. But coming in + * unlocked means we're in io-wq context, hence there + * should be no further retry. For the locked case, the + * caller must ensure to call the commit when the + * transfer completes (or if we get -EAGAIN and must + * poll or retry). + */ + req->buf_list = NULL; + bl->head++; + io_ring_submit_unlock(req->ctx, issue_flags); + return u64_to_user_ptr(buf->addr); +} + static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags) { @@ -3502,6 +3596,9 @@ static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, } /* selection helpers drop the submit lock again, if needed */ + if (bl->buf_nr_pages) + return io_ring_buffer_select(req, len, bl, issue_flags); + return io_provided_buffer_select(req, len, bl, issue_flags); } @@ -3558,7 +3655,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - if (req->flags & REQ_F_BUFFER_SELECTED) { + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { iov[0].iov_base = u64_to_user_ptr(req->rw.addr); iov[0].iov_len = req->rw.len; return 0; @@ -3578,7 +3675,7 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) { if (!(req->flags & REQ_F_BUFFER_SELECT)) return false; - return !(req->flags & REQ_F_BUFFER_SELECTED); + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, @@ -4872,6 +4969,18 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; + if (bl->buf_nr_pages) { + int j; + + i = bl->buf_ring->tail - bl->head; + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + return i; + } + /* the head kbuf is the list itself */ while (!list_empty(&bl->buf_list)) { struct io_buffer *nxt; @@ -4898,8 +5007,12 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = -ENOENT; bl = io_buffer_get_list(ctx, p->bgid); - if (bl) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + if (bl) { + ret = -EINVAL; + /* can't use 
provide/remove buffers command on mapped buffers */ + if (!bl->buf_nr_pages) + ret = __io_remove_buffers(ctx, bl, p->nbufs); + } if (ret < 0) req_set_fail(req); @@ -5047,7 +5160,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { - bl = kmalloc(sizeof(*bl), GFP_KERNEL); + bl = kzalloc(sizeof(*bl), GFP_KERNEL); if (!bl) { ret = -ENOMEM; goto err; @@ -5058,6 +5171,11 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) goto err; } } + /* can't add buffers via this command for a mapped buffer ring */ + if (bl->buf_nr_pages) { + ret = -EINVAL; + goto err; + } ret = io_add_buffers(ctx, p, bl); err: @@ -12011,6 +12129,83 @@ err: return ret; } +static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_ring *br; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + struct page **pages; + int nr_pages; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + if (!is_power_of_2(reg.ring_entries)) + return -EINVAL; + + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { + int ret = io_init_bl_list(ctx); + if (ret) + return ret; + } + + bl = io_buffer_get_list(ctx, reg.bgid); + if (bl && bl->buf_nr_pages) + return -EEXIST; + if (!bl) { + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + } + + pages = io_pin_pages(reg.ring_addr, + struct_size(br, bufs, reg.ring_entries), + &nr_pages); + if (IS_ERR(pages)) { + kfree(bl); + return PTR_ERR(pages); + } + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->nr_entries = reg.ring_entries; + bl->buf_ring = br; + bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +} + +static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, reg.bgid); + if (!bl) + return -ENOENT; + if (!bl->buf_nr_pages) + return -EINVAL; + + __io_remove_buffers(ctx, bl, -1U); + if (bl->bgid >= BGID_ARRAY) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree(bl); + } + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -12142,6 +12337,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_UNREGISTER_RING_FDS: ret = io_ringfd_unregister(ctx, arg, nr_args); break; + case IORING_REGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_pbuf_ring(ctx, arg); + break; + case IORING_UNREGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_unregister_pbuf_ring(ctx, arg); + break; default: ret = -EINVAL; break; @@ -12227,6 +12434,9 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); + BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); + BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != + offsetof(struct io_uring_buf_ring, tail)); /* should fit into one byte */ 
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 15f821af9242..ddf969ae5a79 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -384,6 +384,10 @@ enum { IORING_REGISTER_RING_FDS = 20, IORING_UNREGISTER_RING_FDS = 21, + /* register ring based provide buffer group */ + IORING_REGISTER_PBUF_RING = 22, + IORING_UNREGISTER_PBUF_RING = 23, + /* this goes last */ IORING_REGISTER_LAST }; @@ -461,6 +465,38 @@ struct io_uring_restriction { __u32 resv2[3]; }; +struct io_uring_buf { + __u64 addr; + __u32 len; + __u16 bid; + __u16 resv; +}; + +struct io_uring_buf_ring { + union { + /* + * To avoid spilling into more pages than we need to, the + * ring tail is overlaid with the io_uring_buf->resv field. + */ + struct { + __u64 resv1; + __u32 resv2; + __u16 resv3; + __u16 tail; + }; + struct io_uring_buf bufs[0]; + }; +}; + +/* argument for IORING_(UN)REGISTER_PBUF_RING */ +struct io_uring_buf_reg { + __u64 ring_addr; + __u32 ring_entries; + __u16 bgid; + __u16 pad; + __u64 resv[3]; +}; + /* * io_uring_restriction->opcode values */ From 20cbd21d899b72765e38481a926c7b2008c64350 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 May 2022 10:40:00 +0200 Subject: [PATCH 102/111] io_uring: use a rwf_t for io_rw.flags Use the proper type. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220518084005.3255380-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 78192a9e7684..53ddfb57e4be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -640,7 +640,7 @@ struct io_rw { struct kiocb kiocb; u64 addr; u32 len; - u32 flags; + rwf_t flags; }; struct io_connect { From 984824db844a9bd6e9e15ee469241982526a6ccd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 May 2022 10:40:01 +0200 Subject: [PATCH 103/111] io_uring: don't use ERR_PTR for user pointers ERR_PTR abuses the high bits of a pointer to transport error information. This is only safe for kernel pointers and not user pointers. Fix io_buffer_select and its helpers to just return NULL for failure and get rid of this abuse. 
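To make the problem concrete: IS_ERR() treats any value in the top 4095 bytes of the address space as an encoded errno. That is a safe convention for kernel pointers, but nothing about a user-supplied address guarantees it stays out of, or cannot be confused with, that range. A stand-alone illustration of the check (the specific addresses are hypothetical):

#include <stdio.h>

#define MAX_ERRNO 4095

/* the comparison the kernel's IS_ERR_VALUE() boils down to */
static int looks_like_error(unsigned long ptr)
{
	return ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	unsigned long err_encoded = (unsigned long)-105;  /* ERR_PTR(-ENOBUFS) */
	unsigned long user_addr   = (unsigned long)-4000; /* some very high address */

	/* both print 1: the check cannot tell the error apart from a pointer */
	printf("%d %d\n", looks_like_error(err_encoded),
	       looks_like_error(user_addr));
	return 0;
}

Returning NULL on failure, as this patch does, sidesteps the ambiguity entirely for __user pointers.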
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220518084005.3255380-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/io_uring.c | 83 +++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 53ddfb57e4be..9cac9b0ffebe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3512,11 +3512,8 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, } static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) + struct io_buffer_list *bl) { - void __user *ret = ERR_PTR(-ENOBUFS); - if (!list_empty(&bl->buf_list)) { struct io_buffer *kbuf; @@ -3527,11 +3524,9 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; req->buf_index = kbuf->bid; - ret = u64_to_user_ptr(kbuf->addr); + return u64_to_user_ptr(kbuf->addr); } - - io_ring_submit_unlock(req->ctx, issue_flags); - return ret; + return NULL; } static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, @@ -3544,7 +3539,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (unlikely(smp_load_acquire(&br->tail) == head)) { io_ring_submit_unlock(req->ctx, issue_flags); - return ERR_PTR(-ENOBUFS); + return NULL; } head &= bl->mask; @@ -3562,22 +3557,19 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->buf_list = bl; req->buf_index = buf->bid; - if (!(issue_flags & IO_URING_F_UNLOCKED)) - return u64_to_user_ptr(buf->addr); - - /* - * If we came in unlocked, we have no choice but to - * consume the buffer here. This does mean it'll be - * pinned until the IO completes. But coming in - * unlocked means we're in io-wq context, hence there - * should be no further retry. For the locked case, the - * caller must ensure to call the commit when the - * transfer completes (or if we get -EAGAIN and must - * poll or retry). - */ - req->buf_list = NULL; - bl->head++; - io_ring_submit_unlock(req->ctx, issue_flags); + if (issue_flags & IO_URING_F_UNLOCKED) { + /* + * If we came in unlocked, we have no choice but to consume the + * buffer here. This does mean it'll be pinned until the IO + * completes. But coming in unlocked means we're in io-wq + * context, hence there should be no further retry. For the + * locked case, the caller must ensure to call the commit when + * the transfer completes (or if we get -EAGAIN and must poll + * or retry). 
+ */ + req->buf_list = NULL; + bl->head++; + } return u64_to_user_ptr(buf->addr); } @@ -3586,20 +3578,19 @@ static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; + void __user *ret = NULL; io_ring_submit_lock(req->ctx, issue_flags); bl = io_buffer_get_list(ctx, req->buf_index); - if (unlikely(!bl)) { - io_ring_submit_unlock(req->ctx, issue_flags); - return ERR_PTR(-ENOBUFS); + if (likely(bl)) { + if (bl->buf_nr_pages) + ret = io_ring_buffer_select(req, len, bl, issue_flags); + else + ret = io_provided_buffer_select(req, len, bl); } - - /* selection helpers drop the submit lock again, if needed */ - if (bl->buf_nr_pages) - return io_ring_buffer_select(req, len, bl, issue_flags); - - return io_provided_buffer_select(req, len, bl, issue_flags); + io_ring_submit_unlock(req->ctx, issue_flags); + return ret; } #ifdef CONFIG_COMPAT @@ -3621,8 +3612,8 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, len = clen; buf = io_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + if (!buf) + return -ENOBUFS; req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; req->rw.len = iov[0].iov_len = (compat_size_t) len; @@ -3644,8 +3635,8 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, if (len < 0) return -EINVAL; buf = io_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + if (!buf) + return -ENOBUFS; req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; req->rw.len = iov[0].iov_len = len; @@ -3702,8 +3693,8 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (io_do_buffer_select(req)) { buf = io_buffer_select(req, &sqe_len, issue_flags); - if (IS_ERR(buf)) - return ERR_CAST(buf); + if (!buf) + return ERR_PTR(-ENOBUFS); req->rw.addr = (unsigned long) buf; req->rw.len = sqe_len; } @@ -4642,8 +4633,8 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) size_t len = 1; buf = io_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + if (!buf) + return -ENOBUFS; } __io_req_complete(req, issue_flags, 0, io_put_kbuf(req, issue_flags)); @@ -5779,8 +5770,8 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) void __user *buf; buf = io_buffer_select(req, &sr->len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + if (!buf) + return -ENOBUFS; kmsg->fast_iov[0].iov_base = buf; kmsg->fast_iov[0].iov_len = sr->len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, @@ -5843,8 +5834,8 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) void __user *buf; buf = io_buffer_select(req, &sr->len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + if (!buf) + return -ENOBUFS; sr->buf = buf; } From ee67ba3b20f7dcd001b7743eb8e46880cb27fdc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 May 2022 10:40:02 +0200 Subject: [PATCH 104/111] io_uring: drop a spurious inline on a forward declaration io_file_get_normal isn't marked inline, so don't claim it as such in the forward declaration. 
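In miniature, with hypothetical names, the pattern being cleaned up is a forward declaration that promises more than the definition delivers. It builds either way; the declaration is simply misleading, which is all this patch corrects:

#include <stdio.h>

static inline int get_value(int fd);	/* declaration claims inline */

static int get_value(int fd)		/* definition does not */
{
	return fd * 2;
}

int main(void)
{
	printf("%d\n", get_value(21));
	return 0;
}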
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220518084005.3255380-4-hch@lst.de Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9cac9b0ffebe..0b1e10d5fbd2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1303,7 +1303,7 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, static void io_clean_op(struct io_kiocb *req); static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd); +static struct file *io_file_get_normal(struct io_kiocb *req, int fd); static void io_drop_inflight_file(struct io_kiocb *req); static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); static void io_queue_sqe(struct io_kiocb *req); From 58f5c8d39e0ea07fdaaea6a85c49000da83dc0cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 May 2022 10:40:03 +0200 Subject: [PATCH 105/111] io_uring: make apoll_events a __poll_t apoll_events is fed to vfs_poll and the poll tables, so it should be a __poll_t. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220518084005.3255380-5-hch@lst.de Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0b1e10d5fbd2..c4214bfc7039 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1011,7 +1011,7 @@ struct io_kiocb { /* used by request caches, completion batching and iopoll */ struct io_wq_work_node comp_list; /* cache ->apoll->events */ - int apoll_events; + __poll_t apoll_events; }; atomic_t refs; atomic_t poll_refs; @@ -6297,7 +6297,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, int events) +static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) { req->cqe.res = mask; /* @@ -6316,7 +6316,8 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, int events) io_req_task_work_add(req, false); } -static inline void io_poll_execute(struct io_kiocb *req, int res, int events) +static inline void io_poll_execute(struct io_kiocb *req, int res, + __poll_t events) { if (io_poll_get_ownership(req)) __io_poll_execute(req, res, events); From a294bef57c55a45aef51d31e71d6892e8eba1483 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 May 2022 10:40:04 +0200 Subject: [PATCH 106/111] io_uring: consistently use the EPOLL* defines POLL* are unannotated values for the userspace ABI, while everything in-kernel should use EPOLL* and the __poll_t type. 
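For reference, the two families carry the same bit values and differ only in their sparse annotation, which is why the in-kernel EPOLL* spelling is preferred. Shown approximately as defined in the uapi headers (an excerpt for illustration, not new code):

/* include/uapi/asm-generic/poll.h: plain integers, part of the user ABI */
#define POLLIN		0x0001
#define POLLPRI		0x0002
#define POLLOUT		0x0004

/* include/uapi/linux/eventpoll.h: same bits, but typed as __poll_t so that
 * sparse can flag in-kernel code mixing them with plain integers */
#define EPOLLIN		(__force __poll_t)0x00000001
#define EPOLLPRI	(__force __poll_t)0x00000002
#define EPOLLOUT	(__force __poll_t)0x00000004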
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220518084005.3255380-6-hch@lst.de Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c4214bfc7039..e24eb20f7e30 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6332,7 +6332,7 @@ static void io_poll_cancel_req(struct io_kiocb *req) #define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) #define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) -#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | POLLPRI) +#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) @@ -6537,14 +6537,14 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) mask |= EPOLLONESHOT; if (def->pollin) { - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ if ((req->opcode == IORING_OP_RECVMSG) && (req->sr_msg.msg_flags & MSG_ERRQUEUE)) - mask &= ~POLLIN; + mask &= ~EPOLLIN; } else { - mask |= POLLOUT | POLLWRNORM; + mask |= EPOLLOUT | EPOLLWRNORM; } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; From 0bf1dbee9baf3e78bff297245178f8c9a8ef8670 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 May 2022 10:40:05 +0200 Subject: [PATCH 107/111] io_uring: use rcu_dereference in io_close Accessing the file table needs a rcu_dereference_protected(). Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220518084005.3255380-7-hch@lst.de Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e24eb20f7e30..7b27afdab0e6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5372,7 +5372,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) spin_unlock(&files->file_lock); goto err; } - file = fdt->fd[close->fd]; + file = rcu_dereference_protected(fdt->fd[close->fd], + lockdep_is_held(&files->file_lock)); if (!file || file->f_op == &io_uring_fops) { spin_unlock(&files->file_lock); file = NULL; From 0184f08e65348f39aa4e8a71927e4538515f4ac0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 May 2022 19:13:49 +0100 Subject: [PATCH 108/111] io_uring: add fully sparse buffer registration Honour IORING_RSRC_REGISTER_SPARSE not only for direct files but fixed buffers as well. It makes the rsrc API more consistent. 
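A user-space sketch of what this enables: an application can reserve a table of fixed-buffer slots up front and fill them in later via updates, instead of registering real buffers for every slot. This assumes a liburing new enough to carry the sparse and update helpers (roughly the 2.2-era API); names are as I recall them and error handling is trimmed.

#include <liburing.h>
#include <sys/uio.h>
#include <stdlib.h>

int main(void)
{
	struct io_uring ring;
	struct iovec iov;
	__u64 tag = 0;

	io_uring_queue_init(8, &ring, 0);

	/* reserve 1024 empty (sparse) buffer slots; no memory is pinned yet */
	io_uring_register_buffers_sparse(&ring, 1024);

	/* later: plug a real buffer into slot 42 */
	iov.iov_base = malloc(4096);
	iov.iov_len = 4096;
	io_uring_register_buffers_update_tag(&ring, 42, &iov, &tag, 1);

	io_uring_queue_exit(&ring);
	return 0;
}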
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/66f429e4912fe39fb3318217ff33a2853d4544be.1652879898.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7b27afdab0e6..a210a2c0429d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10134,12 +10134,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; + if (arg) { + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + ret = io_buffer_validate(&iov); + if (ret) + break; + } else { + memset(&iov, 0, sizeof(iov)); + } + if (!iov.iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; @@ -11986,7 +11991,7 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE) + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); @@ -12224,6 +12229,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, switch (opcode) { case IORING_REGISTER_BUFFERS: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_BUFFERS: From 1d0dbbfa282d9be57792e3b5827dc57b010181ee Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 May 2022 14:36:18 -0600 Subject: [PATCH 109/111] io_uring: initialize io_buffer_list head when shared ring is unregistered We use ->buf_pages != 0 to tell if this is a shared buffer ring or a classic provided buffer group. If we unregister the shared ring and then attempt to use it, buf_pages is zero yet the classic list head isn't properly initialized. This causes io_buffer_select() to think that we have classic buffers available, but then we crash when we try and get one from the list. Just initialize the list if we unregister a shared buffer ring, leaving it in a sane state for either re-registration or for attempting to use it. And do the same for the initial setup from the classic path. Fixes: c7fb19428d67 ("io_uring: add support for ring mapped supplied buffers") Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index a210a2c0429d..8fc3dd49bc04 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4969,6 +4969,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, kvfree(bl->buf_pages); bl->buf_pages = NULL; bl->buf_nr_pages = 0; + /* make sure it's seen as empty */ + INIT_LIST_HEAD(&bl->buf_list); return i; } @@ -5156,6 +5158,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = -ENOMEM; goto err; } + INIT_LIST_HEAD(&bl->buf_list); ret = io_buffer_add_list(ctx, bl, p->bgid); if (ret) { kfree(bl); From 2fcabce2d7d34f69a888146dab15b36a917f09d4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 May 2022 15:34:37 -0600 Subject: [PATCH 110/111] io_uring: disallow mixed provided buffer group registrations It's nonsensical to register a provided buffer ring, if a classic provided buffer group with the same ID exists. 
Depending on the order in which we decide what type to pick, the other type will never get used. Explicitly disallow it and return an error if this is attempted. Fixes: c7fb19428d67 ("io_uring: add support for ring mapped supplied buffers") Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8fc3dd49bc04..0bc5fbf64e49 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -12157,9 +12157,11 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) } bl = io_buffer_get_list(ctx, reg.bgid); - if (bl && bl->buf_nr_pages) - return -EEXIST; - if (!bl) { + if (bl) { + /* if mapped buffer ring OR classic exists, don't allow */ + if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + return -EEXIST; + } else { bl = kzalloc(sizeof(*bl), GFP_KERNEL); if (!bl) return -ENOMEM; From 0e7579ca732a39cc377e17509dda9bfc4f6ba78e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 19 May 2022 17:30:49 +0300 Subject: [PATCH 111/111] io_uring: fix incorrect __kernel_rwf_t cast

Currently 'make C=1 fs/io_uring.o' generates a sparse warning:

CHECK fs/io_uring.c
fs/io_uring.c: note: in included file (through include/trace/trace_events.h, include/trace/define_trace.h, include/trace/events/io_uring.h):
./include/trace/events/io_uring.h:488:1: warning: incorrect type in assignment (different base types)
    expected unsigned int [usertype] op_flags
    got restricted __kernel_rwf_t const [usertype] rw_flags

This happens on the cast of sqe->rw_flags, which is defined as __kernel_rwf_t; the type is bitwise and requires a __force attribute for any cast. However, rw_flags is a member of a union, and the same bits can safely be read through one of its neighbours instead, so let's use poll32_events to fix the sparse warning.

Signed-off-by: Vasily Averin Link: https://lore.kernel.org/r/6f009241-a63f-ae43-a04b-62841aaef293@openvz.org Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 42534ec2ab9d..7cfd9c5cff37 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -520,7 +520,7 @@ TRACE_EVENT(io_uring_req_failed, __entry->off = sqe->off; __entry->addr = sqe->addr; __entry->len = sqe->len; - __entry->op_flags = sqe->rw_flags; + __entry->op_flags = sqe->poll32_events; __entry->buf_index = sqe->buf_index; __entry->personality = sqe->personality; __entry->file_index = sqe->file_index;
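To close with a usage illustration for the IORING_REGISTER_PBUF_RING interface added in [PATCH 101/111] above: the sketch below drives it through the raw io_uring_register(2) syscall using the uapi structures introduced there. It assumes uapi headers that carry those definitions, "ring_fd" is an already set up io_uring instance, and ring setup, error handling and the read/recv that actually consumes a buffer are omitted; newer liburing releases provide helpers for all of this.

#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

#define RING_ENTRIES	128	/* must be a power of 2 */
#define BGID		7

static int register_pbuf_ring(int ring_fd, struct io_uring_buf_ring **out)
{
	struct io_uring_buf_reg reg;
	struct io_uring_buf_ring *br;

	/* ring memory is supplied by the application and must be page aligned */
	br = mmap(NULL, RING_ENTRIES * sizeof(struct io_uring_buf),
		  PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (br == MAP_FAILED)
		return -1;

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr = (unsigned long)br;
	reg.ring_entries = RING_ENTRIES;
	reg.bgid = BGID;

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_PBUF_RING, &reg, 1) < 0)
		return -1;
	*out = br;
	return 0;
}

static void add_buffer(struct io_uring_buf_ring *br, void *addr,
		       unsigned int len, unsigned short bid,
		       unsigned short *tail)
{
	struct io_uring_buf *buf = &br->bufs[*tail & (RING_ENTRIES - 1)];

	buf->addr = (unsigned long)addr;
	buf->len = len;
	buf->bid = bid;
	(*tail)++;
	/* publish the new tail so the kernel's acquire load sees the entry */
	__atomic_store_n(&br->tail, *tail, __ATOMIC_RELEASE);
}

Requests then select from the group by setting IOSQE_BUFFER_SELECT and buf_group = BGID as with classic provided buffers; the application replenishes simply by calling add_buffer() again once a completion has reported the buffer id.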