From 9921d5013a6e51892623bf2f1c5b49eaecda55ac Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 27 Oct 2022 00:11:53 +0100
Subject: [PATCH 1/6] selftests/net: don't tests batched TCP io_uring zc

It doesn't make sense batch submitting io_uring requests to a single TCP
socket without linking or some other kind of ordering. Moreover, it
causes spurious -EINTR fails due to interaction with task_work. Disable
it for now and keep queue depth=1.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/b547698d5938b1b1a898af1c260188d8546ded9a.1666700897.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/net/io_uring_zerocopy_tx.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/io_uring_zerocopy_tx.sh b/tools/testing/selftests/net/io_uring_zerocopy_tx.sh
index 32aa6e9dacc2..9ac4456d48fc 100755
--- a/tools/testing/selftests/net/io_uring_zerocopy_tx.sh
+++ b/tools/testing/selftests/net/io_uring_zerocopy_tx.sh
@@ -29,7 +29,7 @@ if [[ "$#" -eq "0" ]]; then
 	for IP in "${IPs[@]}"; do
 		for mode in $(seq 1 3); do
 			$0 "$IP" udp -m "$mode" -t 1 -n 32
-			$0 "$IP" tcp -m "$mode" -t 1 -n 32
+			$0 "$IP" tcp -m "$mode" -t 1 -n 1
 		done
 	done
 

From 6dcabcd398946e2b0b776a8310291aeebe1ca0e6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 6 Nov 2022 13:17:27 -0700
Subject: [PATCH 2/6] io_uring: fix typo in io_uring.h comment

Just a basic s/thig/this swap, fixing up a typo introduced by a commit
added in the 6.1 release.

Fixes: 9cda70f622cd ("io_uring: introduce fixed buffer support for io_uring_cmd")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ab7458033ee3..2df3225b562f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -222,7 +222,7 @@ enum io_uring_op {
 
 /*
  * sqe->uring_cmd_flags
- * IORING_URING_CMD_FIXED	use registered buffer; pass thig flag
+ * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
  *				along with setting sqe->buf_index.
  */
 #define IORING_URING_CMD_FIXED	(1U << 0)

From 0fc8c2acbfc789a977a50a4a9812a8e4b37958ce Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@meta.com>
Date: Tue, 8 Nov 2022 07:30:16 -0800
Subject: [PATCH 3/6] io_uring: calculate CQEs from the user visible value

io_cqring_wait (and it's wake function io_has_work) used cached_cq_tail in
order to calculate the number of CQEs. cached_cq_tail is set strictly
before the user visible rings->cq.tail

However as far as userspace is concerned,  if io_uring_enter(2) is called
with a minimum number of events, they will verify by checking
rings->cq.tail.

It is therefore possible for io_uring_enter(2) to return early with fewer
events visible to the user.

Instead make the wait functions read from the user visible value, so there
will be no discrepency.

This is triggered eventually by the following reproducer:

struct io_uring_sqe *sqe;
struct io_uring_cqe *cqe;
unsigned int cqe_ready;
struct io_uring ring;
int ret, i;

ret = io_uring_queue_init(N, &ring, 0);
assert(!ret);
while(true) {
	for (i = 0; i < N; i++) {
		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_nop(sqe);
		sqe->flags |= IOSQE_ASYNC;
	}
	ret = io_uring_submit(&ring);
	assert(ret == N);

	do {
		ret = io_uring_wait_cqes(&ring, &cqe, N, NULL, NULL);
	} while(ret == -EINTR);
	cqe_ready = io_uring_cq_ready(&ring);
	assert(!ret);
	assert(cqe_ready == N);
	io_uring_cq_advance(&ring, N);
}

Fixes: ad3eb2c89fb2 ("io_uring: split overflow state into SQ and CQ side")
Signed-off-by: Dylan Yudaken <dylany@meta.com>
Link: https://lore.kernel.org/r/20221108153016.1854297-1-dylany@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ac8c488e3077..4a1e482747cc 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -176,6 +176,11 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
 }
 
+static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
+{
+	return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
+}
+
 static bool io_match_linked(struct io_kiocb *head)
 {
 	struct io_kiocb *req;
@@ -2315,7 +2320,7 @@ static inline bool io_has_work(struct io_ring_ctx *ctx)
 static inline bool io_should_wake(struct io_wait_queue *iowq)
 {
 	struct io_ring_ctx *ctx = iowq->ctx;
-	int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
+	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
 
 	/*
 	 * Wake up if we have enough events, or if a timeout occurred since we
@@ -2399,7 +2404,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 		io_cqring_overflow_flush(ctx);
 
-		if (io_cqring_events(ctx) >= min_events)
+		/* if user messes with these they will just get an early return */
+		if (__io_cqring_events_user(ctx) >= min_events)
 			return 0;
 	} while (ret > 0);
 

From 3851d25c75ed03117268a8feb34adca5a843a126 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 10 Nov 2022 10:50:55 -0700
Subject: [PATCH 4/6] io_uring: check for rollover of buffer ID when providing
 buffers

We already check if the chosen starting offset for the buffer IDs fit
within an unsigned short, as 65535 is the maximum value for a provided
buffer. But if the caller asks to add N buffers at offset M, and M + N
would exceed the size of the unsigned short, we simply add buffers with
wrapping around the ID.

This is not necessarily a bug and could in fact be a valid use case, but
it seems confusing and inconsistent with the initial check for starting
offset. Let's check for wrap consistently, and error the addition if we
do need to wrap.

Reported-by: Olivier Langlois <olivier@trillion01.com>
Link: https://github.com/axboe/liburing/issues/726
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/kbuf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 25cd724ade18..e2c46889d5fa 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -346,6 +346,8 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	tmp = READ_ONCE(sqe->off);
 	if (tmp > USHRT_MAX)
 		return -E2BIG;
+	if (tmp + p->nbufs >= USHRT_MAX)
+		return -EINVAL;
 	p->bid = tmp;
 	return 0;
 }

From 30a33669fa21cd3dc7d92a00ba736358059014b7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 11 Nov 2022 16:51:29 +0000
Subject: [PATCH 5/6] io_uring/poll: fix double poll req->flags races

io_poll_double_prepare()            | io_poll_wake()
                                    | poll->head = NULL
smp_load(&poll->head); /* NULL */   |
flags = req->flags;                 |
                                    | req->flags &= ~SINGLE_POLL;
req->flags = flags | DOUBLE_POLL    |

The idea behind io_poll_double_prepare() is to serialise with the
first poll entry by taking the wq lock. However, it's not safe to assume
that io_poll_wake() is not running when we can't grab the lock and so we
may race modifying req->flags.

Skip double poll setup if that happens. It's ok because the first poll
entry will only be removed when it's definitely completing, e.g.
pollfree or oneshot with a valid mask.

Fixes: 49f1c68e048f1 ("io_uring: optimise submission side poll_refs")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/b7fab2d502f6121a7d7b199fe4d914a43ca9cdfd.1668184658.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/poll.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 0d9f49c575e0..97c214aa688c 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -394,7 +394,8 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	return 1;
 }
 
-static void io_poll_double_prepare(struct io_kiocb *req)
+/* fails only when polling is already completing by the first entry */
+static bool io_poll_double_prepare(struct io_kiocb *req)
 {
 	struct wait_queue_head *head;
 	struct io_poll *poll = io_poll_get_single(req);
@@ -403,20 +404,20 @@ static void io_poll_double_prepare(struct io_kiocb *req)
 	rcu_read_lock();
 	head = smp_load_acquire(&poll->head);
 	/*
-	 * poll arm may not hold ownership and so race with
-	 * io_poll_wake() by modifying req->flags. There is only one
-	 * poll entry queued, serialise with it by taking its head lock.
+	 * poll arm might not hold ownership and so race for req->flags with
+	 * io_poll_wake(). There is only one poll entry queued, serialise with
+	 * it by taking its head lock. As we're still arming the tw hanlder
+	 * is not going to be run, so there are no races with it.
 	 */
-	if (head)
+	if (head) {
 		spin_lock_irq(&head->lock);
-
-	req->flags |= REQ_F_DOUBLE_POLL;
-	if (req->opcode == IORING_OP_POLL_ADD)
-		req->flags |= REQ_F_ASYNC_DATA;
-
-	if (head)
+		req->flags |= REQ_F_DOUBLE_POLL;
+		if (req->opcode == IORING_OP_POLL_ADD)
+			req->flags |= REQ_F_ASYNC_DATA;
 		spin_unlock_irq(&head->lock);
+	}
 	rcu_read_unlock();
+	return !!head;
 }
 
 static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
@@ -454,7 +455,11 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
 		/* mark as double wq entry */
 		wqe_private |= IO_WQE_F_DOUBLE;
 		io_init_poll_iocb(poll, first->events, first->wait.func);
-		io_poll_double_prepare(req);
+		if (!io_poll_double_prepare(req)) {
+			/* the request is completing, just back off */
+			kfree(poll);
+			return;
+		}
 		*poll_ptr = poll;
 	} else {
 		/* fine to modify, there is no poll queued to race with us */

From 5576035f15dfcc6cb1cec236db40c2c0733b0ba4 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 11 Nov 2022 16:51:30 +0000
Subject: [PATCH 6/6] io_uring/poll: lockdep annote io_poll_req_insert_locked

Add a lockdep annotation in io_poll_req_insert_locked().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/8115d8e702733754d0aea119e9b5bb63d1eb8b24.1668184658.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/poll.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 97c214aa688c..f500506984ec 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -116,6 +116,8 @@ static void io_poll_req_insert_locked(struct io_kiocb *req)
 	struct io_hash_table *table = &req->ctx->cancel_table_locked;
 	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
 
+	lockdep_assert_held(&req->ctx->uring_lock);
+
 	hlist_add_head(&req->hash_node, &table->hbs[index].list);
 }