From 15b2219facadec583c24523eed40fa45865f859f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Mar 2021 20:20:42 -0700 Subject: [PATCH 01/14] kernel: freezer should treat PF_IO_WORKER like PF_KTHREAD for freezing Don't send fake signals to PF_IO_WORKER threads, they don't accept signals. Just treat them like kthreads in this regard, all they need is a wakeup as no forced kernel/user transition is needed. Suggested-by: Linus Torvalds Signed-off-by: Jens Axboe --- kernel/freezer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/freezer.c b/kernel/freezer.c index dc520f01f99d..1a2d57d1327c 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -134,7 +134,7 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & PF_KTHREAD)) + if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) fake_signal_wake_up(p); else wake_up_state(p, TASK_INTERRUPTIBLE); From 16efa4fce3b7af17bb45d635c3e89992d721e0f3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Mar 2021 20:26:13 -0700 Subject: [PATCH 02/14] io_uring: allow IO worker threads to be frozen With the freezer using the proper signaling to notify us of when it's time to freeze a thread, we can re-enable normal freezer usage for the IO threads. Ensure that SQPOLL, io-wq, and the io-wq manager call try_to_freeze() appropriately, and remove the default setting of PF_NOFREEZE from create_io_thread(). Signed-off-by: Jens Axboe --- fs/io-wq.c | 6 +++++- fs/io_uring.c | 1 + kernel/fork.c | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 0ae9ecadf295..e05f996d088f 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -488,6 +488,8 @@ static int io_wqe_worker(void *data) set_task_comm(current, buf); while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { + long ret; + set_current_state(TASK_INTERRUPTIBLE); loop: raw_spin_lock_irq(&wqe->lock); @@ -498,7 +500,8 @@ loop: __io_worker_idle(wqe, worker); raw_spin_unlock_irq(&wqe->lock); io_flush_signals(); - if (schedule_timeout(WORKER_IDLE_TIMEOUT)) + ret = schedule_timeout(WORKER_IDLE_TIMEOUT); + if (try_to_freeze() || ret) continue; if (fatal_signal_pending(current)) break; @@ -709,6 +712,7 @@ static int io_wq_manager(void *data) set_current_state(TASK_INTERRUPTIBLE); io_wq_check_workers(wq); schedule_timeout(HZ); + try_to_freeze(); if (fatal_signal_pending(current)) set_bit(IO_WQ_BIT_EXIT, &wq->state); } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); diff --git a/fs/io_uring.c b/fs/io_uring.c index a4bce17af506..05adc4887ef3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6752,6 +6752,7 @@ static int io_sq_thread(void *data) up_read(&sqd->rw_lock); schedule(); + try_to_freeze(); down_read(&sqd->rw_lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); diff --git a/kernel/fork.c b/kernel/fork.c index 72e444cd0ffe..d3171e8e88e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2436,7 +2436,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) if (!IS_ERR(tsk)) { sigfillset(&tsk->blocked); sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); - tsk->flags |= PF_NOFREEZE; } return tsk; } From 9e15c3a0ced5a61f320b989072c24983cb1620c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Mar 2021 12:29:43 -0700 Subject: [PATCH 03/14] io_uring: convert io_buffer_idr to XArray Like we did for the personality idr, convert the IO buffer idr to use XArray. This avoids a use-after-free on removal of entries, since idr doesn't like doing so from inside an iterator, and it nicely reduces the amount of code we need to support this feature. Fixes: 5a2e745d4d43 ("io_uring: buffer registration infrastructure") Cc: stable@vger.kernel.org Cc: Matthew Wilcox Cc: yangerkun Reported-by: Hulk Robot Signed-off-by: Jens Axboe --- fs/io_uring.c | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 05adc4887ef3..58d62dd9f8e4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -402,7 +402,7 @@ struct io_ring_ctx { struct socket *ring_sock; #endif - struct idr io_buffer_idr; + struct xarray io_buffers; struct xarray personalities; u32 pers_next; @@ -1135,7 +1135,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); - idr_init(&ctx->io_buffer_idr); + xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); @@ -2843,7 +2843,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, lockdep_assert_held(&req->ctx->uring_lock); - head = idr_find(&req->ctx->io_buffer_idr, bgid); + head = xa_load(&req->ctx->io_buffers, bgid); if (head) { if (!list_empty(&head->list)) { kbuf = list_last_entry(&head->list, struct io_buffer, @@ -2851,7 +2851,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, list_del(&kbuf->list); } else { kbuf = head; - idr_remove(&req->ctx->io_buffer_idr, bgid); + xa_erase(&req->ctx->io_buffers, bgid); } if (*len > kbuf->len) *len = kbuf->len; @@ -3892,7 +3892,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, } i++; kfree(buf); - idr_remove(&ctx->io_buffer_idr, bgid); + xa_erase(&ctx->io_buffers, bgid); return i; } @@ -3910,7 +3910,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); ret = -ENOENT; - head = idr_find(&ctx->io_buffer_idr, p->bgid); + head = xa_load(&ctx->io_buffers, p->bgid); if (head) ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); if (ret < 0) @@ -3993,21 +3993,14 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); - list = head = idr_find(&ctx->io_buffer_idr, p->bgid); + list = head = xa_load(&ctx->io_buffers, p->bgid); ret = io_add_buffers(p, &head); - if (ret < 0) - goto out; - - if (!list) { - ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1, - GFP_KERNEL); - if (ret < 0) { + if (ret >= 0 && !list) { + ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); + if (ret < 0) __io_remove_buffers(ctx, head, p->bgid, -1U); - goto out; - } } -out: if (ret < 0) req_set_fail_links(req); @@ -8333,19 +8326,13 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } -static int __io_destroy_buffers(int id, void *p, void *data) -{ - struct io_ring_ctx *ctx = data; - struct io_buffer *buf = p; - - __io_remove_buffers(ctx, buf, id, -1U); - return 0; -} - static void io_destroy_buffers(struct io_ring_ctx *ctx) { - idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx); - idr_destroy(&ctx->io_buffer_idr); + struct io_buffer *buf; + unsigned long index; + + xa_for_each(&ctx->io_buffers, index, buf) + __io_remove_buffers(ctx, buf, index, -1U); } static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) From efe814a471e0e58f28f1efaf430c8784a4f36626 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:08 +0000 Subject: [PATCH 04/14] io_uring: fix ->flags races by linked timeouts It's racy to modify req->flags from a not owning context, e.g. linked timeout calling req_set_fail_links() for the master request might race with that request setting/clearing flags while being executed concurrently. Just remove req_set_fail_links(prev) from io_link_timeout_fn(), io_async_find_and_cancel() and functions down the line take care of setting the fail bit. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 58d62dd9f8e4..217f72d50ff5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6197,7 +6197,6 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - req_set_fail_links(prev); io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req_deferred(prev, 1); } else { From 180f829fe4026bd192447d261e712b6cb84f6202 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:09 +0000 Subject: [PATCH 05/14] io_uring: fix complete_post use ctx after free If io_req_complete_post() put not a final ref, we can't rely on the request's ctx ref, and so ctx may potentially be freed while complete_post() is in io_cqring_ev_posted()/etc. In that case get an additional ctx reference, and put it in the end, so protecting following io_cqring_ev_posted(). And also prolong ctx lifetime until spin_unlock happens, as we do with mutexes, so added percpu_ref_get() doesn't race with ctx free. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 217f72d50ff5..f8a683607b25 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1550,14 +1550,17 @@ static void io_req_complete_post(struct io_kiocb *req, long res, io_put_task(req->task, 1); list_add(&req->compl.list, &cs->locked_free_list); cs->locked_free_nr++; - } else - req = NULL; + } else { + if (!percpu_ref_tryget(&ctx->refs)) + req = NULL; + } io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); - if (req) + if (req) { + io_cqring_ev_posted(ctx); percpu_ref_put(&ctx->refs); + } } static void io_req_complete_state(struct io_kiocb *req, long res, @@ -8373,11 +8376,13 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) { /* * Some may use context even when all refs and requests have been put, - * and they are free to do so while still holding uring_lock, see - * __io_req_task_submit(). Wait for them to finish. + * and they are free to do so while still holding uring_lock or + * completion_lock, see __io_req_task_submit(). Wait for them to finish. */ mutex_lock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock); + spin_lock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->completion_lock); io_sq_thread_finish(ctx); io_sqe_buffers_unregister(ctx); From 09a6f4efaa6536e760385f949e24078fd78305ad Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:10 +0000 Subject: [PATCH 06/14] io_uring: replace sqd rw_semaphore with mutex The only user of read-locking of sqd->rw_lock is sq_thread itself, which is by definition alone, so we don't really need rw_semaphore, but mutex will do. Replace it with a mutex, and kill read-to-write upgrading and extra task_work handling in io_sq_thread(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f8a683607b25..6de779aafd33 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -258,7 +258,7 @@ enum { struct io_sq_data { refcount_t refs; - struct rw_semaphore rw_lock; + struct mutex lock; /* ctx's that are using this sqd */ struct list_head ctx_list; @@ -6689,16 +6689,15 @@ static int io_sq_thread(void *data) set_cpus_allowed_ptr(current, cpu_online_mask); current->flags |= PF_NO_SETAFFINITY; - down_read(&sqd->rw_lock); - + mutex_lock(&sqd->lock); while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { int ret; bool cap_entries, sqt_spin, needs_sched; if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { - up_read(&sqd->rw_lock); + mutex_unlock(&sqd->lock); cond_resched(); - down_read(&sqd->rw_lock); + mutex_lock(&sqd->lock); io_run_task_work(); timeout = jiffies + sqd->sq_thread_idle; continue; @@ -6745,10 +6744,10 @@ static int io_sq_thread(void *data) list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - up_read(&sqd->rw_lock); + mutex_unlock(&sqd->lock); schedule(); try_to_freeze(); - down_read(&sqd->rw_lock); + mutex_lock(&sqd->lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } @@ -6756,20 +6755,13 @@ static int io_sq_thread(void *data) finish_wait(&sqd->wait, &wait); timeout = jiffies + sqd->sq_thread_idle; } - up_read(&sqd->rw_lock); - down_write(&sqd->rw_lock); - /* - * someone may have parked and added a cancellation task_work, run - * it first because we don't want it in io_uring_cancel_sqpoll() - */ - io_run_task_work(); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_uring_cancel_sqpoll(ctx); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - up_write(&sqd->rw_lock); + mutex_unlock(&sqd->lock); io_run_task_work(); complete(&sqd->exited); @@ -7071,21 +7063,21 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) } static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->rw_lock) + __releases(&sqd->lock) { WARN_ON_ONCE(sqd->thread == current); clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - up_write(&sqd->rw_lock); + mutex_unlock(&sqd->lock); } static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->rw_lock) + __acquires(&sqd->lock) { WARN_ON_ONCE(sqd->thread == current); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - down_write(&sqd->rw_lock); + mutex_lock(&sqd->lock); /* set again for consistency, in case concurrent parks are happening */ set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); if (sqd->thread) @@ -7096,11 +7088,11 @@ static void io_sq_thread_stop(struct io_sq_data *sqd) { WARN_ON_ONCE(sqd->thread == current); - down_write(&sqd->rw_lock); + mutex_lock(&sqd->lock); set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); if (sqd->thread) wake_up_process(sqd->thread); - up_write(&sqd->rw_lock); + mutex_unlock(&sqd->lock); wait_for_completion(&sqd->exited); } @@ -7182,7 +7174,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, refcount_set(&sqd->refs, 1); INIT_LIST_HEAD(&sqd->ctx_list); - init_rwsem(&sqd->rw_lock); + mutex_init(&sqd->lock); init_waitqueue_head(&sqd->wait); init_completion(&sqd->exited); return sqd; From f6d54255f4235448d4bbe442362d4caa62da97d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:11 +0000 Subject: [PATCH 07/14] io_uring: halt SQO submission on ctx exit io_sq_thread_finish() is called in io_ring_ctx_free(), so SQPOLL task is potentially running submitting new requests. It's not a disaster because of using a "try" variant of percpu_ref_get, but is far from nice. Remove ctx from the sqd ctx list earlier, before cancellation loop, so SQPOLL can't find it and so won't submit new requests. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6de779aafd33..b87012a21775 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8564,6 +8564,14 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); + /* prevent SQPOLL from submitting new requests */ + if (ctx->sq_data) { + io_sq_thread_park(ctx->sq_data); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(ctx->sq_data); + io_sq_thread_unpark(ctx->sq_data); + } + io_kill_timeouts(ctx, NULL, NULL); io_poll_remove_all(ctx, NULL, NULL); From 9e138a48345427fa42f6076396ea069cebf3c08f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:12 +0000 Subject: [PATCH 08/14] io_uring: fix concurrent parking If io_sq_thread_park() of one task got rescheduled right after set_bit(), before it gets back to mutex_lock() there can happen park()/unpark() by another task with SQPOLL locking again and continuing running never seeing that first set_bit(SHOULD_PARK), so won't even try to put the mutex down for parking. It will get parked eventually when SQPOLL drops the lock for reschedule, but may be problematic and will get in the way of further fixes. Account number of tasks waiting for parking with a new atomic variable park_pending and adjust SHOULD_PARK accordingly. It doesn't entirely replaces SHOULD_PARK bit with this atomic var because it's convenient to have it as a bit in the state and will help to do optimisations later. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b87012a21775..70ceb8ed5950 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -258,6 +258,7 @@ enum { struct io_sq_data { refcount_t refs; + atomic_t park_pending; struct mutex lock; /* ctx's that are using this sqd */ @@ -7067,7 +7068,13 @@ static void io_sq_thread_unpark(struct io_sq_data *sqd) { WARN_ON_ONCE(sqd->thread == current); + /* + * Do the dance but not conditional clear_bit() because it'd race with + * other threads incrementing park_pending and setting the bit. + */ clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + if (atomic_dec_return(&sqd->park_pending)) + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_unlock(&sqd->lock); } @@ -7076,10 +7083,9 @@ static void io_sq_thread_park(struct io_sq_data *sqd) { WARN_ON_ONCE(sqd->thread == current); + atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_lock(&sqd->lock); - /* set again for consistency, in case concurrent parks are happening */ - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); if (sqd->thread) wake_up_process(sqd->thread); } @@ -7099,6 +7105,8 @@ static void io_sq_thread_stop(struct io_sq_data *sqd) static void io_put_sq_data(struct io_sq_data *sqd) { if (refcount_dec_and_test(&sqd->refs)) { + WARN_ON_ONCE(atomic_read(&sqd->park_pending)); + io_sq_thread_stop(sqd); kfree(sqd); } @@ -7172,6 +7180,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, if (!sqd) return ERR_PTR(-ENOMEM); + atomic_set(&sqd->park_pending, 0); refcount_set(&sqd->refs, 1); INIT_LIST_HEAD(&sqd->ctx_list); mutex_init(&sqd->lock); From 9b46571142e47503ed4f3ae3be5ed3968d8cb9cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Mar 2021 14:23:07 +0000 Subject: [PATCH 09/14] io_uring: add generic callback_head helpers We already have helpers to run/add callback_head but taking ctx and working with ctx->exit_task_work. Extract generic versions of them implemented in terms of struct callback_head, it will be used later. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 62 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 70ceb8ed5950..5c2de43b99f5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1929,17 +1929,44 @@ static int io_req_task_work_add(struct io_kiocb *req) return ret; } +static bool io_run_task_work_head(struct callback_head **work_head) +{ + struct callback_head *work, *next; + bool executed = false; + + do { + work = xchg(work_head, NULL); + if (!work) + break; + + do { + next = work->next; + work->func(work); + work = next; + cond_resched(); + } while (work); + executed = true; + } while (1); + + return executed; +} + +static void io_task_work_add_head(struct callback_head **work_head, + struct callback_head *task_work) +{ + struct callback_head *head; + + do { + head = READ_ONCE(*work_head); + task_work->next = head; + } while (cmpxchg(work_head, head, task_work) != head); +} + static void io_req_task_work_add_fallback(struct io_kiocb *req, task_work_func_t cb) { - struct io_ring_ctx *ctx = req->ctx; - struct callback_head *head; - init_task_work(&req->task_work, cb); - do { - head = READ_ONCE(ctx->exit_task_work); - req->task_work.next = head; - } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head); + io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work); } static void __io_req_task_cancel(struct io_kiocb *req, int error) @@ -8471,26 +8498,9 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } -static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) +static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx) { - struct callback_head *work, *next; - bool executed = false; - - do { - work = xchg(&ctx->exit_task_work, NULL); - if (!work) - break; - - do { - next = work->next; - work->func(work); - work = next; - cond_resched(); - } while (work); - executed = true; - } while (1); - - return executed; + return io_run_task_work_head(&ctx->exit_task_work); } struct io_tctx_exit { From b7f5a0bfe2061b2c7b2164de06fa4072d7373a45 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Mar 2021 14:23:08 +0000 Subject: [PATCH 10/14] io_uring: fix sqpoll cancellation via task_work Running sqpoll cancellations via task_work_run() is a bad idea because it depends on other task works to be run, but those may be locked in currently running task_work_run() because of how it's (splicing the list in batches). Enqueue and run them through a separate callback head, namely struct io_sq_data::park_task_work. As a nice bonus we now precisely control where it's run, that's much safer than guessing where it can happen as it was before. Reported-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5c2de43b99f5..5f4e312111ea 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -274,6 +274,7 @@ struct io_sq_data { unsigned long state; struct completion exited; + struct callback_head *park_task_work; }; #define IO_IOPOLL_BATCH 8 @@ -6727,6 +6728,7 @@ static int io_sq_thread(void *data) cond_resched(); mutex_lock(&sqd->lock); io_run_task_work(); + io_run_task_work_head(&sqd->park_task_work); timeout = jiffies + sqd->sq_thread_idle; continue; } @@ -6781,6 +6783,7 @@ static int io_sq_thread(void *data) } finish_wait(&sqd->wait, &wait); + io_run_task_work_head(&sqd->park_task_work); timeout = jiffies + sqd->sq_thread_idle; } @@ -6792,6 +6795,7 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); io_run_task_work(); + io_run_task_work_head(&sqd->park_task_work); complete(&sqd->exited); do_exit(0); } @@ -8890,7 +8894,7 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx) if (task) { init_completion(&work.completion); init_task_work(&work.task_work, io_sqpoll_cancel_cb); - WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL)); + io_task_work_add_head(&sqd->park_task_work, &work.task_work); wake_up_process(task); } io_sq_thread_unpark(sqd); From 76cd979f4f38a27df22efb5773a0d567181a9392 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 16 Mar 2021 16:33:27 +0100 Subject: [PATCH 11/14] io_uring: imply MSG_NOSIGNAL for send[msg]()/recv[msg]() calls We never want to generate any SIGPIPE, -EPIPE only is much better. Signed-off-by: Stefan Metzmacher Link: https://lore.kernel.org/r/38961085c3ec49fd21550c7788f214d1ff02d2d4.1615908477.git.metze@samba.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5f4e312111ea..a81f7a30ea70 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4384,7 +4384,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - flags = req->sr_msg.msg_flags; + flags = req->sr_msg.msg_flags | MSG_NOSIGNAL; if (flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; else if (issue_flags & IO_URING_F_NONBLOCK) @@ -4428,7 +4428,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; + flags = req->sr_msg.msg_flags | MSG_NOSIGNAL; if (flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; else if (issue_flags & IO_URING_F_NONBLOCK) @@ -4618,7 +4618,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 1, req->sr_msg.len); } - flags = req->sr_msg.msg_flags; + flags = req->sr_msg.msg_flags | MSG_NOSIGNAL; if (flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; else if (force_nonblock) @@ -4677,7 +4677,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) msg.msg_iocb = NULL; msg.msg_flags = 0; - flags = req->sr_msg.msg_flags; + flags = req->sr_msg.msg_flags | MSG_NOSIGNAL; if (flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; else if (force_nonblock) From 53e043b2b432ef2294efec04dd8a88d96c024624 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 15 Mar 2021 12:56:56 +0100 Subject: [PATCH 12/14] io_uring: remove structures from include/linux/io_uring.h Link: https://lore.kernel.org/r/8c1d14f3748105f4caeda01716d47af2fa41d11c.1615809009.git.metze@samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io-wq.h | 10 +++++++++- fs/io_uring.c | 16 ++++++++++++++++ include/linux/io_uring.h | 25 ------------------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/io-wq.h b/fs/io-wq.h index 1ac2f3248088..80d590564ff9 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -2,7 +2,6 @@ #define INTERNAL_IO_WQ_H #include -#include struct io_wq; @@ -21,6 +20,15 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; +struct io_wq_work_node { + struct io_wq_work_node *next; +}; + +struct io_wq_work_list { + struct io_wq_work_node *first; + struct io_wq_work_node *last; +}; + static inline void wq_list_add_after(struct io_wq_work_node *node, struct io_wq_work_node *pos, struct io_wq_work_list *list) diff --git a/fs/io_uring.c b/fs/io_uring.c index a81f7a30ea70..52ba8d7f3eb8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -456,6 +456,22 @@ struct io_ring_ctx { struct list_head tctx_list; }; +struct io_uring_task { + /* submission side */ + struct xarray xa; + struct wait_queue_head wait; + void *last; + void *io_wq; + struct percpu_counter inflight; + atomic_t in_idle; + bool sqpoll; + + spinlock_t task_lock; + struct io_wq_work_list task_list; + unsigned long task_state; + struct callback_head task_work; +}; + /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 9761a0ec9f95..79cde9906be0 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -5,31 +5,6 @@ #include #include -struct io_wq_work_node { - struct io_wq_work_node *next; -}; - -struct io_wq_work_list { - struct io_wq_work_node *first; - struct io_wq_work_node *last; -}; - -struct io_uring_task { - /* submission side */ - struct xarray xa; - struct wait_queue_head wait; - void *last; - void *io_wq; - struct percpu_counter inflight; - atomic_t in_idle; - bool sqpoll; - - spinlock_t task_lock; - struct io_wq_work_list task_list; - unsigned long task_state; - struct callback_head task_work; -}; - #if defined(CONFIG_IO_URING) struct sock *io_uring_get_socket(struct file *file); void __io_uring_task_cancel(void); From ee53fb2b197b72b126ca0387ae636da75d969428 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 15 Mar 2021 12:56:57 +0100 Subject: [PATCH 13/14] io_uring: use typesafe pointers in io_uring_task Signed-off-by: Stefan Metzmacher Link: https://lore.kernel.org/r/ce2a598e66e48347bb04afbaf2acc67c0cc7971a.1615809009.git.metze@samba.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 52ba8d7f3eb8..675073966760 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -460,8 +460,8 @@ struct io_uring_task { /* submission side */ struct xarray xa; struct wait_queue_head wait; - void *last; - void *io_wq; + const struct io_ring_ctx *last; + struct io_wq *io_wq; struct percpu_counter inflight; atomic_t in_idle; bool sqpoll; From de75a3d3f5a14c9ab3c4883de3471d3c92a8ee78 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 18 Mar 2021 11:54:35 +0000 Subject: [PATCH 14/14] io_uring: don't leak creds on SQO attach error Attaching to already dead/dying SQPOLL task is disallowed in io_sq_offload_create(), but cleanup is hand coded by calling io_put_sq_data()/etc., that miss to put ctx->sq_creds. Defer everything to error-path io_sq_thread_finish(), adding ctx->sqd_list in the error case as well as finish will handle it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 675073966760..ea2d3e120555 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7910,22 +7910,17 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ret = 0; io_sq_thread_park(sqd); + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); /* don't attach to a dying SQPOLL thread, would be racy */ - if (attached && !sqd->thread) { + if (attached && !sqd->thread) ret = -ENXIO; - } else { - list_add(&ctx->sqd_list, &sqd->ctx_list); - io_sqd_update_thread_idle(sqd); - } io_sq_thread_unpark(sqd); - if (ret < 0) { - io_put_sq_data(sqd); - ctx->sq_data = NULL; - return ret; - } else if (attached) { + if (ret < 0) + goto err; + if (attached) return 0; - } if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu;