From 035fbafc7a54b8c7755b3c508b8f3ab6ff3c8d65 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 12 Oct 2020 15:03:41 +0100 Subject: [PATCH 01/21] io_uring: Fix sizeof() mismatch An incorrect sizeof() is being used, sizeof(file_data->table) is not correct, it should be sizeof(*file_data->table). Fixes: 5398ae698525 ("io_uring: clean file_data access in files_register") Signed-off-by: Colin Ian King Addresses-Coverity: ("Sizeof not portable (SIZEOF_MISMATCH)") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2e1dc354cd08..717dd5d38d75 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7306,7 +7306,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, spin_lock_init(&file_data->lock); nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - file_data->table = kcalloc(nr_tables, sizeof(file_data->table), + file_data->table = kcalloc(nr_tables, sizeof(*file_data->table), GFP_KERNEL); if (!file_data->table) goto out_free; From 368c5481ae7c6a9719c40984faea35480d9f4872 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 Oct 2020 09:43:56 +0100 Subject: [PATCH 02/21] io_uring: don't set COMP_LOCKED if won't put __io_kill_linked_timeout() sets REQ_F_COMP_LOCKED for a linked timeout even if it can't cancel it, e.g. it's already running. It not only races with io_link_timeout_fn() for ->flags field, but also leaves the flag set and so io_link_timeout_fn() may find it and decide that it holds the lock. Hopefully, the second problem is potential. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 717dd5d38d75..f0f4b5b5c2a0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1769,6 +1769,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { + req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); req->flags &= ~REQ_F_LINK_HEAD; @@ -1791,7 +1792,6 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req) return false; list_del_init(&link->link_list); - link->flags |= REQ_F_COMP_LOCKED; wake_ev = io_link_cancel_timeout(link); req->flags &= ~REQ_F_LINK_TIMEOUT; return wake_ev; From b1b74cfc1967bd0747ff85f650f598e84eeb3d1c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 Oct 2020 09:43:57 +0100 Subject: [PATCH 03/21] io_uring: don't unnecessarily clear F_LINK_TIMEOUT If a request had REQ_F_LINK_TIMEOUT it would've been cleared in __io_kill_linked_timeout() by the time of __io_fail_links(), so no need to care about it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f0f4b5b5c2a0..ca9be31b76b3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1852,7 +1852,6 @@ static void __io_fail_links(struct io_kiocb *req) io_cqring_fill_event(link, -ECANCELED); link->flags |= REQ_F_COMP_LOCKED; __io_double_put_req(link); - req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); From 6a0af224c21309f24dbb1b79d0744b255d7156a0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 Oct 2020 09:43:58 +0100 Subject: [PATCH 04/21] io_uring: don't put a poll req under spinlock Move io_put_req() in io_poll_task_handler() from under spinlock. 
This eliminates the need to use REQ_F_COMP_LOCKED, at the expense of potentially having to grab the lock again. That's still a better trade off than relying on the locked flag. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ca9be31b76b3..92546f90defd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4844,10 +4844,9 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); - req->flags |= REQ_F_COMP_LOCKED; - *nxt = io_put_req_find_next(req); spin_unlock_irq(&ctx->completion_lock); + *nxt = io_put_req_find_next(req); io_cqring_ev_posted(ctx); } From 4edf20f9990230e9b85e79954d5cd28fc93616e9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 Oct 2020 09:43:59 +0100 Subject: [PATCH 05/21] io_uring: dig out COMP_LOCK from deep call chain io_req_clean_work() checks REQ_F_COMP_LOCK to pass this two layers up. Move the check up into __io_free_req(), so at least it doesn't looks so ugly and would facilitate further changes. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 92546f90defd..0680fa385353 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1181,14 +1181,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -/* - * Returns true if we need to defer file table putting. This can only happen - * from the error path with REQ_F_COMP_LOCKED set. - */ -static bool io_req_clean_work(struct io_kiocb *req) +static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) - return false; + return; req->flags &= ~REQ_F_WORK_INITIALIZED; @@ -1207,9 +1203,6 @@ static bool io_req_clean_work(struct io_kiocb *req) if (req->work.fs) { struct fs_struct *fs = req->work.fs; - if (req->flags & REQ_F_COMP_LOCKED) - return true; - spin_lock(&req->work.fs->lock); if (--fs->users) fs = NULL; @@ -1218,8 +1211,6 @@ static bool io_req_clean_work(struct io_kiocb *req) free_fs_struct(fs); req->work.fs = NULL; } - - return false; } static void io_prep_async_work(struct io_kiocb *req) @@ -1699,7 +1690,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, fput(file); } -static bool io_dismantle_req(struct io_kiocb *req) +static void io_dismantle_req(struct io_kiocb *req) { io_clean_op(req); @@ -1708,7 +1699,7 @@ static bool io_dismantle_req(struct io_kiocb *req) if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - return io_req_clean_work(req); + io_req_clean_work(req); } static void __io_free_req_finish(struct io_kiocb *req) @@ -1731,21 +1722,15 @@ static void __io_free_req_finish(struct io_kiocb *req) static void io_req_task_file_table_put(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct fs_struct *fs = req->work.fs; - spin_lock(&req->work.fs->lock); - if (--fs->users) - fs = NULL; - spin_unlock(&req->work.fs->lock); - if (fs) - free_fs_struct(fs); - req->work.fs = NULL; + io_dismantle_req(req); __io_free_req_finish(req); } static void __io_free_req(struct io_kiocb *req) { - if (!io_dismantle_req(req)) { + if (!(req->flags & REQ_F_COMP_LOCKED)) { + io_dismantle_req(req); __io_free_req_finish(req); } else { int ret; @@ -2057,7 +2042,7 @@ static void io_req_free_batch(struct req_batch *rb, 
struct io_kiocb *req) } rb->task_refs++; - WARN_ON_ONCE(io_dismantle_req(req)); + io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) __io_req_free_batch_flush(req->ctx, rb); From 216578e55ac932cf5e348d9e65d8e129fc9e34cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 Oct 2020 09:44:00 +0100 Subject: [PATCH 06/21] io_uring: fix REQ_F_COMP_LOCKED by killing it REQ_F_COMP_LOCKED is used and implemented in a buggy way. The problem is that the flag is set before io_put_req() but not cleared after, and if that wasn't the final reference, the request will be freed with the flag set from some other context, which may not hold a spinlock. That means possible races with removing linked timeouts and unsynchronised completion (e.g. access to CQ). Instead of fixing REQ_F_COMP_LOCKED, kill the flag and use task_work_add() to move such requests to a fresh context to free from it, as was done with __io_free_req_finish(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 149 +++++++++++++++++++++++--------------------------- 1 file changed, 69 insertions(+), 80 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0680fa385353..641d869d96ee 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -574,7 +574,6 @@ enum { REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, @@ -613,8 +612,6 @@ enum { REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* completion under lock */ - REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ @@ -963,8 +960,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, struct io_comp_state *cs); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); +static void io_put_req_deferred(struct io_kiocb *req, int nr); static void io_double_put_req(struct io_kiocb *req); -static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void __io_queue_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); @@ -1316,9 +1313,8 @@ static void io_kill_timeout(struct io_kiocb *req) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); - io_put_req(req); + io_put_req_deferred(req, 1); } } @@ -1369,8 +1365,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) if (link) { __io_queue_linked_timeout(link); /* drop submission reference */ - link->flags |= REQ_F_COMP_LOCKED; - io_put_req(link); + io_put_req_deferred(link, 1); } kfree(de); } while (!list_empty(&ctx->defer_list)); @@ -1597,13 +1592,19 @@ static void io_submit_flush_completions(struct io_comp_state *cs) req = list_first_entry(&cs->list, struct io_kiocb, compl.list); list_del(&req->compl.list); __io_cqring_fill_event(req, req->result, req->compl.cflags); - if (!(req->flags & REQ_F_LINK_HEAD)) { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - } else { + + /* + * io_free_req() doesn't care about completion_lock unless one + * of these flags is set. 
REQ_F_WORK_INITIALIZED is in the list + * because of a potential deadlock with req->work.fs->lock + */ + if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT + |REQ_F_WORK_INITIALIZED)) { spin_unlock_irq(&ctx->completion_lock); io_put_req(req); spin_lock_irq(&ctx->completion_lock); + } else { + io_put_req(req); } } io_commit_cqring(ctx); @@ -1702,10 +1703,14 @@ static void io_dismantle_req(struct io_kiocb *req) io_req_clean_work(req); } -static void __io_free_req_finish(struct io_kiocb *req) +static void __io_free_req(struct io_kiocb *req) { - struct io_uring_task *tctx = req->task->io_uring; - struct io_ring_ctx *ctx = req->ctx; + struct io_uring_task *tctx; + struct io_ring_ctx *ctx; + + io_dismantle_req(req); + tctx = req->task->io_uring; + ctx = req->ctx; atomic_long_inc(&tctx->req_complete); if (tctx->in_idle) @@ -1719,33 +1724,6 @@ static void __io_free_req_finish(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } -static void io_req_task_file_table_put(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - - io_dismantle_req(req); - __io_free_req_finish(req); -} - -static void __io_free_req(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_COMP_LOCKED)) { - io_dismantle_req(req); - __io_free_req_finish(req); - } else { - int ret; - - init_task_work(&req->task_work, io_req_task_file_table_put); - ret = task_work_add(req->task, &req->task_work, TWA_RESUME); - if (unlikely(ret)) { - struct task_struct *tsk; - - tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); - } - } -} - static bool io_link_cancel_timeout(struct io_kiocb *req) { struct io_timeout_data *io = req->async_data; @@ -1754,11 +1732,10 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { - req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); req->flags &= ~REQ_F_LINK_HEAD; - io_put_req(req); + io_put_req_deferred(req, 1); return true; } @@ -1785,17 +1762,12 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req) static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; bool wake_ev; - if (!(req->flags & REQ_F_COMP_LOCKED)) { - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - wake_ev = __io_kill_linked_timeout(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else { - wake_ev = __io_kill_linked_timeout(req); - } + spin_lock_irqsave(&ctx->completion_lock, flags); + wake_ev = __io_kill_linked_timeout(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); if (wake_ev) io_cqring_ev_posted(ctx); @@ -1835,27 +1807,29 @@ static void __io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); - link->flags |= REQ_F_COMP_LOCKED; - __io_double_put_req(link); + + /* + * It's ok to free under spinlock as they're not linked anymore, + * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on + * work.fs->lock. 
+ */ + if (link->flags & REQ_F_WORK_INITIALIZED) + io_put_req_deferred(link, 2); + else + io_double_put_req(link); } io_commit_cqring(ctx); - io_cqring_ev_posted(ctx); } static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; - if (!(req->flags & REQ_F_COMP_LOCKED)) { - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - __io_fail_links(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else { - __io_fail_links(req); - } + spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } @@ -2069,6 +2043,34 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } +static void io_put_req_deferred_cb(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + io_free_req(req); +} + +static void io_free_req_deferred(struct io_kiocb *req) +{ + int ret; + + init_task_work(&req->task_work, io_put_req_deferred_cb); + ret = io_req_task_work_add(req, true); + if (unlikely(ret)) { + struct task_struct *tsk; + + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); + } +} + +static inline void io_put_req_deferred(struct io_kiocb *req, int refs) +{ + if (refcount_sub_and_test(refs, &req->refs)) + io_free_req_deferred(req); +} + static struct io_wq_work *io_steal_work(struct io_kiocb *req) { struct io_kiocb *nxt; @@ -2085,17 +2087,6 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) return nxt ? &nxt->work : NULL; } -/* - * Must only be used if we don't need to care about links, usually from - * within the completion handling itself. - */ -static void __io_double_put_req(struct io_kiocb *req) -{ - /* drop both submit and complete references */ - if (refcount_sub_and_test(2, &req->refs)) - __io_free_req(req); -} - static void io_double_put_req(struct io_kiocb *req) { /* drop both submit and complete references */ @@ -5127,9 +5118,8 @@ static bool io_poll_remove_one(struct io_kiocb *req) if (do_complete) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); - req->flags |= REQ_F_COMP_LOCKED; req_set_fail_links(req); - io_put_req(req); + io_put_req_deferred(req, 1); } return do_complete; @@ -5311,9 +5301,8 @@ static int __io_timeout_cancel(struct io_kiocb *req) list_del_init(&req->timeout.list); req_set_fail_links(req); - req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, -ECANCELED); - io_put_req(req); + io_put_req_deferred(req, 1); return 0; } From 0918682be432b85ccd49285832221d9b65831ef5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Oct 2020 15:01:40 -0600 Subject: [PATCH 07/21] Revert "io_uring: mark io_uring_fops/io_op_defs as __read_mostly" This reverts commit 738277adc81929b3e7c9b63fec6693868cc5f931. This change didn't make a lot of sense, and as Linus reports, it actually fails on clang: /tmp/io_uring-dd40c4.s:26476: Warning: ignoring changed section attributes for .data..read_mostly The arrays are already marked const so, by definition, they are not just read-mostly, they are read-only. 
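For context, __read_mostly expands (roughly; see include/linux/cache.h and the per-arch cache headers, the exact spelling varies) to a section attribute along the lines of:

    /* approximate definition, shown for illustration only */
    #define __read_mostly	__section(".data..read_mostly")

A const object is already emitted into a read-only section, so tagging it __read_mostly asks the compiler to place read-only data into the writable .data..read_mostly section, which is the section-attribute mismatch clang warns about above.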
Reported-by: Linus Torvalds Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 641d869d96ee..94a66a6d1cba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -760,7 +760,7 @@ struct io_op_def { unsigned short async_size; }; -static const struct io_op_def io_op_defs[] __read_mostly = { +static const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = {}, [IORING_OP_READV] = { .needs_mm = 1, @@ -983,7 +983,7 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, static struct kmem_cache *req_cachep; -static const struct file_operations io_uring_fops __read_mostly; +static const struct file_operations io_uring_fops; struct sock *io_uring_get_socket(struct file *file) { From 55cbc2564ab2fd555ec0fc39311a9cfb811d7da5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Oct 2020 07:35:57 -0600 Subject: [PATCH 08/21] io_uring: fix error path cleanup in io_sqe_files_register() syzbot reports the following crash: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 PID: 8927 Comm: syz-executor.3 Not tainted 5.9.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:io_file_from_index fs/io_uring.c:5963 [inline] RIP: 0010:io_sqe_files_register fs/io_uring.c:7369 [inline] RIP: 0010:__io_uring_register fs/io_uring.c:9463 [inline] RIP: 0010:__do_sys_io_uring_register+0x2fd2/0x3ee0 fs/io_uring.c:9553 Code: ec 03 49 c1 ee 03 49 01 ec 49 01 ee e8 57 61 9c ff 41 80 3c 24 00 0f 85 9b 09 00 00 4d 8b af b8 01 00 00 4c 89 e8 48 c1 e8 03 <80> 3c 28 00 0f 85 76 09 00 00 49 8b 55 00 89 d8 c1 f8 09 48 98 4c RSP: 0018:ffffc90009137d68 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffc9000ef2a000 RDX: 0000000000040000 RSI: ffffffff81d81dd9 RDI: 0000000000000005 RBP: dffffc0000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffed1012882a37 R13: 0000000000000000 R14: ffffed1012882a38 R15: ffff888094415000 FS: 00007f4266f3c700(0000) GS:ffff8880ae500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000118c000 CR3: 000000008e57d000 CR4: 00000000001506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45de59 Code: 0d b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f4266f3bc78 EFLAGS: 00000246 ORIG_RAX: 00000000000001ab RAX: ffffffffffffffda RBX: 00000000000083c0 RCX: 000000000045de59 RDX: 0000000020000280 RSI: 0000000000000002 RDI: 0000000000000005 RBP: 000000000118bf68 R08: 0000000000000000 R09: 0000000000000000 R10: 40000000000000a1 R11: 0000000000000246 R12: 000000000118bf2c R13: 00007fff2fa4f12f R14: 00007f4266f3c9c0 R15: 000000000118bf2c Modules linked in: ---[ end trace 2a40a195e2d5e6e6 ]--- RIP: 0010:io_file_from_index fs/io_uring.c:5963 [inline] RIP: 0010:io_sqe_files_register fs/io_uring.c:7369 [inline] RIP: 0010:__io_uring_register fs/io_uring.c:9463 [inline] RIP: 0010:__do_sys_io_uring_register+0x2fd2/0x3ee0 fs/io_uring.c:9553 Code: 
ec 03 49 c1 ee 03 49 01 ec 49 01 ee e8 57 61 9c ff 41 80 3c 24 00 0f 85 9b 09 00 00 4d 8b af b8 01 00 00 4c 89 e8 48 c1 e8 03 <80> 3c 28 00 0f 85 76 09 00 00 49 8b 55 00 89 d8 c1 f8 09 48 98 4c RSP: 0018:ffffc90009137d68 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffc9000ef2a000 RDX: 0000000000040000 RSI: ffffffff81d81dd9 RDI: 0000000000000005 RBP: dffffc0000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffed1012882a37 R13: 0000000000000000 R14: ffffed1012882a38 R15: ffff888094415000 FS: 00007f4266f3c700(0000) GS:ffff8880ae400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000074a918 CR3: 000000008e57d000 CR4: 00000000001506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 which is a copy of fget failure condition jumping to cleanup, but the cleanup requires ctx->file_data to be assigned. Assign it when setup, and ensure that we clear it again for the error path exit. Fixes: 5398ae698525 ("io_uring: clean file_data access in files_register") Reported-by: syzbot+f4ebcc98223dafd8991e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 94a66a6d1cba..21c6ce15b751 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7289,6 +7289,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args)) goto out_ref; + ctx->file_data = file_data; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct fixed_file_table *table; @@ -7323,7 +7324,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, table->files[index] = file; } - ctx->file_data = file_data; ret = io_sqe_files_scm(ctx); if (ret) { io_sqe_files_unregister(ctx); @@ -7356,6 +7356,7 @@ out_ref: out_free: kfree(file_data->table); kfree(file_data); + ctx->file_data = NULL; return ret; } From a8b595b22d31f83b715511f59012f152a269d83b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 10:13:07 -0600 Subject: [PATCH 09/21] io-wq: assign NUMA node locality if appropriate There was an assumption that kthread_create_on_node() would properly set NUMA affinities in terms of CPUs allowed, but it doesn't. Make sure we do this when creating an io-wq context on NUMA. Cc: stable@vger.kernel.org Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io-wq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io-wq.c b/fs/io-wq.c index 0a182f1333e8..149fd2f0927e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -676,6 +676,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) kfree(worker); return false; } + kthread_bind_mask(worker->task, cpumask_of_node(wqe->node)); raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); From 0f203765880c4416675726be558b65da4a7604e2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Oct 2020 09:23:55 -0600 Subject: [PATCH 10/21] io_uring: pass required context in as flags We have a number of bits that decide what context to inherit. Set up io-wq flags for these instead. This is in preparation for always having the various members set, but not always needing them for all requests. No intended functional changes in this patch. 
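As a condensed illustration of the change (taken from the hunks below, not new code): the per-opcode booleans such as needs_mm, needs_fs and needs_blkcg become bits in a single ->work_flags mask, and io-wq only switches a given piece of context when the corresponding bit is set:

    /* fs/io-wq.h: new work flag bits */
    IO_WQ_WORK_FILES	= 32,
    IO_WQ_WORK_FS	= 64,
    IO_WQ_WORK_MM	= 128,
    IO_WQ_WORK_CREDS	= 256,
    IO_WQ_WORK_BLKCG	= 512,

    /* fs/io-wq.c: io_impersonate_work() now tests the bit first */
    if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->fs)
    	current->fs = work->fs;
    if ((work->flags & IO_WQ_WORK_MM) && work->mm != worker->mm)
    	io_wq_switch_mm(worker, work);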
Signed-off-by: Jens Axboe --- fs/io-wq.c | 10 +++-- fs/io-wq.h | 6 +++ fs/io_uring.c | 100 ++++++++++++++++++++------------------------------ 3 files changed, 52 insertions(+), 64 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 149fd2f0927e..e636898f8a1f 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -448,6 +448,8 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker, struct io_wq_work *work) { #ifdef CONFIG_BLK_CGROUP + if (!(work->flags & IO_WQ_WORK_BLKCG)) + return; if (work->blkcg_css != worker->blkcg_css) { kthread_associate_blkcg(work->blkcg_css); worker->blkcg_css = work->blkcg_css; @@ -470,17 +472,17 @@ static void io_wq_switch_creds(struct io_worker *worker, static void io_impersonate_work(struct io_worker *worker, struct io_wq_work *work) { - if (work->files && current->files != work->files) { + if ((work->flags & IO_WQ_WORK_FILES) && current->files != work->files) { task_lock(current); current->files = work->files; current->nsproxy = work->nsproxy; task_unlock(current); } - if (work->fs && current->fs != work->fs) + if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->fs) current->fs = work->fs; - if (work->mm != worker->mm) + if ((work->flags & IO_WQ_WORK_MM) && work->mm != worker->mm) io_wq_switch_mm(worker, work); - if (worker->cur_creds != work->creds) + if ((work->flags & IO_WQ_WORK_CREDS) && worker->cur_creds != work->creds) io_wq_switch_creds(worker, work); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; io_wq_switch_blkcg(worker, work); diff --git a/fs/io-wq.h b/fs/io-wq.h index 84bcf6a85523..31a29023605a 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -10,6 +10,12 @@ enum { IO_WQ_WORK_NO_CANCEL = 8, IO_WQ_WORK_CONCURRENT = 16, + IO_WQ_WORK_FILES = 32, + IO_WQ_WORK_FS = 64, + IO_WQ_WORK_MM = 128, + IO_WQ_WORK_CREDS = 256, + IO_WQ_WORK_BLKCG = 512, + IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; diff --git a/fs/io_uring.c b/fs/io_uring.c index 21c6ce15b751..04a4d8c44718 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -729,8 +729,6 @@ struct io_submit_state { }; struct io_op_def { - /* needs current->mm setup, does mm access */ - unsigned needs_mm : 1; /* needs req->file assigned */ unsigned needs_file : 1; /* don't fail if file grab fails */ @@ -741,10 +739,6 @@ struct io_op_def { unsigned unbound_nonreg_file : 1; /* opcode is not supported by this kernel */ unsigned not_supported : 1; - /* needs file table */ - unsigned file_table : 1; - /* needs ->fs */ - unsigned needs_fs : 1; /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; @@ -754,45 +748,42 @@ struct io_op_def { unsigned needs_fsize : 1; /* must always have async data allocated */ unsigned needs_async_data : 1; - /* needs blkcg context, issues async io potentially */ - unsigned needs_blkcg : 1; /* size of async data needed, if any */ unsigned short async_size; + unsigned work_flags; }; static const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = {}, [IORING_OP_READV] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .needs_async_data = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITEV] = { - .needs_mm = 1, .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, .needs_async_data = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_FSYNC] = { .needs_file = 1, - .needs_blkcg = 1, 
+ .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -800,8 +791,8 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -810,137 +801,123 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_SENDMSG] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, - .needs_fs = 1, .pollout = 1, .needs_async_data = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_msghdr), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_RECVMSG] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, - .needs_fs = 1, .pollin = 1, .buffer_select = 1, .needs_async_data = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_msghdr), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_TIMEOUT] = { - .needs_mm = 1, .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), + .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_TIMEOUT_REMOVE] = {}, [IORING_OP_ACCEPT] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, - .file_table = 1, .pollin = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { - .needs_mm = 1, .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), + .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_CONNECT] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_connect), + .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, .needs_fsize = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_OPENAT] = { - .file_table = 1, - .needs_fs = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_CLOSE] = { .needs_file = 1, .needs_file_no_error = 1, - .file_table = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG, }, [IORING_OP_FILES_UPDATE] = { - .needs_mm = 1, - .file_table = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM, }, [IORING_OP_STATX] = { - .needs_mm = 1, - .needs_fs = 1, - .file_table = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM | + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, }, [IORING_OP_READ] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITE] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, - .needs_blkcg = 1, .async_size = sizeof(struct io_async_rw), + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_FADVISE] = { .needs_file = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_MADVISE] = { - .needs_mm = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_SEND] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_blkcg = 1, + 
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_RECV] = { - .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_OPENAT2] = { - .file_table = 1, - .needs_fs = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | + IO_WQ_WORK_BLKCG, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, - .file_table = 1, + .work_flags = IO_WQ_WORK_FILES, }, [IORING_OP_SPLICE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, - .needs_blkcg = 1, + .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, @@ -1031,7 +1008,7 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, struct io_kiocb *req) { - if (!io_op_defs[req->opcode].needs_mm) + if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM)) return 0; return __io_sq_thread_acquire_mm(ctx); } @@ -1224,7 +1201,8 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - if (!req->work.files && io_op_defs[req->opcode].file_table && + if (!req->work.files && + (io_op_defs[req->opcode].work_flags & IO_WQ_WORK_FILES) && !(req->flags & REQ_F_NO_FILE_TABLE)) { req->work.files = get_files_struct(current); get_nsproxy(current->nsproxy); @@ -1235,12 +1213,12 @@ static void io_prep_async_work(struct io_kiocb *req) list_add(&req->inflight_entry, &ctx->inflight_list); spin_unlock_irq(&ctx->inflight_lock); } - if (!req->work.mm && def->needs_mm) { + if (!req->work.mm && (def->work_flags & IO_WQ_WORK_MM)) { mmgrab(current->mm); req->work.mm = current->mm; } #ifdef CONFIG_BLK_CGROUP - if (!req->work.blkcg_css && def->needs_blkcg) { + if (!req->work.blkcg_css && (def->work_flags & IO_WQ_WORK_BLKCG)) { rcu_read_lock(); req->work.blkcg_css = blkcg_css(); /* @@ -1254,7 +1232,7 @@ static void io_prep_async_work(struct io_kiocb *req) #endif if (!req->work.creds) req->work.creds = get_current_cred(); - if (!req->work.fs && def->needs_fs) { + if (!req->work.fs && (def->work_flags & IO_WQ_WORK_FS)) { spin_lock(¤t->fs->lock); if (!current->fs->in_exec) { req->work.fs = current->fs; @@ -1268,6 +1246,8 @@ static void io_prep_async_work(struct io_kiocb *req) req->work.fsize = rlimit(RLIMIT_FSIZE); else req->work.fsize = RLIM_INFINITY; + + req->work.flags |= def->work_flags; } static void io_prep_async_link(struct io_kiocb *req) From dfead8a8e2c494b947480bac90a6f9792f08bc12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Oct 2020 10:12:37 -0600 Subject: [PATCH 11/21] io_uring: rely solely on work flags to determine personality. We solely rely on work->work_flags now, so use that for proper checking and clearing/dropping of various identity items. 
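Concretely (condensed from the io_req_clean_work() hunk below), each inherited item is now dropped only when its flag bit shows the request owns a reference, and the bit is cleared once that reference is released:

    if (req->work.flags & IO_WQ_WORK_MM) {
    	mmdrop(req->work.mm);
    	req->work.flags &= ~IO_WQ_WORK_MM;
    }
    if (req->work.flags & IO_WQ_WORK_CREDS) {
    	put_cred(req->work.creds);
    	req->work.flags &= ~IO_WQ_WORK_CREDS;
    }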
Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ---- fs/io_uring.c | 55 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index e636898f8a1f..b7d8e544a804 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -429,14 +429,10 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) mmput(worker->mm); worker->mm = NULL; } - if (!work->mm) - return; if (mmget_not_zero(work->mm)) { kthread_use_mm(work->mm); worker->mm = work->mm; - /* hang on to this mm */ - work->mm = NULL; return; } diff --git a/fs/io_uring.c b/fs/io_uring.c index 04a4d8c44718..e92ed22ef924 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1162,19 +1162,21 @@ static void io_req_clean_work(struct io_kiocb *req) req->flags &= ~REQ_F_WORK_INITIALIZED; - if (req->work.mm) { + if (req->work.flags & IO_WQ_WORK_MM) { mmdrop(req->work.mm); - req->work.mm = NULL; + req->work.flags &= ~IO_WQ_WORK_MM; } #ifdef CONFIG_BLK_CGROUP - if (req->work.blkcg_css) + if (req->work.flags & IO_WQ_WORK_BLKCG) { css_put(req->work.blkcg_css); -#endif - if (req->work.creds) { - put_cred(req->work.creds); - req->work.creds = NULL; + req->work.flags &= ~IO_WQ_WORK_BLKCG; } - if (req->work.fs) { +#endif + if (req->work.flags & IO_WQ_WORK_CREDS) { + put_cred(req->work.creds); + req->work.flags &= ~IO_WQ_WORK_CREDS; + } + if (req->work.flags & IO_WQ_WORK_FS) { struct fs_struct *fs = req->work.fs; spin_lock(&req->work.fs->lock); @@ -1183,7 +1185,7 @@ static void io_req_clean_work(struct io_kiocb *req) spin_unlock(&req->work.fs->lock); if (fs) free_fs_struct(fs); - req->work.fs = NULL; + req->work.flags &= ~IO_WQ_WORK_FS; } } @@ -1201,7 +1203,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - if (!req->work.files && + if (!(req->work.flags & IO_WQ_WORK_FILES) && (io_op_defs[req->opcode].work_flags & IO_WQ_WORK_FILES) && !(req->flags & REQ_F_NO_FILE_TABLE)) { req->work.files = get_files_struct(current); @@ -1212,13 +1214,17 @@ static void io_prep_async_work(struct io_kiocb *req) spin_lock_irq(&ctx->inflight_lock); list_add(&req->inflight_entry, &ctx->inflight_list); spin_unlock_irq(&ctx->inflight_lock); + req->work.flags |= IO_WQ_WORK_FILES; } - if (!req->work.mm && (def->work_flags & IO_WQ_WORK_MM)) { + if (!(req->work.flags & IO_WQ_WORK_MM) && + (def->work_flags & IO_WQ_WORK_MM)) { mmgrab(current->mm); req->work.mm = current->mm; + req->work.flags |= IO_WQ_WORK_MM; } #ifdef CONFIG_BLK_CGROUP - if (!req->work.blkcg_css && (def->work_flags & IO_WQ_WORK_BLKCG)) { + if (!(req->work.flags & IO_WQ_WORK_BLKCG) && + (def->work_flags & IO_WQ_WORK_BLKCG)) { rcu_read_lock(); req->work.blkcg_css = blkcg_css(); /* @@ -1227,16 +1233,22 @@ static void io_prep_async_work(struct io_kiocb *req) */ if (!css_tryget_online(req->work.blkcg_css)) req->work.blkcg_css = NULL; + else + req->work.flags |= IO_WQ_WORK_BLKCG; rcu_read_unlock(); } #endif - if (!req->work.creds) + if (!(req->work.flags & IO_WQ_WORK_CREDS)) { req->work.creds = get_current_cred(); - if (!req->work.fs && (def->work_flags & IO_WQ_WORK_FS)) { + req->work.flags |= IO_WQ_WORK_CREDS; + } + if (!(req->work.flags & IO_WQ_WORK_FS) && + (def->work_flags & IO_WQ_WORK_FS)) { spin_lock(¤t->fs->lock); if (!current->fs->in_exec) { req->work.fs = current->fs; req->work.fs->users++; + req->work.flags |= IO_WQ_WORK_FS; } else { req->work.flags |= IO_WQ_WORK_CANCEL; } @@ -1246,8 +1258,6 @@ static void io_prep_async_work(struct io_kiocb *req) 
req->work.fsize = rlimit(RLIMIT_FSIZE); else req->work.fsize = RLIM_INFINITY; - - req->work.flags |= def->work_flags; } static void io_prep_async_link(struct io_kiocb *req) @@ -1437,7 +1447,8 @@ static inline bool io_match_files(struct io_kiocb *req, { if (!files) return true; - if (req->flags & REQ_F_WORK_INITIALIZED) + if ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_FILES)) return req->work.files == files; return false; } @@ -5694,7 +5705,7 @@ static void io_req_drop_files(struct io_kiocb *req) req->flags &= ~REQ_F_INFLIGHT; put_files_struct(req->work.files); put_nsproxy(req->work.nsproxy); - req->work.files = NULL; + req->work.flags &= ~IO_WQ_WORK_FILES; } static void __io_clean_op(struct io_kiocb *req) @@ -6060,6 +6071,7 @@ again: old_creds = NULL; /* restored original creds */ else old_creds = override_creds(req->work.creds); + req->work.flags |= IO_WQ_WORK_CREDS; } ret = io_issue_sqe(req, true, cs); @@ -6367,6 +6379,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(!req->work.creds)) return -EINVAL; get_cred(req->work.creds); + req->work.flags |= IO_WQ_WORK_CREDS; } /* same numerical values with corresponding REQ_F_*, safe to copy */ @@ -8234,7 +8247,8 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) { struct files_struct *files = data; - return !files || work->files == files; + return !files || ((work->flags & IO_WQ_WORK_FILES) && + work->files == files); } /* @@ -8389,7 +8403,8 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (files && req->work.files != files) + if (files && (req->work.flags & IO_WQ_WORK_FILES) && + req->work.files != files) continue; /* req is being completed, ignore */ if (!refcount_inc_not_zero(&req->refs)) From 98447d65b4a7a59f8ea37dc6e5d743247d9a7b01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Oct 2020 10:48:51 -0600 Subject: [PATCH 12/21] io_uring: move io identity items into separate struct io-wq contains a pointer to the identity, which we just hold in io_kiocb for now. This is in preparation for putting this outside io_kiocb. The only exception is struct files_struct, which we'll need different rules for to avoid a circular dependency. No functional changes in this patch. 
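For reference, the resulting layout (as introduced by the hunks below in include/linux/io_uring.h and fs/io-wq.h) is:

    struct io_identity {
    	struct files_struct		*files;
    	struct mm_struct		*mm;
    #ifdef CONFIG_BLK_CGROUP
    	struct cgroup_subsys_state	*blkcg_css;
    #endif
    	const struct cred		*creds;
    	struct nsproxy			*nsproxy;
    	struct fs_struct		*fs;
    	unsigned long			fsize;
    };

    struct io_wq_work {
    	struct io_wq_work_node	list;
    	struct io_identity	*identity;
    	unsigned		flags;
    };

(The refcount_t count member is added to io_identity by the following patch.)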
Signed-off-by: Jens Axboe --- fs/io-wq.c | 34 +++++++++++----------- fs/io-wq.h | 12 ++------ fs/io_uring.c | 62 ++++++++++++++++++++-------------------- include/linux/io_uring.h | 13 ++++++++- 4 files changed, 64 insertions(+), 57 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index b7d8e544a804..0c852b75384d 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -430,9 +430,9 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) worker->mm = NULL; } - if (mmget_not_zero(work->mm)) { - kthread_use_mm(work->mm); - worker->mm = work->mm; + if (mmget_not_zero(work->identity->mm)) { + kthread_use_mm(work->identity->mm); + worker->mm = work->identity->mm; return; } @@ -446,9 +446,9 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker, #ifdef CONFIG_BLK_CGROUP if (!(work->flags & IO_WQ_WORK_BLKCG)) return; - if (work->blkcg_css != worker->blkcg_css) { - kthread_associate_blkcg(work->blkcg_css); - worker->blkcg_css = work->blkcg_css; + if (work->identity->blkcg_css != worker->blkcg_css) { + kthread_associate_blkcg(work->identity->blkcg_css); + worker->blkcg_css = work->identity->blkcg_css; } #endif } @@ -456,9 +456,9 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker, static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { - const struct cred *old_creds = override_creds(work->creds); + const struct cred *old_creds = override_creds(work->identity->creds); - worker->cur_creds = work->creds; + worker->cur_creds = work->identity->creds; if (worker->saved_creds) put_cred(old_creds); /* creds set by previous switch */ else @@ -468,19 +468,21 @@ static void io_wq_switch_creds(struct io_worker *worker, static void io_impersonate_work(struct io_worker *worker, struct io_wq_work *work) { - if ((work->flags & IO_WQ_WORK_FILES) && current->files != work->files) { + if ((work->flags & IO_WQ_WORK_FILES) && + current->files != work->identity->files) { task_lock(current); - current->files = work->files; - current->nsproxy = work->nsproxy; + current->files = work->identity->files; + current->nsproxy = work->identity->nsproxy; task_unlock(current); } - if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->fs) - current->fs = work->fs; - if ((work->flags & IO_WQ_WORK_MM) && work->mm != worker->mm) + if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) + current->fs = work->identity->fs; + if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm) io_wq_switch_mm(worker, work); - if ((work->flags & IO_WQ_WORK_CREDS) && worker->cur_creds != work->creds) + if ((work->flags & IO_WQ_WORK_CREDS) && + worker->cur_creds != work->identity->creds) io_wq_switch_creds(worker, work); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; io_wq_switch_blkcg(worker, work); } diff --git a/fs/io-wq.h b/fs/io-wq.h index 31a29023605a..be21c500c925 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -1,6 +1,8 @@ #ifndef INTERNAL_IO_WQ_H #define INTERNAL_IO_WQ_H +#include + struct io_wq; enum { @@ -91,15 +93,7 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - struct files_struct *files; - struct mm_struct *mm; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *blkcg_css; -#endif - const struct cred *creds; - struct nsproxy *nsproxy; - struct fs_struct *fs; - unsigned long fsize; + struct io_identity *identity; unsigned flags; }; diff --git a/fs/io_uring.c b/fs/io_uring.c index 
e92ed22ef924..bd6fd51302ed 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -689,6 +689,7 @@ struct io_kiocb { struct hlist_node hash_node; struct async_poll *apoll; struct io_wq_work work; + struct io_identity identity; }; struct io_defer_entry { @@ -1050,6 +1051,7 @@ static inline void io_req_init_async(struct io_kiocb *req) memset(&req->work, 0, sizeof(req->work)); req->flags |= REQ_F_WORK_INITIALIZED; + req->work.identity = &req->identity; } static inline bool io_async_submit(struct io_ring_ctx *ctx) @@ -1163,26 +1165,26 @@ static void io_req_clean_work(struct io_kiocb *req) req->flags &= ~REQ_F_WORK_INITIALIZED; if (req->work.flags & IO_WQ_WORK_MM) { - mmdrop(req->work.mm); + mmdrop(req->work.identity->mm); req->work.flags &= ~IO_WQ_WORK_MM; } #ifdef CONFIG_BLK_CGROUP if (req->work.flags & IO_WQ_WORK_BLKCG) { - css_put(req->work.blkcg_css); + css_put(req->work.identity->blkcg_css); req->work.flags &= ~IO_WQ_WORK_BLKCG; } #endif if (req->work.flags & IO_WQ_WORK_CREDS) { - put_cred(req->work.creds); + put_cred(req->work.identity->creds); req->work.flags &= ~IO_WQ_WORK_CREDS; } if (req->work.flags & IO_WQ_WORK_FS) { - struct fs_struct *fs = req->work.fs; + struct fs_struct *fs = req->work.identity->fs; - spin_lock(&req->work.fs->lock); + spin_lock(&req->work.identity->fs->lock); if (--fs->users) fs = NULL; - spin_unlock(&req->work.fs->lock); + spin_unlock(&req->work.identity->fs->lock); if (fs) free_fs_struct(fs); req->work.flags &= ~IO_WQ_WORK_FS; @@ -1206,9 +1208,9 @@ static void io_prep_async_work(struct io_kiocb *req) if (!(req->work.flags & IO_WQ_WORK_FILES) && (io_op_defs[req->opcode].work_flags & IO_WQ_WORK_FILES) && !(req->flags & REQ_F_NO_FILE_TABLE)) { - req->work.files = get_files_struct(current); + req->work.identity->files = get_files_struct(current); get_nsproxy(current->nsproxy); - req->work.nsproxy = current->nsproxy; + req->work.identity->nsproxy = current->nsproxy; req->flags |= REQ_F_INFLIGHT; spin_lock_irq(&ctx->inflight_lock); @@ -1219,35 +1221,33 @@ static void io_prep_async_work(struct io_kiocb *req) if (!(req->work.flags & IO_WQ_WORK_MM) && (def->work_flags & IO_WQ_WORK_MM)) { mmgrab(current->mm); - req->work.mm = current->mm; + req->work.identity->mm = current->mm; req->work.flags |= IO_WQ_WORK_MM; } #ifdef CONFIG_BLK_CGROUP if (!(req->work.flags & IO_WQ_WORK_BLKCG) && (def->work_flags & IO_WQ_WORK_BLKCG)) { rcu_read_lock(); - req->work.blkcg_css = blkcg_css(); + req->work.identity->blkcg_css = blkcg_css(); /* * This should be rare, either the cgroup is dying or the task * is moving cgroups. Just punt to root for the handful of ios. 
*/ - if (!css_tryget_online(req->work.blkcg_css)) - req->work.blkcg_css = NULL; - else + if (css_tryget_online(req->work.identity->blkcg_css)) req->work.flags |= IO_WQ_WORK_BLKCG; rcu_read_unlock(); } #endif if (!(req->work.flags & IO_WQ_WORK_CREDS)) { - req->work.creds = get_current_cred(); + req->work.identity->creds = get_current_cred(); req->work.flags |= IO_WQ_WORK_CREDS; } if (!(req->work.flags & IO_WQ_WORK_FS) && (def->work_flags & IO_WQ_WORK_FS)) { spin_lock(¤t->fs->lock); if (!current->fs->in_exec) { - req->work.fs = current->fs; - req->work.fs->users++; + req->work.identity->fs = current->fs; + req->work.identity->fs->users++; req->work.flags |= IO_WQ_WORK_FS; } else { req->work.flags |= IO_WQ_WORK_CANCEL; @@ -1255,9 +1255,9 @@ static void io_prep_async_work(struct io_kiocb *req) spin_unlock(¤t->fs->lock); } if (def->needs_fsize) - req->work.fsize = rlimit(RLIMIT_FSIZE); + req->work.identity->fsize = rlimit(RLIMIT_FSIZE); else - req->work.fsize = RLIM_INFINITY; + req->work.identity->fsize = RLIM_INFINITY; } static void io_prep_async_link(struct io_kiocb *req) @@ -1449,7 +1449,7 @@ static inline bool io_match_files(struct io_kiocb *req, return true; if ((req->flags & REQ_F_WORK_INITIALIZED) && (req->work.flags & IO_WQ_WORK_FILES)) - return req->work.files == files; + return req->work.identity->files == files; return false; } @@ -4089,7 +4089,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock, } /* No ->flush() or already async, safely close from here */ - ret = filp_close(close->put_file, req->work.files); + ret = filp_close(close->put_file, req->work.identity->files); if (ret < 0) req_set_fail_links(req); fput(close->put_file); @@ -5703,8 +5703,8 @@ static void io_req_drop_files(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); req->flags &= ~REQ_F_INFLIGHT; - put_files_struct(req->work.files); - put_nsproxy(req->work.nsproxy); + put_files_struct(req->work.identity->files); + put_nsproxy(req->work.identity->nsproxy); req->work.flags &= ~IO_WQ_WORK_FILES; } @@ -6063,14 +6063,14 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs) again: linked_timeout = io_prep_linked_timeout(req); - if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds && - req->work.creds != current_cred()) { + if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.identity->creds && + req->work.identity->creds != current_cred()) { if (old_creds) revert_creds(old_creds); - if (old_creds == req->work.creds) + if (old_creds == req->work.identity->creds) old_creds = NULL; /* restored original creds */ else - old_creds = override_creds(req->work.creds); + old_creds = override_creds(req->work.identity->creds); req->work.flags |= IO_WQ_WORK_CREDS; } @@ -6375,10 +6375,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, id = READ_ONCE(sqe->personality); if (id) { io_req_init_async(req); - req->work.creds = idr_find(&ctx->personality_idr, id); - if (unlikely(!req->work.creds)) + req->work.identity->creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!req->work.identity->creds)) return -EINVAL; - get_cred(req->work.creds); + get_cred(req->work.identity->creds); req->work.flags |= IO_WQ_WORK_CREDS; } @@ -8248,7 +8248,7 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) struct files_struct *files = data; return !files || ((work->flags & IO_WQ_WORK_FILES) && - work->files == files); + work->identity->files == files); } /* @@ -8404,7 +8404,7 @@ static bool 
io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { if (files && (req->work.flags & IO_WQ_WORK_FILES) && - req->work.files != files) + req->work.identity->files != files) continue; /* req is being completed, ignore */ if (!refcount_inc_not_zero(&req->refs)) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 96315cfaf6d1..352aa6bbd36b 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -4,7 +4,18 @@ #include #include -#include + +struct io_identity { + struct files_struct *files; + struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif + const struct cred *creds; + struct nsproxy *nsproxy; + struct fs_struct *fs; + unsigned long fsize; +}; struct io_uring_task { /* submission side */ From 1e6fa5216a0e59ef02e8b6b40d553238a3b81d49 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 08:46:24 -0600 Subject: [PATCH 13/21] io_uring: COW io_identity on mismatch If the io_identity doesn't completely match the task, then create a copy of it and use that. The existing copy remains valid until the last user of it has gone away. This also changes the personality lookup to be indexed by io_identity, instead of creds directly. Signed-off-by: Jens Axboe --- fs/io_uring.c | 258 ++++++++++++++++++++++++++++----------- include/linux/io_uring.h | 1 + 2 files changed, 189 insertions(+), 70 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bd6fd51302ed..ab30834c275f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1040,6 +1040,27 @@ static inline void req_set_fail_links(struct io_kiocb *req) req->flags |= REQ_F_FAIL_LINK; } +/* + * None of these are dereferenced, they are simply used to check if any of + * them have changed. If we're under current and check they are still the + * same, we're fine to grab references to them for actual out-of-line use. + */ +static void io_init_identity(struct io_identity *id) +{ + id->files = current->files; + id->mm = current->mm; +#ifdef CONFIG_BLK_CGROUP + rcu_read_lock(); + id->blkcg_css = blkcg_css(); + rcu_read_unlock(); +#endif + id->creds = current_cred(); + id->nsproxy = current->nsproxy; + id->fs = current->fs; + id->fsize = rlimit(RLIMIT_FSIZE); + refcount_set(&id->count, 1); +} + /* * Note: must call io_req_init_async() for the first time you * touch any members of io_wq_work. @@ -1051,6 +1072,7 @@ static inline void io_req_init_async(struct io_kiocb *req) memset(&req->work, 0, sizeof(req->work)); req->flags |= REQ_F_WORK_INITIALIZED; + io_init_identity(&req->identity); req->work.identity = &req->identity; } @@ -1157,6 +1179,14 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } +static void io_put_identity(struct io_kiocb *req) +{ + if (req->work.identity == &req->identity) + return; + if (refcount_dec_and_test(&req->work.identity->count)) + kfree(req->work.identity); +} + static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) @@ -1189,11 +1219,118 @@ static void io_req_clean_work(struct io_kiocb *req) free_fs_struct(fs); req->work.flags &= ~IO_WQ_WORK_FS; } + + io_put_identity(req); +} + +/* + * Create a private copy of io_identity, since some fields don't match + * the current context. 
+ */ +static bool io_identity_cow(struct io_kiocb *req) +{ + const struct cred *creds = NULL; + struct io_identity *id; + + if (req->work.flags & IO_WQ_WORK_CREDS) + creds = req->work.identity->creds; + + id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL); + if (unlikely(!id)) { + req->work.flags |= IO_WQ_WORK_CANCEL; + return false; + } + + /* + * We can safely just re-init the creds we copied Either the field + * matches the current one, or we haven't grabbed it yet. The only + * exception is ->creds, through registered personalities, so handle + * that one separately. + */ + io_init_identity(id); + if (creds) + req->work.identity->creds = creds; + + /* add one for this request */ + refcount_inc(&id->count); + + /* drop old identity, assign new one. one ref for req, one for tctx */ + if (req->work.identity != &req->identity && + refcount_sub_and_test(2, &req->work.identity->count)) + kfree(req->work.identity); + + req->work.identity = id; + return true; +} + +static bool io_grab_identity(struct io_kiocb *req) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_identity *id = &req->identity; + struct io_ring_ctx *ctx = req->ctx; + + if (def->needs_fsize && id->fsize != rlimit(RLIMIT_FSIZE)) + return false; + + if (!(req->work.flags & IO_WQ_WORK_FILES) && + (def->work_flags & IO_WQ_WORK_FILES) && + !(req->flags & REQ_F_NO_FILE_TABLE)) { + if (id->files != current->files || + id->nsproxy != current->nsproxy) + return false; + atomic_inc(&id->files->count); + get_nsproxy(id->nsproxy); + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + req->work.flags |= IO_WQ_WORK_FILES; + } +#ifdef CONFIG_BLK_CGROUP + if (!(req->work.flags & IO_WQ_WORK_BLKCG) && + (def->work_flags & IO_WQ_WORK_BLKCG)) { + rcu_read_lock(); + if (id->blkcg_css != blkcg_css()) { + rcu_read_unlock(); + return false; + } + /* + * This should be rare, either the cgroup is dying or the task + * is moving cgroups. Just punt to root for the handful of ios. 
+ */ + if (css_tryget_online(id->blkcg_css)) + req->work.flags |= IO_WQ_WORK_BLKCG; + rcu_read_unlock(); + } +#endif + if (!(req->work.flags & IO_WQ_WORK_CREDS)) { + if (id->creds != current_cred()) + return false; + get_cred(id->creds); + req->work.flags |= IO_WQ_WORK_CREDS; + } + if (!(req->work.flags & IO_WQ_WORK_FS) && + (def->work_flags & IO_WQ_WORK_FS)) { + if (current->fs != id->fs) + return false; + spin_lock(&id->fs->lock); + if (!id->fs->in_exec) { + id->fs->users++; + req->work.flags |= IO_WQ_WORK_FS; + } else { + req->work.flags |= IO_WQ_WORK_CANCEL; + } + spin_unlock(¤t->fs->lock); + } + + return true; } static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_identity *id = &req->identity; struct io_ring_ctx *ctx = req->ctx; io_req_init_async(req); @@ -1205,59 +1342,24 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - if (!(req->work.flags & IO_WQ_WORK_FILES) && - (io_op_defs[req->opcode].work_flags & IO_WQ_WORK_FILES) && - !(req->flags & REQ_F_NO_FILE_TABLE)) { - req->work.identity->files = get_files_struct(current); - get_nsproxy(current->nsproxy); - req->work.identity->nsproxy = current->nsproxy; - req->flags |= REQ_F_INFLIGHT; - spin_lock_irq(&ctx->inflight_lock); - list_add(&req->inflight_entry, &ctx->inflight_list); - spin_unlock_irq(&ctx->inflight_lock); - req->work.flags |= IO_WQ_WORK_FILES; - } + /* ->mm can never change on us */ if (!(req->work.flags & IO_WQ_WORK_MM) && (def->work_flags & IO_WQ_WORK_MM)) { - mmgrab(current->mm); - req->work.identity->mm = current->mm; + mmgrab(id->mm); req->work.flags |= IO_WQ_WORK_MM; } -#ifdef CONFIG_BLK_CGROUP - if (!(req->work.flags & IO_WQ_WORK_BLKCG) && - (def->work_flags & IO_WQ_WORK_BLKCG)) { - rcu_read_lock(); - req->work.identity->blkcg_css = blkcg_css(); - /* - * This should be rare, either the cgroup is dying or the task - * is moving cgroups. Just punt to root for the handful of ios. 
- */ - if (css_tryget_online(req->work.identity->blkcg_css)) - req->work.flags |= IO_WQ_WORK_BLKCG; - rcu_read_unlock(); - } -#endif - if (!(req->work.flags & IO_WQ_WORK_CREDS)) { - req->work.identity->creds = get_current_cred(); - req->work.flags |= IO_WQ_WORK_CREDS; - } - if (!(req->work.flags & IO_WQ_WORK_FS) && - (def->work_flags & IO_WQ_WORK_FS)) { - spin_lock(¤t->fs->lock); - if (!current->fs->in_exec) { - req->work.identity->fs = current->fs; - req->work.identity->fs->users++; - req->work.flags |= IO_WQ_WORK_FS; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(¤t->fs->lock); - } - if (def->needs_fsize) - req->work.identity->fsize = rlimit(RLIMIT_FSIZE); - else - req->work.identity->fsize = RLIM_INFINITY; + + /* if we fail grabbing identity, we must COW, regrab, and retry */ + if (io_grab_identity(req)) + return; + + if (!io_identity_cow(req)) + return; + + /* can't fail at this point */ + if (!io_grab_identity(req)) + WARN_ON(1); } static void io_prep_async_link(struct io_kiocb *req) @@ -1696,12 +1798,10 @@ static void io_dismantle_req(struct io_kiocb *req) static void __io_free_req(struct io_kiocb *req) { - struct io_uring_task *tctx; - struct io_ring_ctx *ctx; + struct io_uring_task *tctx = req->task->io_uring; + struct io_ring_ctx *ctx = req->ctx; io_dismantle_req(req); - tctx = req->task->io_uring; - ctx = req->ctx; atomic_long_inc(&tctx->req_complete); if (tctx->in_idle) @@ -6374,11 +6474,16 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, id = READ_ONCE(sqe->personality); if (id) { + struct io_identity *iod; + io_req_init_async(req); - req->work.identity->creds = idr_find(&ctx->personality_idr, id); - if (unlikely(!req->work.identity->creds)) + iod = idr_find(&ctx->personality_idr, id); + if (unlikely(!iod)) return -EINVAL; - get_cred(req->work.identity->creds); + refcount_inc(&iod->count); + io_put_identity(req); + get_cred(iod->creds); + req->work.identity = iod; req->work.flags |= IO_WQ_WORK_CREDS; } @@ -8171,11 +8276,14 @@ static int io_uring_fasync(int fd, struct file *file, int on) static int io_remove_personalities(int id, void *p, void *data) { struct io_ring_ctx *ctx = data; - const struct cred *cred; + struct io_identity *iod; - cred = idr_remove(&ctx->personality_idr, id); - if (cred) - put_cred(cred); + iod = idr_remove(&ctx->personality_idr, id); + if (iod) { + put_cred(iod->creds); + if (refcount_dec_and_test(&iod->count)) + kfree(iod); + } return 0; } @@ -9245,23 +9353,33 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { - const struct cred *creds = get_current_cred(); - int id; + struct io_identity *id; + int ret; - id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, - USHRT_MAX, GFP_KERNEL); - if (id < 0) - put_cred(creds); - return id; + id = kmalloc(sizeof(*id), GFP_KERNEL); + if (unlikely(!id)) + return -ENOMEM; + + io_init_identity(id); + id->creds = get_current_cred(); + + ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL); + if (ret < 0) { + put_cred(id->creds); + kfree(id); + } + return ret; } static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { - const struct cred *old_creds; + struct io_identity *iod; - old_creds = idr_remove(&ctx->personality_idr, id); - if (old_creds) { - put_cred(old_creds); + iod = idr_remove(&ctx->personality_idr, id); + if (iod) { + put_cred(iod->creds); + if (refcount_dec_and_test(&iod->count)) + kfree(iod); return 0; } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 
352aa6bbd36b..342cc574d5c0 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,6 +15,7 @@ struct io_identity { struct nsproxy *nsproxy; struct fs_struct *fs; unsigned long fsize; + refcount_t count; }; struct io_uring_task { From 5c3462cfd123b341c9d3c947c1a2bab373f1697f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 09:02:33 -0600 Subject: [PATCH 14/21] io_uring: store io_identity in io_uring_task This is, by definition, a per-task structure. So store it in the task context, instead of carrying it in each io_kiocb. We're being a bit inefficient if members have changed, as that requires an alloc and copy of a new io_identity struct. The next patch will fix that up. Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 +++++++++++---------- include/linux/io_uring.h | 1 + 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ab30834c275f..ae91632b8bf9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -689,7 +689,6 @@ struct io_kiocb { struct hlist_node hash_node; struct async_poll *apoll; struct io_wq_work work; - struct io_identity identity; }; struct io_defer_entry { @@ -1072,8 +1071,7 @@ static inline void io_req_init_async(struct io_kiocb *req) memset(&req->work, 0, sizeof(req->work)); req->flags |= REQ_F_WORK_INITIALIZED; - io_init_identity(&req->identity); - req->work.identity = &req->identity; + req->work.identity = &current->io_uring->identity; } static inline bool io_async_submit(struct io_ring_ctx *ctx) @@ -1179,9 +1177,9 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static void io_put_identity(struct io_kiocb *req) +static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) { - if (req->work.identity == &req->identity) + if (req->work.identity == &tctx->identity) return; if (refcount_dec_and_test(&req->work.identity->count)) kfree(req->work.identity); @@ -1220,7 +1218,7 @@ static void io_req_clean_work(struct io_kiocb *req) req->work.flags &= ~IO_WQ_WORK_FS; } - io_put_identity(req); + io_put_identity(req->task->io_uring, req); } /* @@ -1229,6 +1227,7 @@ */ static bool io_identity_cow(struct io_kiocb *req) { + struct io_uring_task *tctx = current->io_uring; const struct cred *creds = NULL; struct io_identity *id; @@ -1255,7 +1254,7 @@ static bool io_identity_cow(struct io_kiocb *req) refcount_inc(&id->count); /* drop old identity, assign new one.
one ref for req, one for tctx */ - if (req->work.identity != &req->identity && + if (req->work.identity != &tctx->identity && refcount_sub_and_test(2, &req->work.identity->count)) kfree(req->work.identity); @@ -1266,7 +1265,7 @@ static bool io_identity_cow(struct io_kiocb *req) static bool io_grab_identity(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_identity *id = &req->identity; + struct io_identity *id = req->work.identity; struct io_ring_ctx *ctx = req->ctx; if (def->needs_fsize && id->fsize != rlimit(RLIMIT_FSIZE)) @@ -1330,10 +1329,11 @@ static bool io_grab_identity(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_identity *id = &req->identity; struct io_ring_ctx *ctx = req->ctx; + struct io_identity *id; io_req_init_async(req); + id = req->work.identity; if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) @@ -6481,7 +6481,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(!iod)) return -EINVAL; refcount_inc(&iod->count); - io_put_identity(req); + io_put_identity(current->io_uring, req); get_cred(iod->creds); req->work.identity = iod; req->work.flags |= IO_WQ_WORK_CREDS; @@ -7691,6 +7691,7 @@ static int io_uring_alloc_task_context(struct task_struct *task) tctx->in_idle = 0; atomic_long_set(&tctx->req_issue, 0); atomic_long_set(&tctx->req_complete, 0); + io_init_identity(&tctx->identity); task->io_uring = tctx; return 0; } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 342cc574d5c0..bd3346194bca 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -24,6 +24,7 @@ struct io_uring_task { struct wait_queue_head wait; struct file *last; atomic_long_t req_issue; + struct io_identity identity; /* completion side */ bool in_idle ____cacheline_aligned_in_smp; From 500a373d731ac506612db12631ec21295c1ff360 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 17:38:03 -0600 Subject: [PATCH 15/21] io_uring: assign new io_identity for task if members have changed This avoids doing a copy for each new async IO, if some parts of the io_identity has changed. We avoid reference counting for the normal fast path of nothing ever changing. 
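As a rough illustration of this copy-on-write scheme, here is a standalone userspace sketch in plain C (illustrative names only, not the kernel implementation): each task keeps one static identity that is never reference counted, and only a changed identity is heap-allocated, refcounted and pinned by requests.

        #include <stdatomic.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct identity {
                int creds;              /* stand-in for creds, mm, fs, ... */
                atomic_int count;       /* only meaningful for heap copies */
        };

        struct task_ctx {
                struct identity __identity;     /* static identity, never freed */
                struct identity *identity;      /* what new requests point at */
        };

        /* a member changed: allocate a refcounted copy (the COW step) */
        static struct identity *identity_cow(struct task_ctx *t, int new_creds)
        {
                struct identity *id = malloc(sizeof(*id));

                if (!id)
                        return NULL;
                id->creds = new_creds;
                atomic_init(&id->count, 1);
                t->identity = id;
                return id;
        }

        static void identity_put(struct task_ctx *t, struct identity *id)
        {
                /* fast path: the static identity needs no refcounting at all */
                if (id == &t->__identity)
                        return;
                if (atomic_fetch_sub(&id->count, 1) == 1)
                        free(id);
        }

        int main(void)
        {
                struct task_ctx t = { .__identity = { .creds = 1000 } };

                t.identity = &t.__identity;
                identity_put(&t, t.identity);                   /* no-op */

                struct identity *copy = identity_cow(&t, 0);    /* creds changed */
                if (!copy)
                        return 1;
                printf("%s identity in use\n",
                       t.identity == &t.__identity ? "static" : "copied");
                identity_put(&t, copy);                         /* freed at zero */
                return 0;
        }

The common case stays a single pointer comparison followed by an early return, which is the fast path the description above refers to.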
Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 +++++++++++++++---- include/linux/io_uring.h | 3 ++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ae91632b8bf9..7020c6a72231 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1066,12 +1066,18 @@ static void io_init_identity(struct io_identity *id) */ static inline void io_req_init_async(struct io_kiocb *req) { + struct io_uring_task *tctx = current->io_uring; + if (req->flags & REQ_F_WORK_INITIALIZED) return; memset(&req->work, 0, sizeof(req->work)); req->flags |= REQ_F_WORK_INITIALIZED; - req->work.identity = ¤t->io_uring->identity; + + /* Grab a ref if this isn't our static identity */ + req->work.identity = tctx->identity; + if (tctx->identity != &tctx->__identity) + refcount_inc(&req->work.identity->count); } static inline bool io_async_submit(struct io_ring_ctx *ctx) @@ -1179,7 +1185,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) { - if (req->work.identity == &tctx->identity) + if (req->work.identity == &tctx->__identity) return; if (refcount_dec_and_test(&req->work.identity->count)) kfree(req->work.identity); @@ -1254,11 +1260,12 @@ static bool io_identity_cow(struct io_kiocb *req) refcount_inc(&id->count); /* drop old identity, assign new one. one ref for req, one for tctx */ - if (req->work.identity != &tctx->identity && + if (req->work.identity != tctx->identity && refcount_sub_and_test(2, &req->work.identity->count)) kfree(req->work.identity); req->work.identity = id; + tctx->identity = id; return true; } @@ -7691,7 +7698,8 @@ static int io_uring_alloc_task_context(struct task_struct *task) tctx->in_idle = 0; atomic_long_set(&tctx->req_issue, 0); atomic_long_set(&tctx->req_complete, 0); - io_init_identity(&tctx->identity); + io_init_identity(&tctx->__identity); + tctx->identity = &tctx->__identity; task->io_uring = tctx; return 0; } @@ -7701,6 +7709,9 @@ void __io_uring_free(struct task_struct *tsk) struct io_uring_task *tctx = tsk->io_uring; WARN_ON_ONCE(!xa_empty(&tctx->xa)); + WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1); + if (tctx->identity != &tctx->__identity) + kfree(tctx->identity); kfree(tctx); tsk->io_uring = NULL; } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index bd3346194bca..607d14f61132 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -24,7 +24,8 @@ struct io_uring_task { struct wait_queue_head wait; struct file *last; atomic_long_t req_issue; - struct io_identity identity; + struct io_identity __identity; + struct io_identity *identity; /* completion side */ bool in_idle ____cacheline_aligned_in_smp; From d8a6df10aac9f2e4d5f30aff3129d552d2984ce7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 16:24:45 -0600 Subject: [PATCH 16/21] io_uring: use percpu counters to track inflight requests Even though we place the req_issued and req_complete in separate cachelines, there's considerable overhead in doing the atomics particularly on the completion side. Get rid of having the two counters, and just use a percpu_counter for this. That's what it was made for, after all. This considerably reduces the overhead in __io_free_req(). 
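The idea behind the switch is easiest to see in a small userspace sketch (this shows the general per-cpu counter pattern, not the kernel's percpu_counter, which additionally batches shard updates into a central count): every CPU updates only its own shard, and an exact sum is computed just for the rare idle check.

        #include <stdio.h>

        #define NR_SHARDS 4     /* stand-in for the number of CPUs */

        struct pcpu_counter {
                /* a real implementation pads shards to separate cache lines */
                long shard[NR_SHARDS];
        };

        static void counter_add(struct pcpu_counter *c, int cpu, long delta)
        {
                c->shard[cpu] += delta; /* cheap, no shared cacheline bouncing */
        }

        static long counter_sum(struct pcpu_counter *c)
        {
                long sum = 0;

                for (int i = 0; i < NR_SHARDS; i++)
                        sum += c->shard[i];     /* slow path, exact total */
                return sum;
        }

        int main(void)
        {
                struct pcpu_counter inflight = { { 0 } };

                counter_add(&inflight, 0, 3);   /* submit 3 requests on CPU 0 */
                counter_add(&inflight, 2, -3);  /* they complete on CPU 2 */
                printf("inflight: %ld\n", counter_sum(&inflight));      /* 0 -> idle */
                return 0;
        }

Submission and completion both stay on the cheap per-shard path; only the cancel/exit code needs the summed total.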
Signed-off-by: Jens Axboe --- fs/io_uring.c | 50 ++++++++++++++++++++++------------------ include/linux/io_uring.h | 7 ++---- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7020c6a72231..58c445b95085 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1810,7 +1810,7 @@ static void __io_free_req(struct io_kiocb *req) io_dismantle_req(req); - atomic_long_inc(&tctx->req_complete); + percpu_counter_dec(&tctx->inflight); if (tctx->in_idle) wake_up(&tctx->wait); put_task_struct(req->task); @@ -2089,7 +2089,9 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, if (rb->to_free) __io_req_free_batch_flush(ctx, rb); if (rb->task) { - atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); + struct io_uring_task *tctx = rb->task->io_uring; + + percpu_counter_sub(&tctx->inflight, rb->task_refs); put_task_struct_many(rb->task, rb->task_refs); rb->task = NULL; } @@ -2106,7 +2108,9 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) if (req->task != rb->task) { if (rb->task) { - atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); + struct io_uring_task *tctx = rb->task->io_uring; + + percpu_counter_sub(&tctx->inflight, rb->task_refs); put_task_struct_many(rb->task, rb->task_refs); } rb->task = req->task; @@ -6524,7 +6528,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - atomic_long_add(nr, &current->io_uring->req_issue); + percpu_counter_add(&current->io_uring->inflight, nr); refcount_add(nr, &current->usage); io_submit_state_start(&state, ctx, nr); @@ -6566,10 +6570,12 @@ fail_req: if (unlikely(submitted != nr)) { int ref_used = (submitted == -EAGAIN) ? 0 : submitted; + struct io_uring_task *tctx = current->io_uring; + int unused = nr - ref_used; - percpu_ref_put_many(&ctx->refs, nr - ref_used); - atomic_long_sub(nr - ref_used, &current->io_uring->req_issue); - put_task_struct_many(current, nr - ref_used); + percpu_ref_put_many(&ctx->refs, unused); + percpu_counter_sub(&tctx->inflight, unused); + put_task_struct_many(current, unused); } if (link) io_queue_link_head(link, &state.comp); @@ -7687,17 +7693,22 @@ out_fput: static int io_uring_alloc_task_context(struct task_struct *task) { struct io_uring_task *tctx; + int ret; tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); if (unlikely(!tctx)) return -ENOMEM; + ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); + if (unlikely(ret)) { + kfree(tctx); + return ret; + } + xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; tctx->in_idle = 0; - atomic_long_set(&tctx->req_issue, 0); - atomic_long_set(&tctx->req_complete, 0); io_init_identity(&tctx->__identity); tctx->identity = &tctx->__identity; task->io_uring = tctx; @@ -7712,6 +7723,7 @@ void __io_uring_free(struct task_struct *tsk) WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1); if (tctx->identity != &tctx->__identity) kfree(tctx->identity); + percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; } @@ -8696,12 +8708,6 @@ void __io_uring_files_cancel(struct files_struct *files) } } -static inline bool io_uring_task_idle(struct io_uring_task *tctx) -{ - return atomic_long_read(&tctx->req_issue) == - atomic_long_read(&tctx->req_complete); -} - /* * Find any io_uring fd that this task has registered or done IO on, and cancel * requests.
@@ -8710,14 +8716,16 @@ void __io_uring_task_cancel(void) { struct io_uring_task *tctx = current->io_uring; DEFINE_WAIT(wait); - long completions; + s64 inflight; /* make sure overflow events are dropped */ tctx->in_idle = true; - while (!io_uring_task_idle(tctx)) { + do { /* read completions before cancelations */ - completions = atomic_long_read(&tctx->req_complete); + inflight = percpu_counter_sum(&tctx->inflight); + if (!inflight) + break; __io_uring_files_cancel(NULL); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); @@ -8726,12 +8734,10 @@ void __io_uring_task_cancel(void) * If we've seen completions, retry. This avoids a race where * a completion comes in before we did prepare_to_wait(). */ - if (completions != atomic_long_read(&tctx->req_complete)) + if (inflight != percpu_counter_sum(&tctx->inflight)) continue; - if (io_uring_task_idle(tctx)) - break; schedule(); - } + } while (1); finish_wait(&tctx->wait, &wait); tctx->in_idle = false; diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 607d14f61132..28939820b6b0 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -23,13 +23,10 @@ struct io_uring_task { struct xarray xa; struct wait_queue_head wait; struct file *last; - atomic_long_t req_issue; + struct percpu_counter inflight; struct io_identity __identity; struct io_identity *identity; - - /* completion side */ - bool in_idle ____cacheline_aligned_in_smp; - atomic_long_t req_complete; + bool in_idle; }; #if defined(CONFIG_IO_URING) From 4ea33a976bfe79293965d0815e1914e4b6e58967 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 13:46:44 -0600 Subject: [PATCH 17/21] io-wq: inherit audit loginuid and sessionid Make sure the async io-wq workers inherit the loginuid and sessionid from the original task, and restore them to unset once we're done with the async work item. While at it, disable the ability for kernel threads to write to their own loginuid. 
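The inherit-and-restore pattern, shown as a standalone userspace sketch (the struct, the UID_UNSET/SID_UNSET sentinels and the worker variable are invented for the example, they are not the kernel's definitions):

        #include <stdio.h>

        #define UID_UNSET ((unsigned int)-1)
        #define SID_UNSET ((unsigned int)-1)

        struct audit_identity {
                unsigned int loginuid;
                unsigned int sessionid;
        };

        static struct audit_identity worker = { UID_UNSET, SID_UNSET };

        static void run_work(const struct audit_identity *submitter)
        {
                /* impersonate the submitter while the work item runs */
                worker = *submitter;
                printf("work runs as loginuid=%u sessionid=%u\n",
                       worker.loginuid, worker.sessionid);

                /* drop the identity again so nothing leaks to the next item */
                worker.loginuid = UID_UNSET;
                worker.sessionid = SID_UNSET;
        }

        int main(void)
        {
                struct audit_identity task = { .loginuid = 1000, .sessionid = 7 };

                run_work(&task);
                printf("idle worker: loginuid=%u sessionid=%u\n",
                       worker.loginuid, worker.sessionid);
                return 0;
        }

Resetting to the sentinels after each item keeps one submitter's audit identity from leaking into unrelated work on the same worker.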
Signed-off-by: Jens Axboe --- fs/io-wq.c | 10 ++++++++++ fs/io_uring.c | 24 +++++++++++++++++++++++- fs/proc/base.c | 4 ++++ include/linux/io_uring.h | 4 ++++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 0c852b75384d..7cb3b4cb9b11 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "io-wq.h" @@ -484,6 +485,10 @@ static void io_impersonate_work(struct io_worker *worker, io_wq_switch_creds(worker, work); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; io_wq_switch_blkcg(worker, work); +#ifdef CONFIG_AUDIT + current->loginuid = work->identity->loginuid; + current->sessionid = work->identity->sessionid; +#endif } static void io_assign_current_work(struct io_worker *worker, @@ -496,6 +501,11 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } +#ifdef CONFIG_AUDIT + current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET); + current->sessionid = AUDIT_SID_UNSET; +#endif + spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); diff --git a/fs/io_uring.c b/fs/io_uring.c index 58c445b95085..b9ffe98f18bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -81,6 +81,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -327,6 +328,11 @@ struct io_ring_ctx { const struct cred *creds; +#ifdef CONFIG_AUDIT + kuid_t loginuid; + unsigned int sessionid; +#endif + struct completion ref_comp; struct completion sq_thread_comp; @@ -1057,6 +1063,10 @@ static void io_init_identity(struct io_identity *id) id->nsproxy = current->nsproxy; id->fs = current->fs; id->fsize = rlimit(RLIMIT_FSIZE); +#ifdef CONFIG_AUDIT + id->loginuid = current->loginuid; + id->sessionid = current->sessionid; +#endif refcount_set(&id->count, 1); } @@ -1316,6 +1326,11 @@ static bool io_grab_identity(struct io_kiocb *req) get_cred(id->creds); req->work.flags |= IO_WQ_WORK_CREDS; } +#ifdef CONFIG_AUDIT + if (!uid_eq(current->loginuid, id->loginuid) || + current->sessionid != id->sessionid) + return false; +#endif if (!(req->work.flags & IO_WQ_WORK_FS) && (def->work_flags & IO_WQ_WORK_FS)) { if (current->fs != id->fs) @@ -6755,6 +6770,10 @@ static int io_sq_thread(void *data) old_cred = override_creds(ctx->creds); } io_sq_thread_associate_blkcg(ctx, &cur_css); +#ifdef CONFIG_AUDIT + current->loginuid = ctx->loginuid; + current->sessionid = ctx->sessionid; +#endif ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); @@ -9203,7 +9222,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->compat = in_compat_syscall(); ctx->user = user; ctx->creds = get_current_cred(); - +#ifdef CONFIG_AUDIT + ctx->loginuid = current->loginuid; + ctx->sessionid = current->sessionid; +#endif ctx->sqo_task = get_task_struct(current); /* diff --git a/fs/proc/base.c b/fs/proc/base.c index aa69c35d904c..0f707003dda5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1268,6 +1268,10 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, kuid_t kloginuid; int rv; + /* Don't let kthreads write their own loginuid */ + if (current->flags & PF_KTHREAD) + return -EPERM; + rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 28939820b6b0..868364cea3b7 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,6 +15,10 @@ struct io_identity { struct nsproxy *nsproxy; struct fs_struct *fs; 
unsigned long fsize; +#ifdef CONFIG_AUDIT + kuid_t loginuid; + unsigned int sessionid; +#endif refcount_t count; }; From 58852d4d673760cf7c88b9360b3c24a041bec298 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 16 Oct 2020 20:55:56 +0100 Subject: [PATCH 18/21] io_uring: fix double poll mask init __io_queue_proc() is used by both, poll reqs and apoll. Don't use req->poll.events to copy poll mask because for apoll it aliases with private data of the request. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b9ffe98f18bc..e1726f457461 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5008,6 +5008,8 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, * for write). Setup a separate io_poll_iocb if this happens. */ if (unlikely(poll->head)) { + struct io_poll_iocb *poll_one = poll; + /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { pt->error = -EINVAL; @@ -5018,7 +5020,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } - io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake); + io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); refcount_inc(&req->refs); poll->wait.private = req; *poll_ptr = poll; From 13bd691421bc191a402d2e0d3da5f248d170a632 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 17 Oct 2020 08:31:29 -0600 Subject: [PATCH 19/21] mm: mark async iocb read as NOWAIT once some data has been copied Once we've copied some data for an iocb that is marked with IOCB_WAITQ, we should no longer attempt to async lock a new page. Instead make sure we return the copied amount, and let the caller retry, instead of returning -EIOCBQUEUED for a new page. This should only be possible with read-ahead disabled on the below device, and multiple threads racing on the same file. Haven't been able to reproduce on anything else. Cc: stable@vger.kernel.org # v5.9 Fixes: 1a0a7853b901 ("mm: support async buffered reads in generic_file_buffered_read()") Reported-by: Kent Overstreet Signed-off-by: Jens Axboe --- mm/filemap.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index 1a6beaf69f49..e4101b5bfa82 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2199,6 +2199,14 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; offset = *ppos & ~PAGE_MASK; + /* + * If we've already successfully copied some data, then we + * can no longer safely return -EIOCBQUEUED. Hence mark + * an async read NOWAIT at that point. + */ + if (written && (iocb->ki_flags & IOCB_WAITQ)) + iocb->ki_flags |= IOCB_NOWAIT; + for (;;) { struct page *page; pgoff_t end_index; From 324bcf54c449c7b5b7024c9fa4549fbaaae1935d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 17 Oct 2020 09:25:52 -0600 Subject: [PATCH 20/21] mm: use limited read-ahead to satisfy read For the case where read-ahead is disabled on the file, or if the cgroup is congested, ensure that we can at least do 1 page of read-ahead to make progress on the read in an async fashion. This could potentially be larger, but it's not needed in terms of functionality, so let's error on the side of caution as larger counts of pages may run into reclaim issues (particularly if we're congested). 
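Taken on its own, the fallback reads roughly like the following standalone sketch (stand-in predicates only; ra_disabled, cgroup_congested and the printed names approximate the real ra->ra_pages, blk_cgroup_congested() and force_page_cache_ra() logic):

        #include <stdbool.h>
        #include <stdio.h>

        static bool ra_disabled, cgroup_congested, file_is_random;

        static void sync_readahead(unsigned long req_count)
        {
                bool do_forced_ra = file_is_random;

                if (ra_disabled || cgroup_congested) {
                        /* still make progress asynchronously: 1 page, forced */
                        req_count = 1;
                        do_forced_ra = true;
                }

                if (do_forced_ra)
                        printf("force_page_cache_ra(%lu pages)\n", req_count);
                else
                        printf("ondemand_readahead(%lu pages)\n", req_count);
        }

        int main(void)
        {
                sync_readahead(64);     /* normal case */
                ra_disabled = true;
                sync_readahead(64);     /* clamped to a single forced page */
                return 0;
        }
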
This makes sure we're not hitting the potentially sync ->readpage() path for IO that is marked IOCB_WAITQ, which could cause us to block. It also means we'll use the same path for IO, regardless of whether or not read-ahead happens to be disabled on the lower level device. Acked-by: Johannes Weiner Reported-by: Matthew Wilcox (Oracle) Reported-by: Hao_Xu [axboe: updated for new ractl API] Signed-off-by: Jens Axboe --- mm/readahead.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index c6ffb76827da..c5b0457415be 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -552,15 +552,23 @@ readit: void page_cache_sync_ra(struct readahead_control *ractl, struct file_ra_state *ra, unsigned long req_count) { - /* no read-ahead */ - if (!ra->ra_pages) - return; + bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); - if (blk_cgroup_congested()) - return; + /* + * Even if read-ahead is disabled, issue this request as read-ahead + * as we'll need it to satisfy the requested range. The forced + * read-ahead will do the right thing and limit the read to just the + * requested range, which we'll set to 1 page for this case. + */ + if (!ra->ra_pages || blk_cgroup_congested()) { + if (!ractl->file) + return; + req_count = 1; + do_forced_ra = true; + } /* be dumb */ - if (ractl->file && (ractl->file->f_mode & FMODE_RANDOM)) { + if (do_forced_ra) { force_page_cache_ra(ractl, ra, req_count); return; } From 9ba0d0c81284f4ec0b24529bdba2fc68b9d6a09a Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 19 Oct 2020 16:59:42 +0800 Subject: [PATCH 21/21] io_uring: use blk_queue_nowait() to check if NOWAIT supported commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") adds a new helper function blk_queue_nowait() to check if the bdev supports handling of REQ_NOWAIT or not. Since then bio-based dm device can also support REQ_NOWAIT, and currently only dm-linear supports that since commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable it for linear target"). Signed-off-by: Jeffle Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e1726f457461..0f4a9c45061d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2693,7 +2693,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) static bool io_bdev_nowait(struct block_device *bdev) { #ifdef CONFIG_BLOCK - return !bdev || queue_is_mq(bdev_get_queue(bdev)); + return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); #else return true; #endif