io_uring-5.6-2020-02-05
-----BEGIN PGP SIGNATURE-----
iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl47MicQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpplgD/4wyOfyMQ601AaiXyBmG6lx7UV7kBBaDWAb
tlDEh/EWIioejMlYJC8UslLtrlxJS8jKCVJNOAz5zB9V6McLtHNxXNY5pRr4MRrc
2ztxFHuvy8s+LyztGxBh3DA+bT5UrMR/r6uu6Guh2TatFUZr4IOvBUBb6VeP9O1Z
sECCkzWZcmIq2gNSh7Dpxr31KdMQo7xngyMhFMh3CHBnDVZN6WX4ugNBJNb71MpY
ELH3SRY2uX15dlhatO5UYuAknJOA1VvlulYVWCuBj4UPyH0AAUJQiZJVEPwldCNL
qE4cS80Q5EMAFw32cOW/oyl8Z6oFQO5nwFQ+YPPhaZscjMsRteuqnt6qYSgXHJal
ze4mUBO9Z1byc9Gex1V5SHZSLzVw3HfgznSUfZrm+Tj2UkocJSaYtS9CzXR8x7tE
tD8ev4P3EH+axm4oUSWoA4Bro9eGgkV07ok2mCnxb9rJoV0JNHzUmVSzjF4G9HGK
GosVRRS4I4/nHIZQ3KTKp6apLOAn7SPTUkxqb0/M8qbRXZqQYylWhPsL2Q8aBgvT
8pQ2sIQ5AgOmzGKqKRofxbhIh8G+6Ddz97A+Omt47zLb8ccsoatXfEli7mMjtH4P
W/aUE0O8Kstma8gZN4LUxrnqKGncDVJMolozFyt5dWc9bIpxX0SmpDdiRzqyN1fw
k9L4Ox6hxg==
=RzPL
-----END PGP SIGNATURE-----

Merge tag 'io_uring-5.6-2020-02-05' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Some later fixes for io_uring:

  - Small cleanup series from Pavel

  - Belt and suspenders build time check of sqe size and layout (Stefan)

  - Addition of ->show_fdinfo() on request of Jann Horn, to aid in
    understanding mapped personalities

  - eventfd recursion/deadlock fix, for both io_uring and aio

  - Fixup for send/recv handling

  - Fixup for double deferral of read/write request

  - Fix for potential double completion event for close request

  - Adjust fadvise advice async/inline behavior

  - Fix for shutdown hang with SQPOLL thread

  - Fix for potential use-after-free of fixed file table"

* tag 'io_uring-5.6-2020-02-05' of git://git.kernel.dk/linux-block:
  io_uring: cleanup fixed file data table references
  io_uring: spin for sq thread to idle on shutdown
  aio: prevent potential eventfd recursion on poll
  io_uring: put the flag changing code in the same spot
  io_uring: iterate req cache backwards
  io_uring: punt even fadvise() WILLNEED to async context
  io_uring: fix sporadic double CQE entry for close
  io_uring: remove extra ->file check
  io_uring: don't map read/write iovec potentially twice
  io_uring: use the proper helpers for io_send/recv
  io_uring: prevent potential eventfd recursion on poll
  eventfd: track eventfd_signal() recursion depth
  io_uring: add BUILD_BUG_ON() to assert the layout of struct io_uring_sqe
  io_uring: add ->show_fdinfo() for the io_uring file descriptor
commit c1ef57a3a3
fs/aio.c (20 changed lines)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1610,6 +1610,14 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
         return 0;
 }
 
+static void aio_poll_put_work(struct work_struct *work)
+{
+        struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+        struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+
+        iocb_put(iocb);
+}
+
 static void aio_poll_complete_work(struct work_struct *work)
 {
         struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1674,6 +1682,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
         list_del_init(&req->wait.entry);
 
         if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+                struct kioctx *ctx = iocb->ki_ctx;
+
                 /*
                  * Try to complete the iocb inline if we can. Use
                  * irqsave/irqrestore because not all filesystems (e.g. fuse)
@@ -1683,8 +1693,14 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
                 list_del(&iocb->ki_list);
                 iocb->ki_res.res = mangle_poll(mask);
                 req->done = true;
-                spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
-                iocb_put(iocb);
+                if (iocb->ki_eventfd && eventfd_signal_count()) {
+                        iocb = NULL;
+                        INIT_WORK(&req->work, aio_poll_put_work);
+                        schedule_work(&req->work);
+                }
+                spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+                if (iocb)
+                        iocb_put(iocb);
         } else {
                 schedule_work(&req->work);
         }
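The aio change above leans on a standard kernel idiom: when the final reference drop cannot safely run in the current context (here, under a potentially nested eventfd wakeup), it is bounced to process context through a work item, exactly as aio_poll_put_work() does for the iocb. Below is a minimal, generic sketch of that idiom under stated assumptions; the foo_* object and names are illustrative only and are not part of the aio code, and the object is assumed to have been kref_init()'d elsewhere.

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
        struct kref ref;                /* assumed initialized with kref_init() */
        struct work_struct put_work;
};

static void foo_release(struct kref *ref)
{
        kfree(container_of(ref, struct foo, ref));
}

/* Runs later in process context, where dropping the last ref is safe. */
static void foo_put_work(struct work_struct *work)
{
        struct foo *f = container_of(work, struct foo, put_work);

        kref_put(&f->ref, foo_release);
}

/* Called from a context where the final put must not run inline. */
static void foo_put_deferred(struct foo *f)
{
        INIT_WORK(&f->put_work, foo_put_work);
        schedule_work(&f->put_work);
}

The design point is the same one the aio patch makes: the cost of an occasional workqueue bounce is paid only on the recursion path, while the common case still completes inline.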
fs/eventfd.c (15 changed lines)
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -24,6 +24,8 @@
 #include <linux/seq_file.h>
 #include <linux/idr.h>
 
+DEFINE_PER_CPU(int, eventfd_wake_count);
+
 static DEFINE_IDA(eventfd_ida);
 
 struct eventfd_ctx {
@@ -60,12 +62,25 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 {
         unsigned long flags;
 
+        /*
+         * Deadlock or stack overflow issues can happen if we recurse here
+         * through waitqueue wakeup handlers. If the caller users potentially
+         * nested waitqueues with custom wakeup handlers, then it should
+         * check eventfd_signal_count() before calling this function. If
+         * it returns true, the eventfd_signal() call should be deferred to a
+         * safe context.
+         */
+        if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+                return 0;
+
         spin_lock_irqsave(&ctx->wqh.lock, flags);
+        this_cpu_inc(eventfd_wake_count);
         if (ULLONG_MAX - ctx->count < n)
                 n = ULLONG_MAX - ctx->count;
         ctx->count += n;
         if (waitqueue_active(&ctx->wqh))
                 wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+        this_cpu_dec(eventfd_wake_count);
         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
         return n;
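The new per-CPU eventfd_wake_count only detects recursion; callers that sit on potentially nested waitqueues are expected to check eventfd_signal_count() themselves and defer the signal when it returns true, which is what the aio and io_uring hunks in this merge do. The following is a hedged sketch of that caller contract; everything other than eventfd_signal() and eventfd_signal_count() is an illustrative name, not existing kernel code.

#include <linux/eventfd.h>
#include <linux/workqueue.h>

struct my_completion {
        struct eventfd_ctx *evfd_ctx;           /* may be NULL */
        struct work_struct signal_work;
};

/* Runs from the workqueue, where eventfd_signal() cannot recurse. */
static void my_signal_work(struct work_struct *work)
{
        struct my_completion *c = container_of(work, struct my_completion,
                                               signal_work);

        eventfd_signal(c->evfd_ctx, 1);
}

/*
 * Called from a wakeup path that may itself be running under an eventfd
 * wakeup. eventfd_signal_count() is the per-CPU recursion check added by
 * this series.
 */
static void my_complete(struct my_completion *c)
{
        if (!c->evfd_ctx)
                return;

        if (eventfd_signal_count()) {
                /* would recurse or deadlock - defer to safe context */
                INIT_WORK(&c->signal_work, my_signal_work);
                schedule_work(&c->signal_work);
        } else {
                eventfd_signal(c->evfd_ctx, 1);
        }
}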
fs/io_uring.c (254 changed lines)
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -585,8 +585,7 @@ struct io_submit_state {
          * io_kiocb alloc cache
          */
         void *reqs[IO_IOPOLL_BATCH];
         unsigned int free_reqs;
-        unsigned int cur_req;
 
         /*
          * File reference cache
@@ -754,6 +753,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                  struct io_uring_files_update *ip,
                                  unsigned nr_args);
 static int io_grab_files(struct io_kiocb *req);
+static void io_ring_file_ref_flush(struct fixed_file_data *data);
 
 static struct kmem_cache *req_cachep;
 
@@ -1020,21 +1020,28 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 
 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
 {
+        if (!ctx->cq_ev_fd)
+                return false;
         if (!ctx->eventfd_async)
                 return true;
         return io_wq_current_is_worker() || in_interrupt();
 }
 
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev)
 {
         if (waitqueue_active(&ctx->wait))
                 wake_up(&ctx->wait);
         if (waitqueue_active(&ctx->sqo_wait))
                 wake_up(&ctx->sqo_wait);
-        if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx))
+        if (trigger_ev)
                 eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+        __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx));
+}
+
 /* Returns true if there are no backlogged entries after the flush */
 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
@@ -1183,12 +1190,10 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
                         ret = 1;
                 }
                 state->free_reqs = ret - 1;
-                state->cur_req = 1;
-                req = state->reqs[0];
+                req = state->reqs[ret - 1];
         } else {
-                req = state->reqs[state->cur_req];
                 state->free_reqs--;
-                state->cur_req++;
+                req = state->reqs[state->free_reqs];
         }
 
 got_it:
@@ -1855,9 +1860,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
         unsigned ioprio;
         int ret;
 
-        if (!req->file)
-                return -EBADF;
-
         if (S_ISREG(file_inode(req->file)->i_mode))
                 req->flags |= REQ_F_ISREG;
 
@@ -1866,8 +1868,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                 req->flags |= REQ_F_CUR_POS;
                 kiocb->ki_pos = req->file->f_pos;
         }
-        kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
+        kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+        if (unlikely(ret))
+                return ret;
 
         ioprio = READ_ONCE(sqe->ioprio);
         if (ioprio) {
@@ -1879,10 +1884,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
         } else
                 kiocb->ki_ioprio = get_current_ioprio();
 
-        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
-        if (unlikely(ret))
-                return ret;
-
         /* don't allow async punt if RWF_NOWAIT was requested */
         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
             (req->file->f_flags & O_NONBLOCK))
@@ -2164,10 +2165,12 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
 {
         if (!io_op_defs[req->opcode].async_ctx)
                 return 0;
-        if (!req->io && io_alloc_async_ctx(req))
-                return -ENOMEM;
+        if (!req->io) {
+                if (io_alloc_async_ctx(req))
+                        return -ENOMEM;
 
-        io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+                io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+        }
         req->work.func = io_rw_async;
         return 0;
 }
@@ -2724,9 +2727,16 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
         struct io_fadvise *fa = &req->fadvise;
         int ret;
 
-        /* DONTNEED may block, others _should_ not */
-        if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
-                return -EAGAIN;
+        if (force_nonblock) {
+                switch (fa->advice) {
+                case POSIX_FADV_NORMAL:
+                case POSIX_FADV_RANDOM:
+                case POSIX_FADV_SEQUENTIAL:
+                        break;
+                default:
+                        return -EAGAIN;
+                }
+        }
 
         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
         if (ret < 0)
@@ -2837,16 +2847,13 @@ static void io_close_finish(struct io_wq_work **workptr)
                 int ret;
 
                 ret = filp_close(req->close.put_file, req->work.files);
-                if (ret < 0) {
+                if (ret < 0)
                         req_set_fail_links(req);
-                }
                 io_cqring_add_event(req, ret);
         }
 
         fput(req->close.put_file);
 
-        /* we bypassed the re-issue, drop the submission reference */
-        io_put_req(req);
         io_put_req_find_next(req, &nxt);
         if (nxt)
                 io_wq_assign_next(workptr, nxt);
@@ -2888,7 +2895,13 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
 
 eagain:
         req->work.func = io_close_finish;
-        return -EAGAIN;
+        /*
+         * Do manual async queue here to avoid grabbing files - we don't
+         * need the files, and it'll cause io_close_finish() to close
+         * the file again and cause a double CQE entry for this request
+         */
+        io_queue_async_work(req);
+        return 0;
 }
 
 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -3083,7 +3096,8 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
                 else if (force_nonblock)
                         flags |= MSG_DONTWAIT;
 
-                ret = __sys_sendmsg_sock(sock, &msg, flags);
+                msg.msg_flags = flags;
+                ret = sock_sendmsg(sock, &msg);
                 if (force_nonblock && ret == -EAGAIN)
                         return -EAGAIN;
                 if (ret == -ERESTARTSYS)
@@ -3109,6 +3123,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 
         sr->msg_flags = READ_ONCE(sqe->msg_flags);
         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+        sr->len = READ_ONCE(sqe->len);
 
         if (!io || req->opcode == IORING_OP_RECV)
                 return 0;
@@ -3227,7 +3242,7 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
                 else if (force_nonblock)
                         flags |= MSG_DONTWAIT;
 
-                ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags);
+                ret = sock_recvmsg(sock, &msg, flags);
                 if (force_nonblock && ret == -EAGAIN)
                         return -EAGAIN;
                 if (ret == -ERESTARTSYS)
@@ -3561,6 +3576,14 @@ static void io_poll_flush(struct io_wq_work **workptr)
                 __io_poll_flush(req->ctx, nodes);
 }
 
+static void io_poll_trigger_evfd(struct io_wq_work **workptr)
+{
+        struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+
+        eventfd_signal(req->ctx->cq_ev_fd, 1);
+        io_put_req(req);
+}
+
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
                         void *key)
 {
@@ -3586,14 +3609,22 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 
         if (llist_empty(&ctx->poll_llist) &&
             spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+                bool trigger_ev;
+
                 hash_del(&req->hash_node);
                 io_poll_complete(req, mask, 0);
-                req->flags |= REQ_F_COMP_LOCKED;
-                io_put_req(req);
-                spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-                io_cqring_ev_posted(ctx);
-                req = NULL;
+                trigger_ev = io_should_trigger_evfd(ctx);
+                if (trigger_ev && eventfd_signal_count()) {
+                        trigger_ev = false;
+                        req->work.func = io_poll_trigger_evfd;
+                } else {
+                        req->flags |= REQ_F_COMP_LOCKED;
+                        io_put_req(req);
+                        req = NULL;
+                }
+                spin_unlock_irqrestore(&ctx->completion_lock, flags);
+                __io_cqring_ev_posted(ctx, trigger_ev);
         } else {
                 req->result = mask;
                 req->llist_node.next = NULL;
@@ -4815,8 +4846,7 @@ static void io_submit_state_end(struct io_submit_state *state)
         blk_finish_plug(&state->plug);
         io_file_put(state);
         if (state->free_reqs)
-                kmem_cache_free_bulk(req_cachep, state->free_reqs,
-                                        &state->reqs[state->cur_req]);
+                kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
 }
 
 /*
@@ -5041,7 +5071,8 @@ static int io_sq_thread(void *data)
                  * reap events and wake us up.
                  */
                 if (inflight ||
-                    (!time_after(jiffies, timeout) && ret != -EBUSY)) {
+                    (!time_after(jiffies, timeout) && ret != -EBUSY &&
+                    !percpu_ref_is_dying(&ctx->refs))) {
                         cond_resched();
                         continue;
                 }
@@ -5231,15 +5262,10 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
         if (!data)
                 return -ENXIO;
 
-        /* protect against inflight atomic switch, which drops the ref */
-        percpu_ref_get(&data->refs);
-        /* wait for existing switches */
-        flush_work(&data->ref_work);
         percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
-        wait_for_completion(&data->done);
-        percpu_ref_put(&data->refs);
-        /* flush potential new switch */
         flush_work(&data->ref_work);
+        wait_for_completion(&data->done);
+        io_ring_file_ref_flush(data);
         percpu_ref_exit(&data->refs);
 
         __io_sqe_files_unregister(ctx);
@@ -5477,14 +5503,11 @@ struct io_file_put {
         struct completion *done;
 };
 
-static void io_ring_file_ref_switch(struct work_struct *work)
+static void io_ring_file_ref_flush(struct fixed_file_data *data)
 {
         struct io_file_put *pfile, *tmp;
-        struct fixed_file_data *data;
         struct llist_node *node;
 
-        data = container_of(work, struct fixed_file_data, ref_work);
-
         while ((node = llist_del_all(&data->put_llist)) != NULL) {
                 llist_for_each_entry_safe(pfile, tmp, node, llist) {
                         io_ring_file_put(data->ctx, pfile->file);
@@ -5494,7 +5517,14 @@ static void io_ring_file_ref_switch(struct work_struct *work)
                         kfree(pfile);
                 }
         }
+}
 
+static void io_ring_file_ref_switch(struct work_struct *work)
+{
+        struct fixed_file_data *data;
+
+        data = container_of(work, struct fixed_file_data, ref_work);
+        io_ring_file_ref_flush(data);
         percpu_ref_get(&data->refs);
         percpu_ref_switch_to_percpu(&data->refs);
 }
@@ -5505,8 +5535,14 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
 
         data = container_of(ref, struct fixed_file_data, refs);
 
-        /* we can't safely switch from inside this context, punt to wq */
-        queue_work(system_wq, &data->ref_work);
+        /*
+         * We can't safely switch from inside this context, punt to wq. If
+         * the table ref is going away, the table is being unregistered.
+         * Don't queue up the async work for that case, the caller will
+         * handle it.
+         */
+        if (!percpu_ref_is_dying(&data->refs))
+                queue_work(system_wq, &data->ref_work);
 }
 
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -6295,6 +6331,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
         percpu_ref_kill(&ctx->refs);
         mutex_unlock(&ctx->uring_lock);
 
+        /*
+         * Wait for sq thread to idle, if we have one. It won't spin on new
+         * work after we've killed the ctx ref above. This is important to do
+         * before we cancel existing commands, as the thread could otherwise
+         * be queueing new work post that. If that's work we need to cancel,
+         * it could cause shutdown to hang.
+         */
+        while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
+                cpu_relax();
+
         io_kill_timeouts(ctx);
         io_poll_remove_all(ctx);
 
@@ -6501,6 +6547,80 @@ out_fput:
         return submitted ? submitted : ret;
 }
 
+static int io_uring_show_cred(int id, void *p, void *data)
+{
+        const struct cred *cred = p;
+        struct seq_file *m = data;
+        struct user_namespace *uns = seq_user_ns(m);
+        struct group_info *gi;
+        kernel_cap_t cap;
+        unsigned __capi;
+        int g;
+
+        seq_printf(m, "%5d\n", id);
+        seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
+        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
+        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
+        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
+        seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
+        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
+        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
+        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
+        seq_puts(m, "\n\tGroups:\t");
+        gi = cred->group_info;
+        for (g = 0; g < gi->ngroups; g++) {
+                seq_put_decimal_ull(m, g ? " " : "",
+                                        from_kgid_munged(uns, gi->gid[g]));
+        }
+        seq_puts(m, "\n\tCapEff:\t");
+        cap = cred->cap_effective;
+        CAP_FOR_EACH_U32(__capi)
+                seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
+        seq_putc(m, '\n');
+        return 0;
+}
+
+static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+{
+        int i;
+
+        mutex_lock(&ctx->uring_lock);
+        seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
+        for (i = 0; i < ctx->nr_user_files; i++) {
+                struct fixed_file_table *table;
+                struct file *f;
+
+                table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
+                f = table->files[i & IORING_FILE_TABLE_MASK];
+                if (f)
+                        seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
+                else
+                        seq_printf(m, "%5u: <none>\n", i);
+        }
+        seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
+        for (i = 0; i < ctx->nr_user_bufs; i++) {
+                struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
+
+                seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
+                                        (unsigned int) buf->len);
+        }
+        if (!idr_is_empty(&ctx->personality_idr)) {
+                seq_printf(m, "Personalities:\n");
+                idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
+        }
+        mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+{
+        struct io_ring_ctx *ctx = f->private_data;
+
+        if (percpu_ref_tryget(&ctx->refs)) {
+                __io_uring_show_fdinfo(ctx, m);
+                percpu_ref_put(&ctx->refs);
+        }
+}
+
 static const struct file_operations io_uring_fops = {
         .release        = io_uring_release,
         .flush          = io_uring_flush,
@@ -6511,6 +6631,7 @@ static const struct file_operations io_uring_fops = {
 #endif
         .poll           = io_uring_poll,
         .fasync         = io_uring_fasync,
+        .show_fdinfo    = io_uring_show_fdinfo,
 };
 
 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
@@ -6963,6 +7084,39 @@ out_fput:
 
 static int __init io_uring_init(void)
 {
+#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
+        BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
+        BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
+} while (0)
+
+#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
+        __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
+        BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
+        BUILD_BUG_SQE_ELEM(0, __u8, opcode);
+        BUILD_BUG_SQE_ELEM(1, __u8, flags);
+        BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
+        BUILD_BUG_SQE_ELEM(4, __s32, fd);
+        BUILD_BUG_SQE_ELEM(8, __u64, off);
+        BUILD_BUG_SQE_ELEM(8, __u64, addr2);
+        BUILD_BUG_SQE_ELEM(16, __u64, addr);
+        BUILD_BUG_SQE_ELEM(24, __u32, len);
+        BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
+        BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
+        BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
+        BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
+        BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
+        BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
+        BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
+        BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
+        BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
+        BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
+        BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
+        BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
+        BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
+        BUILD_BUG_SQE_ELEM(32, __u64, user_data);
+        BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
+        BUILD_BUG_SQE_ELEM(42, __u16, personality);
+
         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
         return 0;
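The BUILD_BUG_ON()/BUILD_BUG_SQE_ELEM() block above pins the io_uring_sqe layout at compile time. Outside the kernel the same belt-and-suspenders technique is available through C11 static assertions; the sketch below applies it to a hypothetical fixed-layout struct. The wire_record struct, its field offsets, and the CHECK_FIELD macro are made up for illustration and have nothing to do with the real SQE definition.

#include <assert.h>   /* static_assert (C11) */
#include <stddef.h>   /* offsetof */
#include <stdint.h>

/* A hypothetical on-the-wire record whose layout must never drift. */
struct wire_record {
        uint8_t  opcode;        /* expected at offset 0 */
        uint8_t  flags;         /* expected at offset 1 */
        uint16_t ioprio;        /* expected at offset 2 */
        int32_t  fd;            /* expected at offset 4 */
        uint64_t off;           /* expected at offset 8 */
};

/* Fails the build if a field moves or changes size. */
#define CHECK_FIELD(type, field, expected_off, expected_type)              \
        static_assert(offsetof(type, field) == (expected_off),             \
                      #field " moved");                                    \
        static_assert(sizeof(((type *)0)->field) == sizeof(expected_type), \
                      #field " resized")

CHECK_FIELD(struct wire_record, opcode, 0, uint8_t);
CHECK_FIELD(struct wire_record, flags,  1, uint8_t);
CHECK_FIELD(struct wire_record, ioprio, 2, uint16_t);
CHECK_FIELD(struct wire_record, fd,     4, int32_t);
CHECK_FIELD(struct wire_record, off,    8, uint64_t);
static_assert(sizeof(struct wire_record) == 16, "record size changed");

int main(void)
{
        return 0;       /* all checks happen at compile time */
}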
include/linux/eventfd.h
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -12,6 +12,8 @@
 #include <linux/fcntl.h>
 #include <linux/wait.h>
 #include <linux/err.h>
+#include <linux/percpu-defs.h>
+#include <linux/percpu.h>
 
 /*
  * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
@@ -40,6 +42,13 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
                                   __u64 *cnt);
 
+DECLARE_PER_CPU(int, eventfd_wake_count);
+
+static inline bool eventfd_signal_count(void)
+{
+        return this_cpu_read(eventfd_wake_count);
+}
+
 #else /* CONFIG_EVENTFD */
 
 /*
@@ -68,6 +77,11 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
 	return -ENOSYS;
 }
 
+static inline bool eventfd_signal_count(void)
+{
+        return false;
+}
+
 #endif
 
 #endif /* _LINUX_EVENTFD_H */
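With ->show_fdinfo() wired up, the registered files, buffers, and personalities printed by __io_uring_show_fdinfo() above become readable through /proc/<pid>/fdinfo/<fd>. The small userspace sketch below dumps that text for any descriptor owned by the calling process; pointed at an io_uring fd it should show the new UserFiles/UserBufs/Personalities lines, while for any other fd it prints the generic fdinfo fields. The program assumes nothing io_uring-specific.

/* cc -o fdinfo-dump fdinfo-dump.c && ./fdinfo-dump <fd-number> */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
        char path[64], line[256];
        FILE *f;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <fd>\n", argv[0]);
                return 1;
        }

        /* /proc/self/fdinfo/<fd> is rendered by the fd's ->show_fdinfo() */
        snprintf(path, sizeof(path), "/proc/self/fdinfo/%s", argv[1]);
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }

        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);

        fclose(f);
        return 0;
}

To inspect a ring owned by another process, read /proc/<pid>/fdinfo/<fd> instead (subject to the usual procfs permission checks).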