mirror of
https://github.com/torvalds/linux.git
synced 2024-11-24 05:02:12 +00:00
for-6.12/io_uring-20240913
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmbkST4QHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpnU7D/47BmxQmTbsT9NFBeZrQVgmQ2Zap2WWx3Za 4qGuU1VxcafztqWnRChtxznheVG9ioHglcxfbZjc/D4/BiffgF4n5Z48qh1c0t8O +2pwq75j0WyJkHH9wCrrN9Jq8zSB6pBr2sMEQmSilMgYZKMzhXrXevKkYnthj/1a 7U9QzY+lfc8neZRHR7VDouPWIRjBhwaO62ANXWCL7F2uE6NQasU61x6YTzGuoDB3 0gR5PbSiLIusGxsYqIVmQUPNBUOw8nOzXXcbw8kBlRdnpadns8rNk+ivIMtAYw0m s6xVWNWFToVxO8956rBnjicD6ZzF5Txe6gWC6gvhKMFkOyxkihgMCOZUpSmw6D8G YlDHB4+lijpQMyPDw1UUPOYPVGSVRp/f2MuRcEhW/Yums5vd9eOVrUVsFjfYRQLr fg+lp3rEMoHxBnuKneMY2inuZW99+LGyO8F4IVublwXoXKFcq3TdGCvn5OZUBGDn E5x4QGq+cf9icK4kqN5mVi256fhOLnqDTtzIg4qiwhZ5h9UA3CFjGc56G7wqgp8d Bu5scCkJR5tXJEZA1hce+w2bXzrM6Xd2gym5A6D6k8S3QheHkKva60/qfIzhs/x0 6nlJYSlznyQbDOBDQIJC86OE4tcShNusjFIgIDg6ZvAX2qk7BBmbPNF4RGrI9TTM xz2dONRhlA== =ZNjL -----END PGP SIGNATURE----- Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux Pull io_uring updates from Jens Axboe: - NAPI fixes and cleanups (Pavel, Olivier) - Add support for absolute timeouts (Pavel) - Fixes for io-wq/sqpoll affinities (Felix) - Efficiency improvements for dealing with huge pages (Chenliang) - Support for a minwait mode, where the application essentially has two timouts - one smaller one that defines the batch timeout, and the overall large one similar to what we had before. This enables efficient use of batching based on count + timeout, while still working well with periods of less intensive workloads - Use ITER_UBUF for single segment sends - Add support for incremental buffer consumption. Right now each operation will always consume a full buffer. With incremental consumption, a recv/read operation only consumes the part of the buffer that it needs to satisfy the operation - Add support for GCOV for io_uring, to help retain a high coverage of test to code ratio - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly converted to a blocking retry - Add support for cloning registered buffers from one ring to another - Misc cleanups (Anuj, me) * tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits) io_uring: add IORING_REGISTER_COPY_BUFFERS method io_uring/register: provide helper to get io_ring_ctx from 'fd' io_uring/rsrc: add reference count to struct io_mapped_ubuf io_uring/rsrc: clear 'slot' entry upfront io_uring/io-wq: inherit cpuset of cgroup in io worker io_uring/io-wq: do not allow pinning outside of cpuset io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common() io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN io_uring/sqpoll: do not allow pinning outside of cpuset io_uring/eventfd: move refs to refcount_t io_uring: remove unused rsrc_put_fn io_uring: add new line after variable declaration io_uring: add GCOV_PROFILE_URING Kconfig option io_uring/kbuf: add support for incremental buffer consumption io_uring/kbuf: pass in 'len' argument for buffer commit Revert "io_uring: Require zeroed sqe->len on provided-buffers send" io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h io_uring/kbuf: add io_kbuf_commit() helper io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg io_uring: wire up min batch wake timeout ...
This commit is contained in:
commit
3a4d319a8f
@ -239,6 +239,9 @@ struct io_ring_ctx {
|
||||
struct io_rings *rings;
|
||||
struct percpu_ref refs;
|
||||
|
||||
clockid_t clockid;
|
||||
enum tk_offsets clock_offset;
|
||||
|
||||
enum task_work_notify_mode notify_method;
|
||||
unsigned sq_thread_idle;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
@ -440,11 +440,21 @@ struct io_uring_cqe {
|
||||
* IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
|
||||
* IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct
|
||||
* them from sends.
|
||||
* IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
|
||||
* more completions. In other words, the buffer is being
|
||||
* partially consumed, and will be used by the kernel for
|
||||
* more completions. This is only set for buffers used via
|
||||
* the incremental buffer consumption, as provided by
|
||||
* a ring buffer setup with IOU_PBUF_RING_INC. For any
|
||||
* other provided buffer type, all completions with a
|
||||
* buffer passed back is automatically returned to the
|
||||
* application.
|
||||
*/
|
||||
#define IORING_CQE_F_BUFFER (1U << 0)
|
||||
#define IORING_CQE_F_MORE (1U << 1)
|
||||
#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
|
||||
#define IORING_CQE_F_NOTIF (1U << 3)
|
||||
#define IORING_CQE_F_BUF_MORE (1U << 4)
|
||||
|
||||
#define IORING_CQE_BUFFER_SHIFT 16
|
||||
|
||||
@ -507,6 +517,7 @@ struct io_cqring_offsets {
|
||||
#define IORING_ENTER_SQ_WAIT (1U << 2)
|
||||
#define IORING_ENTER_EXT_ARG (1U << 3)
|
||||
#define IORING_ENTER_REGISTERED_RING (1U << 4)
|
||||
#define IORING_ENTER_ABS_TIMER (1U << 5)
|
||||
|
||||
/*
|
||||
* Passed in for io_uring_setup(2). Copied back with updated info on success
|
||||
@ -542,6 +553,7 @@ struct io_uring_params {
|
||||
#define IORING_FEAT_LINKED_FILE (1U << 12)
|
||||
#define IORING_FEAT_REG_REG_RING (1U << 13)
|
||||
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
|
||||
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
|
||||
|
||||
/*
|
||||
* io_uring_register(2) opcodes and arguments
|
||||
@ -595,6 +607,11 @@ enum io_uring_register_op {
|
||||
IORING_REGISTER_NAPI = 27,
|
||||
IORING_UNREGISTER_NAPI = 28,
|
||||
|
||||
IORING_REGISTER_CLOCK = 29,
|
||||
|
||||
/* copy registered buffers from source ring to current ring */
|
||||
IORING_REGISTER_COPY_BUFFERS = 30,
|
||||
|
||||
/* this goes last */
|
||||
IORING_REGISTER_LAST,
|
||||
|
||||
@ -675,6 +692,21 @@ struct io_uring_restriction {
|
||||
__u32 resv2[3];
|
||||
};
|
||||
|
||||
struct io_uring_clock_register {
|
||||
__u32 clockid;
|
||||
__u32 __resv[3];
|
||||
};
|
||||
|
||||
enum {
|
||||
IORING_REGISTER_SRC_REGISTERED = 1,
|
||||
};
|
||||
|
||||
struct io_uring_copy_buffers {
|
||||
__u32 src_fd;
|
||||
__u32 flags;
|
||||
__u32 pad[6];
|
||||
};
|
||||
|
||||
struct io_uring_buf {
|
||||
__u64 addr;
|
||||
__u32 len;
|
||||
@ -707,9 +739,17 @@ struct io_uring_buf_ring {
|
||||
* mmap(2) with the offset set as:
|
||||
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
|
||||
* to get a virtual mapping for the ring.
|
||||
* IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be
|
||||
* consumed incrementally. Normally one (or more) buffers
|
||||
* are fully consumed. With incremental consumptions, it's
|
||||
* feasible to register big ranges of buffers, and each
|
||||
* use of it will consume only as much as it needs. This
|
||||
* requires that both the kernel and application keep
|
||||
* track of where the current read/recv index is at.
|
||||
*/
|
||||
enum io_uring_register_pbuf_ring_flags {
|
||||
IOU_PBUF_RING_MMAP = 1,
|
||||
IOU_PBUF_RING_INC = 2,
|
||||
};
|
||||
|
||||
/* argument for IORING_(UN)REGISTER_PBUF_RING */
|
||||
@ -758,7 +798,7 @@ enum io_uring_register_restriction_op {
|
||||
struct io_uring_getevents_arg {
|
||||
__u64 sigmask;
|
||||
__u32 sigmask_sz;
|
||||
__u32 pad;
|
||||
__u32 min_wait_usec;
|
||||
__u64 ts;
|
||||
};
|
||||
|
||||
|
13
init/Kconfig
13
init/Kconfig
@ -1687,6 +1687,19 @@ config IO_URING
|
||||
applications to submit and complete IO through submission and
|
||||
completion rings that are shared between the kernel and application.
|
||||
|
||||
config GCOV_PROFILE_URING
|
||||
bool "Enable GCOV profiling on the io_uring subsystem"
|
||||
depends on GCOV_KERNEL
|
||||
help
|
||||
Enable GCOV profiling on the io_uring subsystem, to facilitate
|
||||
code coverage testing.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
Note that this will have a negative impact on the performance of
|
||||
the io_uring subsystem, hence this should only be enabled for
|
||||
specific test purposes.
|
||||
|
||||
config ADVISE_SYSCALLS
|
||||
bool "Enable madvise/fadvise syscalls" if EXPERT
|
||||
default y
|
||||
|
@ -2,6 +2,10 @@
|
||||
#
|
||||
# Makefile for io_uring
|
||||
|
||||
ifdef CONFIG_GCOV_PROFILE_URING
|
||||
GCOV_PROFILE := y
|
||||
endif
|
||||
|
||||
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
|
||||
tctx.o filetable.o rw.o net.o poll.o \
|
||||
eventfd.o uring_cmd.o openclose.o \
|
||||
|
@ -15,7 +15,7 @@ struct io_ev_fd {
|
||||
struct eventfd_ctx *cq_ev_fd;
|
||||
unsigned int eventfd_async: 1;
|
||||
struct rcu_head rcu;
|
||||
atomic_t refs;
|
||||
refcount_t refs;
|
||||
atomic_t ops;
|
||||
};
|
||||
|
||||
@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
|
||||
|
||||
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
|
||||
|
||||
if (atomic_dec_and_test(&ev_fd->refs))
|
||||
if (refcount_dec_and_test(&ev_fd->refs))
|
||||
io_eventfd_free(rcu);
|
||||
}
|
||||
|
||||
@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
|
||||
*/
|
||||
if (unlikely(!ev_fd))
|
||||
return;
|
||||
if (!atomic_inc_not_zero(&ev_fd->refs))
|
||||
if (!refcount_inc_not_zero(&ev_fd->refs))
|
||||
return;
|
||||
if (ev_fd->eventfd_async && !io_wq_current_is_worker())
|
||||
goto out;
|
||||
@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
|
||||
}
|
||||
}
|
||||
out:
|
||||
if (atomic_dec_and_test(&ev_fd->refs))
|
||||
if (refcount_dec_and_test(&ev_fd->refs))
|
||||
call_rcu(&ev_fd->rcu, io_eventfd_free);
|
||||
}
|
||||
|
||||
@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
|
||||
if (IS_ERR(ev_fd->cq_ev_fd)) {
|
||||
int ret = PTR_ERR(ev_fd->cq_ev_fd);
|
||||
|
||||
kfree(ev_fd);
|
||||
return ret;
|
||||
}
|
||||
@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
|
||||
ev_fd->eventfd_async = eventfd_async;
|
||||
ctx->has_evfd = true;
|
||||
atomic_set(&ev_fd->refs, 1);
|
||||
refcount_set(&ev_fd->refs, 1);
|
||||
atomic_set(&ev_fd->ops, 0);
|
||||
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
|
||||
return 0;
|
||||
@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
|
||||
if (ev_fd) {
|
||||
ctx->has_evfd = false;
|
||||
rcu_assign_pointer(ctx->io_ev_fd, NULL);
|
||||
if (atomic_dec_and_test(&ev_fd->refs))
|
||||
if (refcount_dec_and_test(&ev_fd->refs))
|
||||
call_rcu(&ev_fd->rcu, io_eventfd_free);
|
||||
return 0;
|
||||
}
|
||||
|
@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
|
||||
cqe->user_data, cqe->res, cqe->flags);
|
||||
|
||||
}
|
||||
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
|
||||
#ifdef CONFIG_NET_RX_BUSY_POLL
|
||||
if (ctx->napi_enabled) {
|
||||
seq_puts(m, "NAPI:\tenabled\n");
|
||||
seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
|
||||
if (ctx->napi_prefer_busy_poll)
|
||||
seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
|
||||
else
|
||||
seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
|
||||
} else {
|
||||
seq_puts(m, "NAPI:\tdisabled\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/rculist_nulls.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/task_work.h>
|
||||
#include <linux/audit.h>
|
||||
#include <linux/mmu_context.h>
|
||||
@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
||||
|
||||
if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
|
||||
goto err;
|
||||
cpumask_copy(wq->cpu_mask, cpu_possible_mask);
|
||||
cpuset_cpus_allowed(data->task, wq->cpu_mask);
|
||||
wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
|
||||
wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
|
||||
task_rlimit(current, RLIMIT_NPROC);
|
||||
@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
|
||||
|
||||
int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
|
||||
{
|
||||
cpumask_var_t allowed_mask;
|
||||
int ret = 0;
|
||||
|
||||
if (!tctx || !tctx->io_wq)
|
||||
return -EINVAL;
|
||||
|
||||
if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
|
||||
return -ENOMEM;
|
||||
|
||||
rcu_read_lock();
|
||||
if (mask)
|
||||
cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask);
|
||||
if (mask) {
|
||||
if (cpumask_subset(mask, allowed_mask))
|
||||
cpumask_copy(tctx->io_wq->cpu_mask, mask);
|
||||
else
|
||||
cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
|
||||
ret = -EINVAL;
|
||||
} else {
|
||||
cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return 0;
|
||||
free_cpumask_var(allowed_mask);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -904,7 +904,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
|
||||
lockdep_assert_held(&req->ctx->uring_lock);
|
||||
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
|
||||
io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED));
|
||||
if (def->fail)
|
||||
def->fail(req);
|
||||
io_req_complete_defer(req);
|
||||
@ -2350,12 +2350,113 @@ static bool current_pending_io(void)
|
||||
return percpu_counter_read_positive(&tctx->inflight);
|
||||
}
|
||||
|
||||
/* when returns >0, the caller should retry */
|
||||
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
|
||||
struct io_wait_queue *iowq)
|
||||
static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
|
||||
{
|
||||
int ret;
|
||||
struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
|
||||
|
||||
WRITE_ONCE(iowq->hit_timeout, 1);
|
||||
iowq->min_timeout = 0;
|
||||
wake_up_process(iowq->wq.private);
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
/*
|
||||
* Doing min_timeout portion. If we saw any timeouts, events, or have work,
|
||||
* wake up. If not, and we have a normal timeout, switch to that and keep
|
||||
* sleeping.
|
||||
*/
|
||||
static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
|
||||
{
|
||||
struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
|
||||
struct io_ring_ctx *ctx = iowq->ctx;
|
||||
|
||||
/* no general timeout, or shorter (or equal), we are done */
|
||||
if (iowq->timeout == KTIME_MAX ||
|
||||
ktime_compare(iowq->min_timeout, iowq->timeout) >= 0)
|
||||
goto out_wake;
|
||||
/* work we may need to run, wake function will see if we need to wake */
|
||||
if (io_has_work(ctx))
|
||||
goto out_wake;
|
||||
/* got events since we started waiting, min timeout is done */
|
||||
if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
|
||||
goto out_wake;
|
||||
/* if we have any events and min timeout expired, we're done */
|
||||
if (io_cqring_events(ctx))
|
||||
goto out_wake;
|
||||
|
||||
/*
|
||||
* If using deferred task_work running and application is waiting on
|
||||
* more than one request, ensure we reset it now where we are switching
|
||||
* to normal sleeps. Any request completion post min_wait should wake
|
||||
* the task and return.
|
||||
*/
|
||||
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
|
||||
atomic_set(&ctx->cq_wait_nr, 1);
|
||||
smp_mb();
|
||||
if (!llist_empty(&ctx->work_llist))
|
||||
goto out_wake;
|
||||
}
|
||||
|
||||
iowq->t.function = io_cqring_timer_wakeup;
|
||||
hrtimer_set_expires(timer, iowq->timeout);
|
||||
return HRTIMER_RESTART;
|
||||
out_wake:
|
||||
return io_cqring_timer_wakeup(timer);
|
||||
}
|
||||
|
||||
static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
|
||||
clockid_t clock_id, ktime_t start_time)
|
||||
{
|
||||
ktime_t timeout;
|
||||
|
||||
hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
|
||||
if (iowq->min_timeout) {
|
||||
timeout = ktime_add_ns(iowq->min_timeout, start_time);
|
||||
iowq->t.function = io_cqring_min_timer_wakeup;
|
||||
} else {
|
||||
timeout = iowq->timeout;
|
||||
iowq->t.function = io_cqring_timer_wakeup;
|
||||
}
|
||||
|
||||
hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
|
||||
hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
|
||||
|
||||
if (!READ_ONCE(iowq->hit_timeout))
|
||||
schedule();
|
||||
|
||||
hrtimer_cancel(&iowq->t);
|
||||
destroy_hrtimer_on_stack(&iowq->t);
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
|
||||
}
|
||||
|
||||
static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
|
||||
struct io_wait_queue *iowq,
|
||||
ktime_t start_time)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* Mark us as being in io_wait if we have pending requests, so cpufreq
|
||||
* can take into account that the task is waiting for IO - turns out
|
||||
* to be important for low QD IO.
|
||||
*/
|
||||
if (current_pending_io())
|
||||
current->in_iowait = 1;
|
||||
if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
|
||||
ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
|
||||
else
|
||||
schedule();
|
||||
current->in_iowait = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* If this returns > 0, the caller should retry */
|
||||
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
|
||||
struct io_wait_queue *iowq,
|
||||
ktime_t start_time)
|
||||
{
|
||||
if (unlikely(READ_ONCE(ctx->check_cq)))
|
||||
return 1;
|
||||
if (unlikely(!llist_empty(&ctx->work_llist)))
|
||||
@ -2367,32 +2468,26 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
|
||||
if (unlikely(io_should_wake(iowq)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Mark us as being in io_wait if we have pending requests, so cpufreq
|
||||
* can take into account that the task is waiting for IO - turns out
|
||||
* to be important for low QD IO.
|
||||
*/
|
||||
if (current_pending_io())
|
||||
current->in_iowait = 1;
|
||||
ret = 0;
|
||||
if (iowq->timeout == KTIME_MAX)
|
||||
schedule();
|
||||
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
|
||||
ret = -ETIME;
|
||||
current->in_iowait = 0;
|
||||
return ret;
|
||||
return __io_cqring_wait_schedule(ctx, iowq, start_time);
|
||||
}
|
||||
|
||||
struct ext_arg {
|
||||
size_t argsz;
|
||||
struct __kernel_timespec __user *ts;
|
||||
const sigset_t __user *sig;
|
||||
ktime_t min_time;
|
||||
};
|
||||
|
||||
/*
|
||||
* Wait until events become available, if we don't already have some. The
|
||||
* application must reap them itself, as they reside on the shared cq ring.
|
||||
*/
|
||||
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
|
||||
const sigset_t __user *sig, size_t sigsz,
|
||||
struct __kernel_timespec __user *uts)
|
||||
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
|
||||
struct ext_arg *ext_arg)
|
||||
{
|
||||
struct io_wait_queue iowq;
|
||||
struct io_rings *rings = ctx->rings;
|
||||
ktime_t start_time;
|
||||
int ret;
|
||||
|
||||
if (!io_allowed_run_tw(ctx))
|
||||
@ -2410,30 +2505,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
|
||||
iowq.wq.private = current;
|
||||
INIT_LIST_HEAD(&iowq.wq.entry);
|
||||
iowq.ctx = ctx;
|
||||
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
|
||||
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
|
||||
iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
|
||||
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
|
||||
iowq.hit_timeout = 0;
|
||||
iowq.min_timeout = ext_arg->min_time;
|
||||
iowq.timeout = KTIME_MAX;
|
||||
start_time = io_get_time(ctx);
|
||||
|
||||
if (uts) {
|
||||
if (ext_arg->ts) {
|
||||
struct timespec64 ts;
|
||||
ktime_t dt;
|
||||
|
||||
if (get_timespec64(&ts, uts))
|
||||
if (get_timespec64(&ts, ext_arg->ts))
|
||||
return -EFAULT;
|
||||
|
||||
dt = timespec64_to_ktime(ts);
|
||||
iowq.timeout = ktime_add(dt, ktime_get());
|
||||
io_napi_adjust_timeout(ctx, &iowq, dt);
|
||||
iowq.timeout = timespec64_to_ktime(ts);
|
||||
if (!(flags & IORING_ENTER_ABS_TIMER))
|
||||
iowq.timeout = ktime_add(iowq.timeout, start_time);
|
||||
}
|
||||
|
||||
if (sig) {
|
||||
if (ext_arg->sig) {
|
||||
#ifdef CONFIG_COMPAT
|
||||
if (in_compat_syscall())
|
||||
ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
|
||||
sigsz);
|
||||
ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig,
|
||||
ext_arg->argsz);
|
||||
else
|
||||
#endif
|
||||
ret = set_user_sigmask(sig, sigsz);
|
||||
ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
@ -2443,8 +2541,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
|
||||
|
||||
trace_io_uring_cqring_wait(ctx, min_events);
|
||||
do {
|
||||
int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
|
||||
unsigned long check_cq;
|
||||
int nr_wait;
|
||||
|
||||
/* if min timeout has been hit, don't reset wait count */
|
||||
if (!iowq.hit_timeout)
|
||||
nr_wait = (int) iowq.cq_tail -
|
||||
READ_ONCE(ctx->rings->cq.tail);
|
||||
else
|
||||
nr_wait = 1;
|
||||
|
||||
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
|
||||
atomic_set(&ctx->cq_wait_nr, nr_wait);
|
||||
@ -2454,7 +2559,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
|
||||
TASK_INTERRUPTIBLE);
|
||||
}
|
||||
|
||||
ret = io_cqring_wait_schedule(ctx, &iowq);
|
||||
ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
|
||||
__set_current_state(TASK_RUNNING);
|
||||
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
|
||||
|
||||
@ -3112,9 +3217,8 @@ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t a
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
|
||||
struct __kernel_timespec __user **ts,
|
||||
const sigset_t __user **sig)
|
||||
static int io_get_ext_arg(unsigned flags, const void __user *argp,
|
||||
struct ext_arg *ext_arg)
|
||||
{
|
||||
struct io_uring_getevents_arg arg;
|
||||
|
||||
@ -3123,8 +3227,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
|
||||
* is just a pointer to the sigset_t.
|
||||
*/
|
||||
if (!(flags & IORING_ENTER_EXT_ARG)) {
|
||||
*sig = (const sigset_t __user *) argp;
|
||||
*ts = NULL;
|
||||
ext_arg->sig = (const sigset_t __user *) argp;
|
||||
ext_arg->ts = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -3132,15 +3236,14 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
|
||||
* EXT_ARG is set - ensure we agree on the size of it and copy in our
|
||||
* timespec and sigset_t pointers if good.
|
||||
*/
|
||||
if (*argsz != sizeof(arg))
|
||||
if (ext_arg->argsz != sizeof(arg))
|
||||
return -EINVAL;
|
||||
if (copy_from_user(&arg, argp, sizeof(arg)))
|
||||
return -EFAULT;
|
||||
if (arg.pad)
|
||||
return -EINVAL;
|
||||
*sig = u64_to_user_ptr(arg.sigmask);
|
||||
*argsz = arg.sigmask_sz;
|
||||
*ts = u64_to_user_ptr(arg.ts);
|
||||
ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC;
|
||||
ext_arg->sig = u64_to_user_ptr(arg.sigmask);
|
||||
ext_arg->argsz = arg.sigmask_sz;
|
||||
ext_arg->ts = u64_to_user_ptr(arg.ts);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -3154,7 +3257,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
|
||||
|
||||
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
|
||||
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
|
||||
IORING_ENTER_REGISTERED_RING)))
|
||||
IORING_ENTER_REGISTERED_RING |
|
||||
IORING_ENTER_ABS_TIMER)))
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
@ -3245,15 +3349,14 @@ iopoll_locked:
|
||||
}
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
} else {
|
||||
const sigset_t __user *sig;
|
||||
struct __kernel_timespec __user *ts;
|
||||
struct ext_arg ext_arg = { .argsz = argsz };
|
||||
|
||||
ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
|
||||
ret2 = io_get_ext_arg(flags, argp, &ext_arg);
|
||||
if (likely(!ret2)) {
|
||||
min_complete = min(min_complete,
|
||||
ctx->cq_entries);
|
||||
ret2 = io_cqring_wait(ctx, min_complete, sig,
|
||||
argsz, ts);
|
||||
ret2 = io_cqring_wait(ctx, min_complete, flags,
|
||||
&ext_arg);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3424,6 +3527,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
|
||||
if (!ctx)
|
||||
return -ENOMEM;
|
||||
|
||||
ctx->clockid = CLOCK_MONOTONIC;
|
||||
ctx->clock_offset = 0;
|
||||
|
||||
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
|
||||
!(ctx->flags & IORING_SETUP_IOPOLL) &&
|
||||
!(ctx->flags & IORING_SETUP_SQPOLL))
|
||||
@ -3535,7 +3641,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
|
||||
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
|
||||
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
|
||||
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
|
||||
IORING_FEAT_RECVSEND_BUNDLE;
|
||||
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
|
||||
|
||||
if (copy_to_user(params, p, sizeof(*p))) {
|
||||
ret = -EFAULT;
|
||||
|
@ -39,8 +39,12 @@ struct io_wait_queue {
|
||||
struct wait_queue_entry wq;
|
||||
struct io_ring_ctx *ctx;
|
||||
unsigned cq_tail;
|
||||
unsigned cq_min_tail;
|
||||
unsigned nr_timeouts;
|
||||
int hit_timeout;
|
||||
ktime_t min_timeout;
|
||||
ktime_t timeout;
|
||||
struct hrtimer t;
|
||||
|
||||
#ifdef CONFIG_NET_RX_BUSY_POLL
|
||||
ktime_t napi_busy_poll_dt;
|
||||
@ -437,6 +441,14 @@ static inline bool io_file_can_poll(struct io_kiocb *req)
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline ktime_t io_get_time(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (ctx->clockid == CLOCK_MONOTONIC)
|
||||
return ktime_get();
|
||||
|
||||
return ktime_get_with_offset(ctx->clock_offset);
|
||||
}
|
||||
|
||||
enum {
|
||||
IO_CHECK_CQ_OVERFLOW_BIT,
|
||||
IO_CHECK_CQ_DROPPED_BIT,
|
||||
|
@ -70,7 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
|
||||
return true;
|
||||
}
|
||||
|
||||
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
|
||||
void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags)
|
||||
{
|
||||
/*
|
||||
* We can add this buffer back to two lists:
|
||||
@ -88,12 +88,12 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
spin_lock(&ctx->completion_lock);
|
||||
__io_put_kbuf_list(req, &ctx->io_buffers_comp);
|
||||
__io_put_kbuf_list(req, len, &ctx->io_buffers_comp);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
} else {
|
||||
lockdep_assert_held(&req->ctx->uring_lock);
|
||||
|
||||
__io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
|
||||
__io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache);
|
||||
}
|
||||
}
|
||||
|
||||
@ -132,12 +132,6 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
|
||||
return 1;
|
||||
}
|
||||
|
||||
static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br,
|
||||
__u16 head, __u16 mask)
|
||||
{
|
||||
return &br->bufs[head & mask];
|
||||
}
|
||||
|
||||
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
struct io_buffer_list *bl,
|
||||
unsigned int issue_flags)
|
||||
@ -171,9 +165,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
* the transfer completes (or if we get -EAGAIN and must poll of
|
||||
* retry).
|
||||
*/
|
||||
req->flags &= ~REQ_F_BUFFERS_COMMIT;
|
||||
io_kbuf_commit(req, bl, *len, 1);
|
||||
req->buf_list = NULL;
|
||||
bl->head++;
|
||||
}
|
||||
return u64_to_user_ptr(buf->addr);
|
||||
}
|
||||
@ -189,7 +182,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
|
||||
bl = io_buffer_get_list(ctx, req->buf_index);
|
||||
if (likely(bl)) {
|
||||
if (bl->is_buf_ring)
|
||||
if (bl->flags & IOBL_BUF_RING)
|
||||
ret = io_ring_buffer_select(req, len, bl, issue_flags);
|
||||
else
|
||||
ret = io_provided_buffer_select(req, len, bl);
|
||||
@ -219,15 +212,26 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
|
||||
buf = io_ring_head_to_buf(br, head, bl->mask);
|
||||
if (arg->max_len) {
|
||||
u32 len = READ_ONCE(buf->len);
|
||||
size_t needed;
|
||||
|
||||
if (unlikely(!len))
|
||||
return -ENOBUFS;
|
||||
/*
|
||||
* Limit incremental buffers to 1 segment. No point trying
|
||||
* to peek ahead and map more than we need, when the buffers
|
||||
* themselves should be large when setup with
|
||||
* IOU_PBUF_RING_INC.
|
||||
*/
|
||||
if (bl->flags & IOBL_INC) {
|
||||
nr_avail = 1;
|
||||
} else {
|
||||
size_t needed;
|
||||
|
||||
needed = (arg->max_len + len - 1) / len;
|
||||
needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
|
||||
if (nr_avail > needed)
|
||||
nr_avail = needed;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* only alloc a bigger array if we know we have data to map, eg not
|
||||
@ -251,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
|
||||
|
||||
req->buf_index = buf->bid;
|
||||
do {
|
||||
/* truncate end piece, if needed */
|
||||
if (buf->len > arg->max_len)
|
||||
buf->len = arg->max_len;
|
||||
u32 len = buf->len;
|
||||
|
||||
/* truncate end piece, if needed, for non partial buffers */
|
||||
if (len > arg->max_len) {
|
||||
len = arg->max_len;
|
||||
if (!(bl->flags & IOBL_INC))
|
||||
buf->len = len;
|
||||
}
|
||||
|
||||
iov->iov_base = u64_to_user_ptr(buf->addr);
|
||||
iov->iov_len = buf->len;
|
||||
iov->iov_len = len;
|
||||
iov++;
|
||||
|
||||
arg->out_len += buf->len;
|
||||
arg->max_len -= buf->len;
|
||||
arg->out_len += len;
|
||||
arg->max_len -= len;
|
||||
if (!arg->max_len)
|
||||
break;
|
||||
|
||||
@ -287,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
|
||||
if (unlikely(!bl))
|
||||
goto out_unlock;
|
||||
|
||||
if (bl->is_buf_ring) {
|
||||
if (bl->flags & IOBL_BUF_RING) {
|
||||
ret = io_ring_buffers_peek(req, arg, bl);
|
||||
/*
|
||||
* Don't recycle these buffers if we need to go through poll.
|
||||
@ -297,8 +306,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
|
||||
* committed them, they cannot be put back in the queue.
|
||||
*/
|
||||
if (ret > 0) {
|
||||
req->flags |= REQ_F_BL_NO_RECYCLE;
|
||||
req->buf_list->head += ret;
|
||||
req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
|
||||
io_kbuf_commit(req, bl, arg->out_len, ret);
|
||||
}
|
||||
} else {
|
||||
ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
|
||||
@ -320,7 +329,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
|
||||
if (unlikely(!bl))
|
||||
return -ENOENT;
|
||||
|
||||
if (bl->is_buf_ring) {
|
||||
if (bl->flags & IOBL_BUF_RING) {
|
||||
ret = io_ring_buffers_peek(req, arg, bl);
|
||||
if (ret > 0)
|
||||
req->flags |= REQ_F_BUFFERS_COMMIT;
|
||||
@ -340,22 +349,22 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
|
||||
if (!nbufs)
|
||||
return 0;
|
||||
|
||||
if (bl->is_buf_ring) {
|
||||
if (bl->flags & IOBL_BUF_RING) {
|
||||
i = bl->buf_ring->tail - bl->head;
|
||||
if (bl->buf_nr_pages) {
|
||||
int j;
|
||||
|
||||
if (!bl->is_mmap) {
|
||||
if (!(bl->flags & IOBL_MMAP)) {
|
||||
for (j = 0; j < bl->buf_nr_pages; j++)
|
||||
unpin_user_page(bl->buf_pages[j]);
|
||||
}
|
||||
io_pages_unmap(bl->buf_ring, &bl->buf_pages,
|
||||
&bl->buf_nr_pages, bl->is_mmap);
|
||||
bl->is_mmap = 0;
|
||||
&bl->buf_nr_pages, bl->flags & IOBL_MMAP);
|
||||
bl->flags &= ~IOBL_MMAP;
|
||||
}
|
||||
/* make sure it's seen as empty */
|
||||
INIT_LIST_HEAD(&bl->buf_list);
|
||||
bl->is_buf_ring = 0;
|
||||
bl->flags &= ~IOBL_BUF_RING;
|
||||
return i;
|
||||
}
|
||||
|
||||
@ -442,7 +451,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
|
||||
if (bl) {
|
||||
ret = -EINVAL;
|
||||
/* can't use provide/remove buffers command on mapped buffers */
|
||||
if (!bl->is_buf_ring)
|
||||
if (!(bl->flags & IOBL_BUF_RING))
|
||||
ret = __io_remove_buffers(ctx, bl, p->nbufs);
|
||||
}
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
@ -589,7 +598,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
|
||||
}
|
||||
}
|
||||
/* can't add buffers via this command for a mapped buffer ring */
|
||||
if (bl->is_buf_ring) {
|
||||
if (bl->flags & IOBL_BUF_RING) {
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
@ -641,8 +650,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
|
||||
bl->buf_pages = pages;
|
||||
bl->buf_nr_pages = nr_pages;
|
||||
bl->buf_ring = br;
|
||||
bl->is_buf_ring = 1;
|
||||
bl->is_mmap = 0;
|
||||
bl->flags |= IOBL_BUF_RING;
|
||||
bl->flags &= ~IOBL_MMAP;
|
||||
return 0;
|
||||
error_unpin:
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
@ -665,8 +674,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
bl->is_buf_ring = 1;
|
||||
bl->is_mmap = 1;
|
||||
bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -683,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
|
||||
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
|
||||
return -EINVAL;
|
||||
if (reg.flags & ~IOU_PBUF_RING_MMAP)
|
||||
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
|
||||
return -EINVAL;
|
||||
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
|
||||
if (!reg.ring_addr)
|
||||
@ -705,7 +713,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
bl = io_buffer_get_list(ctx, reg.bgid);
|
||||
if (bl) {
|
||||
/* if mapped buffer ring OR classic exists, don't allow */
|
||||
if (bl->is_buf_ring || !list_empty(&bl->buf_list))
|
||||
if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
|
||||
return -EEXIST;
|
||||
} else {
|
||||
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
|
||||
@ -721,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
if (!ret) {
|
||||
bl->nr_entries = reg.ring_entries;
|
||||
bl->mask = reg.ring_entries - 1;
|
||||
if (reg.flags & IOU_PBUF_RING_INC)
|
||||
bl->flags |= IOBL_INC;
|
||||
|
||||
io_buffer_add_list(ctx, bl, reg.bgid);
|
||||
return 0;
|
||||
@ -747,7 +757,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
bl = io_buffer_get_list(ctx, reg.bgid);
|
||||
if (!bl)
|
||||
return -ENOENT;
|
||||
if (!bl->is_buf_ring)
|
||||
if (!(bl->flags & IOBL_BUF_RING))
|
||||
return -EINVAL;
|
||||
|
||||
xa_erase(&ctx->io_bl_xa, bl->bgid);
|
||||
@ -771,7 +781,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
|
||||
bl = io_buffer_get_list(ctx, buf_status.buf_group);
|
||||
if (!bl)
|
||||
return -ENOENT;
|
||||
if (!bl->is_buf_ring)
|
||||
if (!(bl->flags & IOBL_BUF_RING))
|
||||
return -EINVAL;
|
||||
|
||||
buf_status.head = bl->head;
|
||||
@ -802,7 +812,7 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
|
||||
bl = xa_load(&ctx->io_bl_xa, bgid);
|
||||
/* must be a mmap'able buffer ring and have pages */
|
||||
ret = false;
|
||||
if (bl && bl->is_mmap)
|
||||
if (bl && bl->flags & IOBL_MMAP)
|
||||
ret = atomic_inc_not_zero(&bl->refs);
|
||||
rcu_read_unlock();
|
||||
|
||||
|
@ -4,6 +4,16 @@
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
enum {
|
||||
/* ring mapped provided buffers */
|
||||
IOBL_BUF_RING = 1,
|
||||
/* ring mapped provided buffers, but mmap'ed by application */
|
||||
IOBL_MMAP = 2,
|
||||
/* buffers are consumed incrementally rather than always fully */
|
||||
IOBL_INC = 4,
|
||||
|
||||
};
|
||||
|
||||
struct io_buffer_list {
|
||||
/*
|
||||
* If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
|
||||
@ -25,12 +35,9 @@ struct io_buffer_list {
|
||||
__u16 head;
|
||||
__u16 mask;
|
||||
|
||||
atomic_t refs;
|
||||
__u16 flags;
|
||||
|
||||
/* ring mapped provided buffers */
|
||||
__u8 is_buf_ring;
|
||||
/* ring mapped provided buffers, but mmap'ed by application */
|
||||
__u8 is_mmap;
|
||||
atomic_t refs;
|
||||
};
|
||||
|
||||
struct io_buffer {
|
||||
@ -52,8 +59,8 @@ struct buf_sel_arg {
|
||||
struct iovec *iovs;
|
||||
size_t out_len;
|
||||
size_t max_len;
|
||||
int nr_iovs;
|
||||
int mode;
|
||||
unsigned short nr_iovs;
|
||||
unsigned short mode;
|
||||
};
|
||||
|
||||
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
@ -73,7 +80,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
|
||||
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
|
||||
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
|
||||
|
||||
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
|
||||
void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
|
||||
|
||||
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
|
||||
|
||||
@ -117,25 +124,55 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr)
|
||||
/* Mapped buffer ring, return io_uring_buf from head */
|
||||
#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)]
|
||||
|
||||
static inline bool io_kbuf_commit(struct io_kiocb *req,
|
||||
struct io_buffer_list *bl, int len, int nr)
|
||||
{
|
||||
if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT)))
|
||||
return true;
|
||||
|
||||
req->flags &= ~REQ_F_BUFFERS_COMMIT;
|
||||
|
||||
if (unlikely(len < 0))
|
||||
return true;
|
||||
|
||||
if (bl->flags & IOBL_INC) {
|
||||
struct io_uring_buf *buf;
|
||||
|
||||
buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
|
||||
if (WARN_ON_ONCE(len > buf->len))
|
||||
len = buf->len;
|
||||
buf->len -= len;
|
||||
if (buf->len) {
|
||||
buf->addr += len;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bl->head += nr;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
|
||||
{
|
||||
struct io_buffer_list *bl = req->buf_list;
|
||||
bool ret = true;
|
||||
|
||||
if (bl) {
|
||||
if (req->flags & REQ_F_BUFFERS_COMMIT) {
|
||||
bl->head += nr;
|
||||
req->flags &= ~REQ_F_BUFFERS_COMMIT;
|
||||
}
|
||||
ret = io_kbuf_commit(req, bl, len, nr);
|
||||
req->buf_index = bl->bgid;
|
||||
}
|
||||
req->flags &= ~REQ_F_BUFFER_RING;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void __io_put_kbuf_list(struct io_kiocb *req,
|
||||
static inline void __io_put_kbuf_list(struct io_kiocb *req, int len,
|
||||
struct list_head *list)
|
||||
{
|
||||
if (req->flags & REQ_F_BUFFER_RING) {
|
||||
__io_put_kbuf_ring(req, 1);
|
||||
__io_put_kbuf_ring(req, len, 1);
|
||||
} else {
|
||||
req->buf_index = req->kbuf->bgid;
|
||||
list_add(&req->kbuf->list, list);
|
||||
@ -150,11 +187,12 @@ static inline void io_kbuf_drop(struct io_kiocb *req)
|
||||
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
|
||||
return;
|
||||
|
||||
__io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
|
||||
/* len == 0 is fine here, non-ring will always drop all of it */
|
||||
__io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp);
|
||||
}
|
||||
|
||||
static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
|
||||
unsigned issue_flags)
|
||||
static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len,
|
||||
int nbufs, unsigned issue_flags)
|
||||
{
|
||||
unsigned int ret;
|
||||
|
||||
@ -162,22 +200,24 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
|
||||
return 0;
|
||||
|
||||
ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
|
||||
if (req->flags & REQ_F_BUFFER_RING)
|
||||
__io_put_kbuf_ring(req, nbufs);
|
||||
else
|
||||
__io_put_kbuf(req, issue_flags);
|
||||
if (req->flags & REQ_F_BUFFER_RING) {
|
||||
if (!__io_put_kbuf_ring(req, len, nbufs))
|
||||
ret |= IORING_CQE_F_BUF_MORE;
|
||||
} else {
|
||||
__io_put_kbuf(req, len, issue_flags);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline unsigned int io_put_kbuf(struct io_kiocb *req,
|
||||
static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
return __io_put_kbufs(req, 1, issue_flags);
|
||||
return __io_put_kbufs(req, len, 1, issue_flags);
|
||||
}
|
||||
|
||||
static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs,
|
||||
unsigned issue_flags)
|
||||
static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
|
||||
int nbufs, unsigned issue_flags)
|
||||
{
|
||||
return __io_put_kbufs(req, nbufs, issue_flags);
|
||||
return __io_put_kbufs(req, len, nbufs, issue_flags);
|
||||
}
|
||||
#endif
|
||||
|
@ -269,27 +269,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* __io_napi_adjust_timeout() - adjust busy loop timeout
|
||||
* @ctx: pointer to io-uring context structure
|
||||
* @iowq: pointer to io wait queue
|
||||
* @ts: pointer to timespec or NULL
|
||||
*
|
||||
* Adjust the busy loop timeout according to timespec and busy poll timeout.
|
||||
* If the specified NAPI timeout is bigger than the wait timeout, then adjust
|
||||
* the NAPI timeout accordingly.
|
||||
*/
|
||||
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
|
||||
ktime_t to_wait)
|
||||
{
|
||||
ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
|
||||
|
||||
if (to_wait)
|
||||
poll_dt = min(poll_dt, to_wait);
|
||||
|
||||
iowq->napi_busy_poll_dt = poll_dt;
|
||||
}
|
||||
|
||||
/*
|
||||
* __io_napi_busy_loop() - execute busy poll loop
|
||||
* @ctx: pointer to io-uring context structure
|
||||
@ -299,9 +278,17 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
|
||||
*/
|
||||
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
|
||||
{
|
||||
iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
|
||||
if (ctx->flags & IORING_SETUP_SQPOLL)
|
||||
return;
|
||||
|
||||
if (!(ctx->flags & IORING_SETUP_SQPOLL))
|
||||
iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
|
||||
if (iowq->timeout != KTIME_MAX) {
|
||||
ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));
|
||||
|
||||
iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
|
||||
}
|
||||
|
||||
iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
|
||||
io_napi_blocking_busy_loop(ctx, iowq);
|
||||
}
|
||||
|
||||
|
@ -17,8 +17,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
|
||||
|
||||
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);
|
||||
|
||||
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
|
||||
struct io_wait_queue *iowq, ktime_t to_wait);
|
||||
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
|
||||
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
|
||||
|
||||
@ -27,15 +25,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
|
||||
return !list_empty(&ctx->napi_list);
|
||||
}
|
||||
|
||||
static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
|
||||
struct io_wait_queue *iowq,
|
||||
ktime_t to_wait)
|
||||
{
|
||||
if (!io_napi(ctx))
|
||||
return;
|
||||
__io_napi_adjust_timeout(ctx, iowq, to_wait);
|
||||
}
|
||||
|
||||
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
|
||||
struct io_wait_queue *iowq)
|
||||
{
|
||||
@ -86,11 +75,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
|
||||
static inline void io_napi_add(struct io_kiocb *req)
|
||||
{
|
||||
}
|
||||
static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
|
||||
struct io_wait_queue *iowq,
|
||||
ktime_t to_wait)
|
||||
{
|
||||
}
|
||||
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
|
||||
struct io_wait_queue *iowq)
|
||||
{
|
||||
|
@ -434,8 +434,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
sr->buf_group = req->buf_index;
|
||||
req->buf_list = NULL;
|
||||
}
|
||||
if (req->flags & REQ_F_BUFFER_SELECT && sr->len)
|
||||
return -EINVAL;
|
||||
|
||||
#ifdef CONFIG_COMPAT
|
||||
if (req->ctx->compat)
|
||||
@ -499,11 +497,11 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
|
||||
unsigned int cflags;
|
||||
|
||||
if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
|
||||
cflags = io_put_kbuf(req, issue_flags);
|
||||
cflags = io_put_kbuf(req, *ret, issue_flags);
|
||||
goto finish;
|
||||
}
|
||||
|
||||
cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags);
|
||||
cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);
|
||||
|
||||
if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
|
||||
goto finish;
|
||||
@ -599,7 +597,7 @@ retry_bundle:
|
||||
if (io_do_buffer_select(req)) {
|
||||
struct buf_sel_arg arg = {
|
||||
.iovs = &kmsg->fast_iov,
|
||||
.max_len = INT_MAX,
|
||||
.max_len = min_not_zero(sr->len, INT_MAX),
|
||||
.nr_iovs = 1,
|
||||
};
|
||||
|
||||
@ -618,14 +616,23 @@ retry_bundle:
|
||||
if (unlikely(ret < 0))
|
||||
return ret;
|
||||
|
||||
sr->len = arg.out_len;
|
||||
iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
|
||||
arg.out_len);
|
||||
if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
|
||||
kmsg->free_iov_nr = ret;
|
||||
kmsg->free_iov = arg.iovs;
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
}
|
||||
sr->len = arg.out_len;
|
||||
|
||||
if (ret == 1) {
|
||||
sr->buf = arg.iovs[0].iov_base;
|
||||
ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
|
||||
&kmsg->msg.msg_iter);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
} else {
|
||||
iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
|
||||
arg.iovs, ret, arg.out_len);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -835,13 +842,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
|
||||
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
|
||||
|
||||
if (sr->flags & IORING_RECVSEND_BUNDLE) {
|
||||
cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
|
||||
cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
|
||||
issue_flags);
|
||||
/* bundle with no more immediate buffers, we're done */
|
||||
if (req->flags & REQ_F_BL_EMPTY)
|
||||
goto finish;
|
||||
} else {
|
||||
cflags |= io_put_kbuf(req, issue_flags);
|
||||
cflags |= io_put_kbuf(req, *ret, issue_flags);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -335,6 +335,31 @@ err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int io_register_clock(struct io_ring_ctx *ctx,
|
||||
struct io_uring_clock_register __user *arg)
|
||||
{
|
||||
struct io_uring_clock_register reg;
|
||||
|
||||
if (copy_from_user(®, arg, sizeof(reg)))
|
||||
return -EFAULT;
|
||||
if (memchr_inv(®.__resv, 0, sizeof(reg.__resv)))
|
||||
return -EINVAL;
|
||||
|
||||
switch (reg.clockid) {
|
||||
case CLOCK_MONOTONIC:
|
||||
ctx->clock_offset = 0;
|
||||
break;
|
||||
case CLOCK_BOOTTIME:
|
||||
ctx->clock_offset = TK_OFFS_BOOT;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ctx->clockid = reg.clockid;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
||||
void __user *arg, unsigned nr_args)
|
||||
__releases(ctx->uring_lock)
|
||||
@ -511,6 +536,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
||||
break;
|
||||
ret = io_unregister_napi(ctx, arg);
|
||||
break;
|
||||
case IORING_REGISTER_CLOCK:
|
||||
ret = -EINVAL;
|
||||
if (!arg || nr_args)
|
||||
break;
|
||||
ret = io_register_clock(ctx, arg);
|
||||
break;
|
||||
case IORING_REGISTER_COPY_BUFFERS:
|
||||
ret = -EINVAL;
|
||||
if (!arg || nr_args != 1)
|
||||
break;
|
||||
ret = io_register_copy_buffers(ctx, arg);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
@ -519,6 +556,38 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Given an 'fd' value, return the ctx associated with if. If 'registered' is
|
||||
* true, then the registered index is used. Otherwise, the normal fd table.
|
||||
* Caller must call fput() on the returned file, unless it's an ERR_PTR.
|
||||
*/
|
||||
struct file *io_uring_register_get_file(int fd, bool registered)
|
||||
{
|
||||
struct file *file;
|
||||
|
||||
if (registered) {
|
||||
/*
|
||||
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
|
||||
* need only dereference our task private array to find it.
|
||||
*/
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
|
||||
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
|
||||
return ERR_PTR(-EINVAL);
|
||||
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
|
||||
file = tctx->registered_rings[fd];
|
||||
} else {
|
||||
file = fget(fd);
|
||||
}
|
||||
|
||||
if (unlikely(!file))
|
||||
return ERR_PTR(-EBADF);
|
||||
if (io_is_uring_fops(file))
|
||||
return file;
|
||||
fput(file);
|
||||
return ERR_PTR(-EOPNOTSUPP);
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
|
||||
void __user *, arg, unsigned int, nr_args)
|
||||
{
|
||||
@ -533,35 +602,15 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
|
||||
if (opcode >= IORING_REGISTER_LAST)
|
||||
return -EINVAL;
|
||||
|
||||
if (use_registered_ring) {
|
||||
/*
|
||||
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
|
||||
* need only dereference our task private array to find it.
|
||||
*/
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
|
||||
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
|
||||
return -EINVAL;
|
||||
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
|
||||
file = tctx->registered_rings[fd];
|
||||
if (unlikely(!file))
|
||||
return -EBADF;
|
||||
} else {
|
||||
file = fget(fd);
|
||||
if (unlikely(!file))
|
||||
return -EBADF;
|
||||
ret = -EOPNOTSUPP;
|
||||
if (!io_is_uring_fops(file))
|
||||
goto out_fput;
|
||||
}
|
||||
|
||||
file = io_uring_register_get_file(fd, use_registered_ring);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
ctx = file->private_data;
|
||||
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
ret = __io_uring_register(ctx, opcode, arg, nr_args);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
|
||||
out_fput:
|
||||
if (!use_registered_ring)
|
||||
fput(file);
|
||||
return ret;
|
||||
|
@ -4,5 +4,6 @@
|
||||
|
||||
int io_eventfd_unregister(struct io_ring_ctx *ctx);
|
||||
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
|
||||
struct file *io_uring_register_get_file(int fd, bool registered);
|
||||
|
||||
#endif
|
||||
|
245
io_uring/rsrc.c
245
io_uring/rsrc.c
@ -17,6 +17,7 @@
|
||||
#include "openclose.h"
|
||||
#include "rsrc.h"
|
||||
#include "memmap.h"
|
||||
#include "register.h"
|
||||
|
||||
struct io_rsrc_update {
|
||||
struct file *file;
|
||||
@ -114,14 +115,16 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
|
||||
struct io_mapped_ubuf *imu = *slot;
|
||||
unsigned int i;
|
||||
|
||||
*slot = NULL;
|
||||
if (imu != &dummy_ubuf) {
|
||||
if (!refcount_dec_and_test(&imu->refs))
|
||||
return;
|
||||
for (i = 0; i < imu->nr_bvecs; i++)
|
||||
unpin_user_page(imu->bvec[i].bv_page);
|
||||
if (imu->acct_pages)
|
||||
io_unaccount_mem(ctx, imu->acct_pages);
|
||||
kvfree(imu);
|
||||
}
|
||||
*slot = NULL;
|
||||
}
|
||||
|
||||
static void io_rsrc_put_work(struct io_rsrc_node *node)
|
||||
@ -855,6 +858,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
|
||||
struct io_imu_folio_data *data, int nr_folios)
|
||||
{
|
||||
struct page **page_array = *pages, **new_array = NULL;
|
||||
int nr_pages_left = *nr_pages, i, j;
|
||||
|
||||
/* Store head pages only*/
|
||||
new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
|
||||
GFP_KERNEL);
|
||||
if (!new_array)
|
||||
return false;
|
||||
|
||||
new_array[0] = compound_head(page_array[0]);
|
||||
/*
|
||||
* The pages are bound to the folio, it doesn't
|
||||
* actually unpin them but drops all but one reference,
|
||||
* which is usually put down by io_buffer_unmap().
|
||||
* Note, needs a better helper.
|
||||
*/
|
||||
if (data->nr_pages_head > 1)
|
||||
unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
|
||||
|
||||
j = data->nr_pages_head;
|
||||
nr_pages_left -= data->nr_pages_head;
|
||||
for (i = 1; i < nr_folios; i++) {
|
||||
unsigned int nr_unpin;
|
||||
|
||||
new_array[i] = page_array[j];
|
||||
nr_unpin = min_t(unsigned int, nr_pages_left - 1,
|
||||
data->nr_pages_mid - 1);
|
||||
if (nr_unpin)
|
||||
unpin_user_pages(&page_array[j+1], nr_unpin);
|
||||
j += data->nr_pages_mid;
|
||||
nr_pages_left -= data->nr_pages_mid;
|
||||
}
|
||||
kvfree(page_array);
|
||||
*pages = new_array;
|
||||
*nr_pages = nr_folios;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
|
||||
struct io_imu_folio_data *data)
|
||||
{
|
||||
struct page **page_array = *pages;
|
||||
struct folio *folio = page_folio(page_array[0]);
|
||||
unsigned int count = 1, nr_folios = 1;
|
||||
int i;
|
||||
|
||||
if (*nr_pages <= 1)
|
||||
return false;
|
||||
|
||||
data->nr_pages_mid = folio_nr_pages(folio);
|
||||
if (data->nr_pages_mid == 1)
|
||||
return false;
|
||||
|
||||
data->folio_shift = folio_shift(folio);
|
||||
/*
|
||||
* Check if pages are contiguous inside a folio, and all folios have
|
||||
* the same page count except for the head and tail.
|
||||
*/
|
||||
for (i = 1; i < *nr_pages; i++) {
|
||||
if (page_folio(page_array[i]) == folio &&
|
||||
page_array[i] == page_array[i-1] + 1) {
|
||||
count++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (nr_folios == 1) {
|
||||
if (folio_page_idx(folio, page_array[i-1]) !=
|
||||
data->nr_pages_mid - 1)
|
||||
return false;
|
||||
|
||||
data->nr_pages_head = count;
|
||||
} else if (count != data->nr_pages_mid) {
|
||||
return false;
|
||||
}
|
||||
|
||||
folio = page_folio(page_array[i]);
|
||||
if (folio_size(folio) != (1UL << data->folio_shift) ||
|
||||
folio_page_idx(folio, page_array[i]) != 0)
|
||||
return false;
|
||||
|
||||
count = 1;
|
||||
nr_folios++;
|
||||
}
|
||||
if (nr_folios == 1)
|
||||
data->nr_pages_head = count;
|
||||
|
||||
return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
|
||||
}
|
||||
|
||||
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
|
||||
struct io_mapped_ubuf **pimu,
|
||||
struct page **last_hpage)
|
||||
@ -864,7 +959,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
|
||||
unsigned long off;
|
||||
size_t size;
|
||||
int ret, nr_pages, i;
|
||||
struct folio *folio = NULL;
|
||||
struct io_imu_folio_data data;
|
||||
bool coalesced;
|
||||
|
||||
*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
|
||||
if (!iov->iov_base)
|
||||
@ -879,31 +975,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* If it's a huge page, try to coalesce them into a single bvec entry */
|
||||
if (nr_pages > 1) {
|
||||
folio = page_folio(pages[0]);
|
||||
for (i = 1; i < nr_pages; i++) {
|
||||
/*
|
||||
* Pages must be consecutive and on the same folio for
|
||||
* this to work
|
||||
*/
|
||||
if (page_folio(pages[i]) != folio ||
|
||||
pages[i] != pages[i - 1] + 1) {
|
||||
folio = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (folio) {
|
||||
/*
|
||||
* The pages are bound to the folio, it doesn't
|
||||
* actually unpin them but drops all but one reference,
|
||||
* which is usually put down by io_buffer_unmap().
|
||||
* Note, needs a better helper.
|
||||
*/
|
||||
unpin_user_pages(&pages[1], nr_pages - 1);
|
||||
nr_pages = 1;
|
||||
}
|
||||
}
|
||||
/* If it's huge page(s), try to coalesce them into fewer bvec entries */
|
||||
coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
|
||||
|
||||
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
|
||||
if (!imu)
|
||||
@ -915,23 +988,26 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
|
||||
goto done;
|
||||
}
|
||||
|
||||
off = (unsigned long) iov->iov_base & ~PAGE_MASK;
|
||||
size = iov->iov_len;
|
||||
/* store original address for later verification */
|
||||
imu->ubuf = (unsigned long) iov->iov_base;
|
||||
imu->ubuf_end = imu->ubuf + iov->iov_len;
|
||||
imu->nr_bvecs = nr_pages;
|
||||
imu->folio_shift = PAGE_SHIFT;
|
||||
imu->folio_mask = PAGE_MASK;
|
||||
if (coalesced) {
|
||||
imu->folio_shift = data.folio_shift;
|
||||
imu->folio_mask = ~((1UL << data.folio_shift) - 1);
|
||||
}
|
||||
refcount_set(&imu->refs, 1);
|
||||
off = (unsigned long) iov->iov_base & ~imu->folio_mask;
|
||||
*pimu = imu;
|
||||
ret = 0;
|
||||
|
||||
if (folio) {
|
||||
bvec_set_page(&imu->bvec[0], pages[0], size, off);
|
||||
goto done;
|
||||
}
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
size_t vec_len;
|
||||
|
||||
vec_len = min_t(size_t, size, PAGE_SIZE - off);
|
||||
vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
|
||||
bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
|
||||
off = 0;
|
||||
size -= vec_len;
|
||||
@ -1042,23 +1118,18 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
|
||||
* we know that:
|
||||
*
|
||||
* 1) it's a BVEC iter, we set it up
|
||||
* 2) all bvecs are PAGE_SIZE in size, except potentially the
|
||||
* 2) all bvecs are the same in size, except potentially the
|
||||
* first and last bvec
|
||||
*
|
||||
* So just find our index, and adjust the iterator afterwards.
|
||||
* If the offset is within the first bvec (or the whole first
|
||||
* bvec, just use iov_iter_advance(). This makes it easier
|
||||
* since we can just skip the first segment, which may not
|
||||
* be PAGE_SIZE aligned.
|
||||
* be folio_size aligned.
|
||||
*/
|
||||
const struct bio_vec *bvec = imu->bvec;
|
||||
|
||||
if (offset < bvec->bv_len) {
|
||||
/*
|
||||
* Note, huge pages buffers consists of one large
|
||||
* bvec entry and should always go this way. The other
|
||||
* branch doesn't expect non PAGE_SIZE'd chunks.
|
||||
*/
|
||||
iter->bvec = bvec;
|
||||
iter->count -= offset;
|
||||
iter->iov_offset = offset;
|
||||
@ -1067,14 +1138,104 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
|
||||
|
||||
/* skip first vec */
|
||||
offset -= bvec->bv_len;
|
||||
seg_skip = 1 + (offset >> PAGE_SHIFT);
|
||||
seg_skip = 1 + (offset >> imu->folio_shift);
|
||||
|
||||
iter->bvec = bvec + seg_skip;
|
||||
iter->nr_segs -= seg_skip;
|
||||
iter->count -= bvec->bv_len + offset;
|
||||
iter->iov_offset = offset & ~PAGE_MASK;
|
||||
iter->iov_offset = offset & ~imu->folio_mask;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
|
||||
{
|
||||
struct io_mapped_ubuf **user_bufs;
|
||||
struct io_rsrc_data *data;
|
||||
int i, ret, nbufs;
|
||||
|
||||
/*
|
||||
* Drop our own lock here. We'll setup the data we need and reference
|
||||
* the source buffers, then re-grab, check, and assign at the end.
|
||||
*/
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
|
||||
mutex_lock(&src_ctx->uring_lock);
|
||||
ret = -ENXIO;
|
||||
nbufs = src_ctx->nr_user_bufs;
|
||||
if (!nbufs)
|
||||
goto out_unlock;
|
||||
ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
ret = -ENOMEM;
|
||||
user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
|
||||
if (!user_bufs)
|
||||
goto out_free_data;
|
||||
|
||||
for (i = 0; i < nbufs; i++) {
|
||||
struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
|
||||
|
||||
refcount_inc(&src->refs);
|
||||
user_bufs[i] = src;
|
||||
}
|
||||
|
||||
/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
|
||||
mutex_unlock(&src_ctx->uring_lock);
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
if (!ctx->user_bufs) {
|
||||
ctx->user_bufs = user_bufs;
|
||||
ctx->buf_data = data;
|
||||
ctx->nr_user_bufs = nbufs;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* someone raced setting up buffers, dump ours */
|
||||
for (i = 0; i < nbufs; i++)
|
||||
io_buffer_unmap(ctx, &user_bufs[i]);
|
||||
io_rsrc_data_free(data);
|
||||
kfree(user_bufs);
|
||||
return -EBUSY;
|
||||
out_free_data:
|
||||
io_rsrc_data_free(data);
|
||||
out_unlock:
|
||||
mutex_unlock(&src_ctx->uring_lock);
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the registered buffers from the source ring whose file descriptor
|
||||
* is given in the src_fd to the current ring. This is identical to registering
|
||||
* the buffers with ctx, except faster as mappings already exist.
|
||||
*
|
||||
* Since the memory is already accounted once, don't account it again.
|
||||
*/
|
||||
int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
|
||||
{
|
||||
struct io_uring_copy_buffers buf;
|
||||
bool registered_src;
|
||||
struct file *file;
|
||||
int ret;
|
||||
|
||||
if (ctx->user_bufs || ctx->nr_user_bufs)
|
||||
return -EBUSY;
|
||||
if (copy_from_user(&buf, arg, sizeof(buf)))
|
||||
return -EFAULT;
|
||||
if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
|
||||
return -EINVAL;
|
||||
if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
|
||||
return -EINVAL;
|
||||
|
||||
registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
|
||||
file = io_uring_register_get_file(buf.src_fd, registered_src);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
ret = io_copy_buffers(ctx, file->private_data);
|
||||
if (!registered_src)
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
@ -22,8 +22,6 @@ struct io_rsrc_put {
|
||||
};
|
||||
};
|
||||
|
||||
typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
|
||||
|
||||
struct io_rsrc_data {
|
||||
struct io_ring_ctx *ctx;
|
||||
|
||||
@ -46,10 +44,21 @@ struct io_mapped_ubuf {
|
||||
u64 ubuf;
|
||||
u64 ubuf_end;
|
||||
unsigned int nr_bvecs;
|
||||
unsigned int folio_shift;
|
||||
unsigned long acct_pages;
|
||||
unsigned long folio_mask;
|
||||
refcount_t refs;
|
||||
struct bio_vec bvec[] __counted_by(nr_bvecs);
|
||||
};
|
||||
|
||||
struct io_imu_folio_data {
|
||||
/* Head folio can be partially included in the fixed buf */
|
||||
unsigned int nr_pages_head;
|
||||
/* For non-head/tail folios, has to be fully included */
|
||||
unsigned int nr_pages_mid;
|
||||
unsigned int folio_shift;
|
||||
};
|
||||
|
||||
void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
|
||||
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
|
||||
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
|
||||
@ -59,6 +68,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
|
||||
struct io_mapped_ubuf *imu,
|
||||
u64 buf_addr, size_t len);
|
||||
|
||||
int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg);
|
||||
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
|
||||
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
|
||||
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
|
@ -467,8 +467,7 @@ static void io_req_io_end(struct io_kiocb *req)
|
||||
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
|
||||
{
|
||||
if (unlikely(res != req->cqe.res)) {
|
||||
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
|
||||
io_rw_should_reissue(req)) {
|
||||
if (res == -EAGAIN && io_rw_should_reissue(req)) {
|
||||
/*
|
||||
* Reissue will start accounting again, finish the
|
||||
* current cycle.
|
||||
@ -511,7 +510,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
|
||||
io_req_io_end(req);
|
||||
|
||||
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
|
||||
req->cqe.flags |= io_put_kbuf(req, 0);
|
||||
req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0);
|
||||
|
||||
io_req_rw_cleanup(req, 0);
|
||||
io_req_task_complete(req, ts);
|
||||
@ -593,7 +592,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
|
||||
*/
|
||||
io_req_io_end(req);
|
||||
io_req_set_res(req, final_ret,
|
||||
io_put_kbuf(req, issue_flags));
|
||||
io_put_kbuf(req, ret, issue_flags));
|
||||
io_req_rw_cleanup(req, issue_flags);
|
||||
return IOU_OK;
|
||||
}
|
||||
@ -855,6 +854,14 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
|
||||
|
||||
ret = io_iter_do_read(rw, &io->iter);
|
||||
|
||||
/*
|
||||
* Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT
|
||||
* issue, even though they should be returning -EAGAIN. To be safe,
|
||||
* retry from blocking context for either.
|
||||
*/
|
||||
if (ret == -EOPNOTSUPP && force_nonblock)
|
||||
ret = -EAGAIN;
|
||||
|
||||
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
|
||||
req->flags &= ~REQ_F_REISSUE;
|
||||
/* If we can poll, just do that. */
|
||||
@ -975,7 +982,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
|
||||
* Put our buffer and post a CQE. If we fail to post a CQE, then
|
||||
* jump to the termination path. This request is then done.
|
||||
*/
|
||||
cflags = io_put_kbuf(req, issue_flags);
|
||||
cflags = io_put_kbuf(req, ret, issue_flags);
|
||||
rw->len = 0; /* similarly to above, reset len to 0 */
|
||||
|
||||
if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
|
||||
@ -1167,7 +1174,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
|
||||
if (!smp_load_acquire(&req->iopoll_completed))
|
||||
break;
|
||||
nr_events++;
|
||||
req->cqe.flags = io_put_kbuf(req, 0);
|
||||
req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0);
|
||||
if (req->opcode != IORING_OP_URING_CMD)
|
||||
io_req_rw_cleanup(req, 0);
|
||||
}
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/audit.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
@ -176,7 +177,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
|
||||
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
|
||||
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
|
||||
|
||||
if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
|
||||
if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
|
||||
const struct cred *creds = NULL;
|
||||
|
||||
if (ctx->sq_creds != current_cred())
|
||||
@ -460,10 +461,12 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
|
||||
return 0;
|
||||
|
||||
if (p->flags & IORING_SETUP_SQ_AFF) {
|
||||
struct cpumask allowed_mask;
|
||||
int cpu = p->sq_thread_cpu;
|
||||
|
||||
ret = -EINVAL;
|
||||
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
|
||||
cpuset_cpus_allowed(current, &allowed_mask);
|
||||
if (!cpumask_test_cpu(cpu, &allowed_mask))
|
||||
goto err_sqpoll;
|
||||
sqd->sq_cpu = cpu;
|
||||
} else {
|
||||
|
Loading…
Reference in New Issue
Block a user