for-6.12/io_uring-20240913

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmbkST4QHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpnU7D/47BmxQmTbsT9NFBeZrQVgmQ2Zap2WWx3Za
 4qGuU1VxcafztqWnRChtxznheVG9ioHglcxfbZjc/D4/BiffgF4n5Z48qh1c0t8O
 +2pwq75j0WyJkHH9wCrrN9Jq8zSB6pBr2sMEQmSilMgYZKMzhXrXevKkYnthj/1a
 7U9QzY+lfc8neZRHR7VDouPWIRjBhwaO62ANXWCL7F2uE6NQasU61x6YTzGuoDB3
 0gR5PbSiLIusGxsYqIVmQUPNBUOw8nOzXXcbw8kBlRdnpadns8rNk+ivIMtAYw0m
 s6xVWNWFToVxO8956rBnjicD6ZzF5Txe6gWC6gvhKMFkOyxkihgMCOZUpSmw6D8G
 YlDHB4+lijpQMyPDw1UUPOYPVGSVRp/f2MuRcEhW/Yums5vd9eOVrUVsFjfYRQLr
 fg+lp3rEMoHxBnuKneMY2inuZW99+LGyO8F4IVublwXoXKFcq3TdGCvn5OZUBGDn
 E5x4QGq+cf9icK4kqN5mVi256fhOLnqDTtzIg4qiwhZ5h9UA3CFjGc56G7wqgp8d
 Bu5scCkJR5tXJEZA1hce+w2bXzrM6Xd2gym5A6D6k8S3QheHkKva60/qfIzhs/x0
 6nlJYSlznyQbDOBDQIJC86OE4tcShNusjFIgIDg6ZvAX2qk7BBmbPNF4RGrI9TTM
 xz2dONRhlA==
 =ZNjL
 -----END PGP SIGNATURE-----

Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - NAPI fixes and cleanups (Pavel, Olivier)

 - Add support for absolute timeouts (Pavel); see the first sketch after
   this list

 - Fixes for io-wq/sqpoll affinities (Felix)

 - Efficiency improvements for dealing with huge pages (Chenliang)

 - Support for a minwait mode, where the application essentially has two
   timeouts - a smaller one that defines the batch timeout, and the
   overall larger one similar to what we had before. This enables
   efficient use of batching based on count + timeout, while still
   working well with periods of less intensive workloads. A sketch of
   the new wait argument follows this list

 - Use ITER_UBUF for single segment sends

 - Add support for incremental buffer consumption. Right now each
   operation will always consume a full buffer. With incremental
   consumption, a recv/read operation only consumes the part of the
   buffer that it needs to satisfy the operation (see the buffer ring
   sketch after this list)

 - Add support for GCOV for io_uring, to help retain a high test-to-code
   coverage ratio

 - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly
   converted to a blocking retry

 - Add support for cloning registered buffers from one ring to another
   (see the last sketch after this list)

 - Misc cleanups (Anuj, me)
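
The sketches below illustrate the new userspace-facing pieces, using only
the uAPI additions from this pull (raw syscalls against the 6.12 headers,
no liburing wrappers). Helper names are invented for illustration and
error handling is mostly omitted; treat them as sketches rather than
reference code.

First, absolute timeouts against a registered clock, combining
IORING_REGISTER_CLOCK with IORING_ENTER_ABS_TIMER:

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Wait for 'want' completions until the absolute time 'abs_ts', measured
 * against CLOCK_BOOTTIME instead of the default CLOCK_MONOTONIC. */
static long wait_until(int ring_fd, unsigned int want,
		       struct __kernel_timespec *abs_ts)
{
	struct io_uring_clock_register creg = { .clockid = CLOCK_BOOTTIME };
	struct io_uring_getevents_arg arg = {
		.ts = (unsigned long long)(uintptr_t)abs_ts,
	};

	/* switch the clock used for waits; __resv must stay zeroed */
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_CLOCK,
		    &creg, 0) < 0)
		return -1;

	/* with ABS_TIMER, 'ts' is an absolute time on the registered clock */
	return syscall(__NR_io_uring_enter, ring_fd, 0, want,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG |
		       IORING_ENTER_ABS_TIMER,
		       &arg, sizeof(arg));
}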
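
Second, the minwait mode: min_wait_usec in the extended getevents argument
sets the short batch timeout, while the regular timespec still caps the
overall wait. The field is only usable on kernels advertising
IORING_FEAT_MIN_TIMEOUT (it used to be a reserved pad that had to be zero):

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Wait up to 'batch_usec' microseconds for a full batch of 'want'
 * completions; once that short timeout has passed, return as soon as at
 * least one completion is available, bounded overall by 'ts'. */
static long wait_batch(int ring_fd, unsigned int want, unsigned int batch_usec,
		       struct __kernel_timespec *ts)
{
	struct io_uring_getevents_arg arg = {
		.min_wait_usec	= batch_usec,
		.ts		= (unsigned long long)(uintptr_t)ts,
	};

	return syscall(__NR_io_uring_enter, ring_fd, 0, want,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}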
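
Third, incremental buffer consumption: the provided buffer ring is
registered with IOU_PBUF_RING_INC and a single large backing buffer is
published into it; completions that only partially consume that buffer
carry IORING_CQE_F_BUF_MORE. This sketch assumes a page-aligned backing
buffer and a power-of-two entry count supplied by the caller:

#include <linux/io_uring.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register buffer group 'bgid' as an incrementally consumed ring and
 * publish one large backing buffer into it. */
static struct io_uring_buf_ring *setup_inc_ring(int ring_fd, unsigned int entries,
						int bgid, void *backing,
						unsigned int backing_len)
{
	size_t ring_size = entries * sizeof(struct io_uring_buf);
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg = { };

	br = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
		  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (br == MAP_FAILED)
		return NULL;

	reg.ring_addr = (unsigned long long)(uintptr_t)br;
	reg.ring_entries = entries;		/* must be a power of two */
	reg.bgid = bgid;
	reg.flags = IOU_PBUF_RING_INC;		/* incremental consumption */

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING,
		    &reg, 1) < 0) {
		munmap(br, ring_size);
		return NULL;
	}

	/* Publish one big buffer; the kernel hands out slices of it and sets
	 * IORING_CQE_F_BUF_MORE in CQEs while part of it remains in use. */
	br->bufs[0].addr = (unsigned long long)(uintptr_t)backing;
	br->bufs[0].len = backing_len;
	br->bufs[0].bid = 0;
	__atomic_store_n(&br->tail, 1, __ATOMIC_RELEASE);

	return br;
}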
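
Last, cloning registered buffers from a source ring into the current ring
via IORING_REGISTER_COPY_BUFFERS; setting IORING_REGISTER_SRC_REGISTERED
would mark src_fd as a registered ring index rather than a regular file
descriptor:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Copy the buffers registered on 'src_ring_fd' into 'dst_ring_fd'. */
static long clone_registered_buffers(int dst_ring_fd, int src_ring_fd)
{
	struct io_uring_copy_buffers buf = {
		.src_fd	= src_ring_fd,
		.flags	= 0,	/* or IORING_REGISTER_SRC_REGISTERED */
	};

	return syscall(__NR_io_uring_register, dst_ring_fd,
		       IORING_REGISTER_COPY_BUFFERS, &buf, 1);
}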

* tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits)
  io_uring: add IORING_REGISTER_COPY_BUFFERS method
  io_uring/register: provide helper to get io_ring_ctx from 'fd'
  io_uring/rsrc: add reference count to struct io_mapped_ubuf
  io_uring/rsrc: clear 'slot' entry upfront
  io_uring/io-wq: inherit cpuset of cgroup in io worker
  io_uring/io-wq: do not allow pinning outside of cpuset
  io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common()
  io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN
  io_uring/sqpoll: do not allow pinning outside of cpuset
  io_uring/eventfd: move refs to refcount_t
  io_uring: remove unused rsrc_put_fn
  io_uring: add new line after variable declaration
  io_uring: add GCOV_PROFILE_URING Kconfig option
  io_uring/kbuf: add support for incremental buffer consumption
  io_uring/kbuf: pass in 'len' argument for buffer commit
  Revert "io_uring: Require zeroed sqe->len on provided-buffers send"
  io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h
  io_uring/kbuf: add io_kbuf_commit() helper
  io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg
  io_uring: wire up min batch wake timeout
  ...
Linus Torvalds 2024-09-16 13:29:00 +02:00
commit 3a4d319a8f
20 changed files with 725 additions and 262 deletions


@ -239,6 +239,9 @@ struct io_ring_ctx {
struct io_rings *rings; struct io_rings *rings;
struct percpu_ref refs; struct percpu_ref refs;
clockid_t clockid;
enum tk_offsets clock_offset;
enum task_work_notify_mode notify_method; enum task_work_notify_mode notify_method;
unsigned sq_thread_idle; unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;


@ -440,11 +440,21 @@ struct io_uring_cqe {
* IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
* IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct * IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct
* them from sends. * them from sends.
* IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
* more completions. In other words, the buffer is being
* partially consumed, and will be used by the kernel for
* more completions. This is only set for buffers used via
* the incremental buffer consumption, as provided by
* a ring buffer setup with IOU_PBUF_RING_INC. For any
* other provided buffer type, all completions with a
* buffer passed back is automatically returned to the
* application.
*/ */
#define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_MORE (1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
#define IORING_CQE_F_NOTIF (1U << 3) #define IORING_CQE_F_NOTIF (1U << 3)
#define IORING_CQE_F_BUF_MORE (1U << 4)
#define IORING_CQE_BUFFER_SHIFT 16 #define IORING_CQE_BUFFER_SHIFT 16
@ -507,6 +517,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_SQ_WAIT (1U << 2) #define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3) #define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4) #define IORING_ENTER_REGISTERED_RING (1U << 4)
#define IORING_ENTER_ABS_TIMER (1U << 5)
/* /*
* Passed in for io_uring_setup(2). Copied back with updated info on success * Passed in for io_uring_setup(2). Copied back with updated info on success
@ -542,6 +553,7 @@ struct io_uring_params {
#define IORING_FEAT_LINKED_FILE (1U << 12) #define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13) #define IORING_FEAT_REG_REG_RING (1U << 13)
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
/* /*
* io_uring_register(2) opcodes and arguments * io_uring_register(2) opcodes and arguments
@ -595,6 +607,11 @@ enum io_uring_register_op {
IORING_REGISTER_NAPI = 27, IORING_REGISTER_NAPI = 27,
IORING_UNREGISTER_NAPI = 28, IORING_UNREGISTER_NAPI = 28,
IORING_REGISTER_CLOCK = 29,
/* copy registered buffers from source ring to current ring */
IORING_REGISTER_COPY_BUFFERS = 30,
/* this goes last */ /* this goes last */
IORING_REGISTER_LAST, IORING_REGISTER_LAST,
@ -675,6 +692,21 @@ struct io_uring_restriction {
__u32 resv2[3]; __u32 resv2[3];
}; };
struct io_uring_clock_register {
__u32 clockid;
__u32 __resv[3];
};
enum {
IORING_REGISTER_SRC_REGISTERED = 1,
};
struct io_uring_copy_buffers {
__u32 src_fd;
__u32 flags;
__u32 pad[6];
};
struct io_uring_buf { struct io_uring_buf {
__u64 addr; __u64 addr;
__u32 len; __u32 len;
@ -707,9 +739,17 @@ struct io_uring_buf_ring {
* mmap(2) with the offset set as: * mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring. * to get a virtual mapping for the ring.
* IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be
* consumed incrementally. Normally one (or more) buffers
* are fully consumed. With incremental consumptions, it's
* feasible to register big ranges of buffers, and each
* use of it will consume only as much as it needs. This
* requires that both the kernel and application keep
* track of where the current read/recv index is at.
*/ */
enum io_uring_register_pbuf_ring_flags { enum io_uring_register_pbuf_ring_flags {
IOU_PBUF_RING_MMAP = 1, IOU_PBUF_RING_MMAP = 1,
IOU_PBUF_RING_INC = 2,
}; };
/* argument for IORING_(UN)REGISTER_PBUF_RING */ /* argument for IORING_(UN)REGISTER_PBUF_RING */
@ -758,7 +798,7 @@ enum io_uring_register_restriction_op {
struct io_uring_getevents_arg { struct io_uring_getevents_arg {
__u64 sigmask; __u64 sigmask;
__u32 sigmask_sz; __u32 sigmask_sz;
__u32 pad; __u32 min_wait_usec;
__u64 ts; __u64 ts;
}; };


@ -1687,6 +1687,19 @@ config IO_URING
applications to submit and complete IO through submission and applications to submit and complete IO through submission and
completion rings that are shared between the kernel and application. completion rings that are shared between the kernel and application.
config GCOV_PROFILE_URING
bool "Enable GCOV profiling on the io_uring subsystem"
depends on GCOV_KERNEL
help
Enable GCOV profiling on the io_uring subsystem, to facilitate
code coverage testing.
If unsure, say N.
Note that this will have a negative impact on the performance of
the io_uring subsystem, hence this should only be enabled for
specific test purposes.
config ADVISE_SYSCALLS config ADVISE_SYSCALLS
bool "Enable madvise/fadvise syscalls" if EXPERT bool "Enable madvise/fadvise syscalls" if EXPERT
default y default y


@ -2,6 +2,10 @@
# #
# Makefile for io_uring # Makefile for io_uring
ifdef CONFIG_GCOV_PROFILE_URING
GCOV_PROFILE := y
endif
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
tctx.o filetable.o rw.o net.o poll.o \ tctx.o filetable.o rw.o net.o poll.o \
eventfd.o uring_cmd.o openclose.o \ eventfd.o uring_cmd.o openclose.o \


@ -15,7 +15,7 @@ struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd; struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1; unsigned int eventfd_async: 1;
struct rcu_head rcu; struct rcu_head rcu;
atomic_t refs; refcount_t refs;
atomic_t ops; atomic_t ops;
}; };
@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
if (atomic_dec_and_test(&ev_fd->refs)) if (refcount_dec_and_test(&ev_fd->refs))
io_eventfd_free(rcu); io_eventfd_free(rcu);
} }
@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
*/ */
if (unlikely(!ev_fd)) if (unlikely(!ev_fd))
return; return;
if (!atomic_inc_not_zero(&ev_fd->refs)) if (!refcount_inc_not_zero(&ev_fd->refs))
return; return;
if (ev_fd->eventfd_async && !io_wq_current_is_worker()) if (ev_fd->eventfd_async && !io_wq_current_is_worker())
goto out; goto out;
@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
} }
} }
out: out:
if (atomic_dec_and_test(&ev_fd->refs)) if (refcount_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free); call_rcu(&ev_fd->rcu, io_eventfd_free);
} }
@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) { if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd); int ret = PTR_ERR(ev_fd->cq_ev_fd);
kfree(ev_fd); kfree(ev_fd);
return ret; return ret;
} }
@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
ev_fd->eventfd_async = eventfd_async; ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true; ctx->has_evfd = true;
atomic_set(&ev_fd->refs, 1); refcount_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0); atomic_set(&ev_fd->ops, 0);
rcu_assign_pointer(ctx->io_ev_fd, ev_fd); rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0; return 0;
@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
if (ev_fd) { if (ev_fd) {
ctx->has_evfd = false; ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL); rcu_assign_pointer(ctx->io_ev_fd, NULL);
if (atomic_dec_and_test(&ev_fd->refs)) if (refcount_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free); call_rcu(&ev_fd->rcu, io_eventfd_free);
return 0; return 0;
} }


@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
cqe->user_data, cqe->res, cqe->flags); cqe->user_data, cqe->res, cqe->flags);
} }
spin_unlock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock);
#ifdef CONFIG_NET_RX_BUSY_POLL
if (ctx->napi_enabled) {
seq_puts(m, "NAPI:\tenabled\n");
seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
if (ctx->napi_prefer_busy_poll)
seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
else
seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
} else {
seq_puts(m, "NAPI:\tdisabled\n");
}
#endif
} }
#endif #endif


@ -13,6 +13,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/rculist_nulls.h> #include <linux/rculist_nulls.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/task_work.h> #include <linux/task_work.h>
#include <linux/audit.h> #include <linux/audit.h>
#include <linux/mmu_context.h> #include <linux/mmu_context.h>
@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
goto err; goto err;
cpumask_copy(wq->cpu_mask, cpu_possible_mask); cpuset_cpus_allowed(data->task, wq->cpu_mask);
wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC); task_rlimit(current, RLIMIT_NPROC);
@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
{ {
cpumask_var_t allowed_mask;
int ret = 0;
if (!tctx || !tctx->io_wq) if (!tctx || !tctx->io_wq)
return -EINVAL; return -EINVAL;
if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
return -ENOMEM;
rcu_read_lock(); rcu_read_lock();
if (mask) cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask);
if (mask) {
if (cpumask_subset(mask, allowed_mask))
cpumask_copy(tctx->io_wq->cpu_mask, mask); cpumask_copy(tctx->io_wq->cpu_mask, mask);
else else
cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); ret = -EINVAL;
} else {
cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask);
}
rcu_read_unlock(); rcu_read_unlock();
return 0; free_cpumask_var(allowed_mask);
return ret;
} }
/* /*


@ -904,7 +904,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
lockdep_assert_held(&req->ctx->uring_lock); lockdep_assert_held(&req->ctx->uring_lock);
req_set_fail(req); req_set_fail(req);
io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED));
if (def->fail) if (def->fail)
def->fail(req); def->fail(req);
io_req_complete_defer(req); io_req_complete_defer(req);
@ -2350,12 +2350,113 @@ static bool current_pending_io(void)
return percpu_counter_read_positive(&tctx->inflight); return percpu_counter_read_positive(&tctx->inflight);
} }
/* when returns >0, the caller should retry */ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{ {
int ret; struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
WRITE_ONCE(iowq->hit_timeout, 1);
iowq->min_timeout = 0;
wake_up_process(iowq->wq.private);
return HRTIMER_NORESTART;
}
/*
* Doing min_timeout portion. If we saw any timeouts, events, or have work,
* wake up. If not, and we have a normal timeout, switch to that and keep
* sleeping.
*/
static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
{
struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
struct io_ring_ctx *ctx = iowq->ctx;
/* no general timeout, or shorter (or equal), we are done */
if (iowq->timeout == KTIME_MAX ||
ktime_compare(iowq->min_timeout, iowq->timeout) >= 0)
goto out_wake;
/* work we may need to run, wake function will see if we need to wake */
if (io_has_work(ctx))
goto out_wake;
/* got events since we started waiting, min timeout is done */
if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
goto out_wake;
/* if we have any events and min timeout expired, we're done */
if (io_cqring_events(ctx))
goto out_wake;
/*
* If using deferred task_work running and application is waiting on
* more than one request, ensure we reset it now where we are switching
* to normal sleeps. Any request completion post min_wait should wake
* the task and return.
*/
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
atomic_set(&ctx->cq_wait_nr, 1);
smp_mb();
if (!llist_empty(&ctx->work_llist))
goto out_wake;
}
iowq->t.function = io_cqring_timer_wakeup;
hrtimer_set_expires(timer, iowq->timeout);
return HRTIMER_RESTART;
out_wake:
return io_cqring_timer_wakeup(timer);
}
static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
clockid_t clock_id, ktime_t start_time)
{
ktime_t timeout;
hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
if (iowq->min_timeout) {
timeout = ktime_add_ns(iowq->min_timeout, start_time);
iowq->t.function = io_cqring_min_timer_wakeup;
} else {
timeout = iowq->timeout;
iowq->t.function = io_cqring_timer_wakeup;
}
hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
if (!READ_ONCE(iowq->hit_timeout))
schedule();
hrtimer_cancel(&iowq->t);
destroy_hrtimer_on_stack(&iowq->t);
__set_current_state(TASK_RUNNING);
return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
}
static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
ktime_t start_time)
{
int ret = 0;
/*
* Mark us as being in io_wait if we have pending requests, so cpufreq
* can take into account that the task is waiting for IO - turns out
* to be important for low QD IO.
*/
if (current_pending_io())
current->in_iowait = 1;
if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
else
schedule();
current->in_iowait = 0;
return ret;
}
/* If this returns > 0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
ktime_t start_time)
{
if (unlikely(READ_ONCE(ctx->check_cq))) if (unlikely(READ_ONCE(ctx->check_cq)))
return 1; return 1;
if (unlikely(!llist_empty(&ctx->work_llist))) if (unlikely(!llist_empty(&ctx->work_llist)))
@ -2367,32 +2468,26 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (unlikely(io_should_wake(iowq))) if (unlikely(io_should_wake(iowq)))
return 0; return 0;
/* return __io_cqring_wait_schedule(ctx, iowq, start_time);
* Mark us as being in io_wait if we have pending requests, so cpufreq
* can take into account that the task is waiting for IO - turns out
* to be important for low QD IO.
*/
if (current_pending_io())
current->in_iowait = 1;
ret = 0;
if (iowq->timeout == KTIME_MAX)
schedule();
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
ret = -ETIME;
current->in_iowait = 0;
return ret;
} }
struct ext_arg {
size_t argsz;
struct __kernel_timespec __user *ts;
const sigset_t __user *sig;
ktime_t min_time;
};
/* /*
* Wait until events become available, if we don't already have some. The * Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring. * application must reap them itself, as they reside on the shared cq ring.
*/ */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
const sigset_t __user *sig, size_t sigsz, struct ext_arg *ext_arg)
struct __kernel_timespec __user *uts)
{ {
struct io_wait_queue iowq; struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings; struct io_rings *rings = ctx->rings;
ktime_t start_time;
int ret; int ret;
if (!io_allowed_run_tw(ctx)) if (!io_allowed_run_tw(ctx))
@ -2410,30 +2505,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.wq.private = current; iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry); INIT_LIST_HEAD(&iowq.wq.entry);
iowq.ctx = ctx; iowq.ctx = ctx;
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
iowq.hit_timeout = 0;
iowq.min_timeout = ext_arg->min_time;
iowq.timeout = KTIME_MAX; iowq.timeout = KTIME_MAX;
start_time = io_get_time(ctx);
if (uts) { if (ext_arg->ts) {
struct timespec64 ts; struct timespec64 ts;
ktime_t dt;
if (get_timespec64(&ts, uts)) if (get_timespec64(&ts, ext_arg->ts))
return -EFAULT; return -EFAULT;
dt = timespec64_to_ktime(ts); iowq.timeout = timespec64_to_ktime(ts);
iowq.timeout = ktime_add(dt, ktime_get()); if (!(flags & IORING_ENTER_ABS_TIMER))
io_napi_adjust_timeout(ctx, &iowq, dt); iowq.timeout = ktime_add(iowq.timeout, start_time);
} }
if (sig) { if (ext_arg->sig) {
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
if (in_compat_syscall()) if (in_compat_syscall())
ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig,
sigsz); ext_arg->argsz);
else else
#endif #endif
ret = set_user_sigmask(sig, sigsz); ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz);
if (ret) if (ret)
return ret; return ret;
@ -2443,8 +2541,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
trace_io_uring_cqring_wait(ctx, min_events); trace_io_uring_cqring_wait(ctx, min_events);
do { do {
int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
unsigned long check_cq; unsigned long check_cq;
int nr_wait;
/* if min timeout has been hit, don't reset wait count */
if (!iowq.hit_timeout)
nr_wait = (int) iowq.cq_tail -
READ_ONCE(ctx->rings->cq.tail);
else
nr_wait = 1;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
atomic_set(&ctx->cq_wait_nr, nr_wait); atomic_set(&ctx->cq_wait_nr, nr_wait);
@ -2454,7 +2559,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
TASK_INTERRUPTIBLE); TASK_INTERRUPTIBLE);
} }
ret = io_cqring_wait_schedule(ctx, &iowq); ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
@ -3112,9 +3217,8 @@ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t a
return 0; return 0;
} }
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, static int io_get_ext_arg(unsigned flags, const void __user *argp,
struct __kernel_timespec __user **ts, struct ext_arg *ext_arg)
const sigset_t __user **sig)
{ {
struct io_uring_getevents_arg arg; struct io_uring_getevents_arg arg;
@ -3123,8 +3227,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
* is just a pointer to the sigset_t. * is just a pointer to the sigset_t.
*/ */
if (!(flags & IORING_ENTER_EXT_ARG)) { if (!(flags & IORING_ENTER_EXT_ARG)) {
*sig = (const sigset_t __user *) argp; ext_arg->sig = (const sigset_t __user *) argp;
*ts = NULL; ext_arg->ts = NULL;
return 0; return 0;
} }
@ -3132,15 +3236,14 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
* EXT_ARG is set - ensure we agree on the size of it and copy in our * EXT_ARG is set - ensure we agree on the size of it and copy in our
* timespec and sigset_t pointers if good. * timespec and sigset_t pointers if good.
*/ */
if (*argsz != sizeof(arg)) if (ext_arg->argsz != sizeof(arg))
return -EINVAL; return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg))) if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT; return -EFAULT;
if (arg.pad) ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC;
return -EINVAL; ext_arg->sig = u64_to_user_ptr(arg.sigmask);
*sig = u64_to_user_ptr(arg.sigmask); ext_arg->argsz = arg.sigmask_sz;
*argsz = arg.sigmask_sz; ext_arg->ts = u64_to_user_ptr(arg.ts);
*ts = u64_to_user_ptr(arg.ts);
return 0; return 0;
} }
@ -3154,7 +3257,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
IORING_ENTER_REGISTERED_RING))) IORING_ENTER_REGISTERED_RING |
IORING_ENTER_ABS_TIMER)))
return -EINVAL; return -EINVAL;
/* /*
@ -3245,15 +3349,14 @@ iopoll_locked:
} }
mutex_unlock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock);
} else { } else {
const sigset_t __user *sig; struct ext_arg ext_arg = { .argsz = argsz };
struct __kernel_timespec __user *ts;
ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); ret2 = io_get_ext_arg(flags, argp, &ext_arg);
if (likely(!ret2)) { if (likely(!ret2)) {
min_complete = min(min_complete, min_complete = min(min_complete,
ctx->cq_entries); ctx->cq_entries);
ret2 = io_cqring_wait(ctx, min_complete, sig, ret2 = io_cqring_wait(ctx, min_complete, flags,
argsz, ts); &ext_arg);
} }
} }
@ -3424,6 +3527,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
if (!ctx) if (!ctx)
return -ENOMEM; return -ENOMEM;
ctx->clockid = CLOCK_MONOTONIC;
ctx->clock_offset = 0;
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
!(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_IOPOLL) &&
!(ctx->flags & IORING_SETUP_SQPOLL)) !(ctx->flags & IORING_SETUP_SQPOLL))
@ -3535,7 +3641,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
IORING_FEAT_RECVSEND_BUNDLE; IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
if (copy_to_user(params, p, sizeof(*p))) { if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT; ret = -EFAULT;


@ -39,8 +39,12 @@ struct io_wait_queue {
struct wait_queue_entry wq; struct wait_queue_entry wq;
struct io_ring_ctx *ctx; struct io_ring_ctx *ctx;
unsigned cq_tail; unsigned cq_tail;
unsigned cq_min_tail;
unsigned nr_timeouts; unsigned nr_timeouts;
int hit_timeout;
ktime_t min_timeout;
ktime_t timeout; ktime_t timeout;
struct hrtimer t;
#ifdef CONFIG_NET_RX_BUSY_POLL #ifdef CONFIG_NET_RX_BUSY_POLL
ktime_t napi_busy_poll_dt; ktime_t napi_busy_poll_dt;
@ -437,6 +441,14 @@ static inline bool io_file_can_poll(struct io_kiocb *req)
return false; return false;
} }
static inline ktime_t io_get_time(struct io_ring_ctx *ctx)
{
if (ctx->clockid == CLOCK_MONOTONIC)
return ktime_get();
return ktime_get_with_offset(ctx->clock_offset);
}
enum { enum {
IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_OVERFLOW_BIT,
IO_CHECK_CQ_DROPPED_BIT, IO_CHECK_CQ_DROPPED_BIT,


@ -70,7 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
return true; return true;
} }
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags)
{ {
/* /*
* We can add this buffer back to two lists: * We can add this buffer back to two lists:
@ -88,12 +88,12 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock); spin_lock(&ctx->completion_lock);
__io_put_kbuf_list(req, &ctx->io_buffers_comp); __io_put_kbuf_list(req, len, &ctx->io_buffers_comp);
spin_unlock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock);
} else { } else {
lockdep_assert_held(&req->ctx->uring_lock); lockdep_assert_held(&req->ctx->uring_lock);
__io_put_kbuf_list(req, &req->ctx->io_buffers_cache); __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache);
} }
} }
@ -132,12 +132,6 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
return 1; return 1;
} }
static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br,
__u16 head, __u16 mask)
{
return &br->bufs[head & mask];
}
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl, struct io_buffer_list *bl,
unsigned int issue_flags) unsigned int issue_flags)
@ -171,9 +165,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
* the transfer completes (or if we get -EAGAIN and must poll of * the transfer completes (or if we get -EAGAIN and must poll of
* retry). * retry).
*/ */
req->flags &= ~REQ_F_BUFFERS_COMMIT; io_kbuf_commit(req, bl, *len, 1);
req->buf_list = NULL; req->buf_list = NULL;
bl->head++;
} }
return u64_to_user_ptr(buf->addr); return u64_to_user_ptr(buf->addr);
} }
@ -189,7 +182,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
bl = io_buffer_get_list(ctx, req->buf_index); bl = io_buffer_get_list(ctx, req->buf_index);
if (likely(bl)) { if (likely(bl)) {
if (bl->is_buf_ring) if (bl->flags & IOBL_BUF_RING)
ret = io_ring_buffer_select(req, len, bl, issue_flags); ret = io_ring_buffer_select(req, len, bl, issue_flags);
else else
ret = io_provided_buffer_select(req, len, bl); ret = io_provided_buffer_select(req, len, bl);
@ -219,15 +212,26 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
buf = io_ring_head_to_buf(br, head, bl->mask); buf = io_ring_head_to_buf(br, head, bl->mask);
if (arg->max_len) { if (arg->max_len) {
u32 len = READ_ONCE(buf->len); u32 len = READ_ONCE(buf->len);
size_t needed;
if (unlikely(!len)) if (unlikely(!len))
return -ENOBUFS; return -ENOBUFS;
/*
* Limit incremental buffers to 1 segment. No point trying
* to peek ahead and map more than we need, when the buffers
* themselves should be large when setup with
* IOU_PBUF_RING_INC.
*/
if (bl->flags & IOBL_INC) {
nr_avail = 1;
} else {
size_t needed;
needed = (arg->max_len + len - 1) / len; needed = (arg->max_len + len - 1) / len;
needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
if (nr_avail > needed) if (nr_avail > needed)
nr_avail = needed; nr_avail = needed;
} }
}
/* /*
* only alloc a bigger array if we know we have data to map, eg not * only alloc a bigger array if we know we have data to map, eg not
@ -251,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
req->buf_index = buf->bid; req->buf_index = buf->bid;
do { do {
/* truncate end piece, if needed */ u32 len = buf->len;
if (buf->len > arg->max_len)
buf->len = arg->max_len; /* truncate end piece, if needed, for non partial buffers */
if (len > arg->max_len) {
len = arg->max_len;
if (!(bl->flags & IOBL_INC))
buf->len = len;
}
iov->iov_base = u64_to_user_ptr(buf->addr); iov->iov_base = u64_to_user_ptr(buf->addr);
iov->iov_len = buf->len; iov->iov_len = len;
iov++; iov++;
arg->out_len += buf->len; arg->out_len += len;
arg->max_len -= buf->len; arg->max_len -= len;
if (!arg->max_len) if (!arg->max_len)
break; break;
@ -287,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
if (unlikely(!bl)) if (unlikely(!bl))
goto out_unlock; goto out_unlock;
if (bl->is_buf_ring) { if (bl->flags & IOBL_BUF_RING) {
ret = io_ring_buffers_peek(req, arg, bl); ret = io_ring_buffers_peek(req, arg, bl);
/* /*
* Don't recycle these buffers if we need to go through poll. * Don't recycle these buffers if we need to go through poll.
@ -297,8 +306,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
* committed them, they cannot be put back in the queue. * committed them, they cannot be put back in the queue.
*/ */
if (ret > 0) { if (ret > 0) {
req->flags |= REQ_F_BL_NO_RECYCLE; req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
req->buf_list->head += ret; io_kbuf_commit(req, bl, arg->out_len, ret);
} }
} else { } else {
ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
@ -320,7 +329,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
if (unlikely(!bl)) if (unlikely(!bl))
return -ENOENT; return -ENOENT;
if (bl->is_buf_ring) { if (bl->flags & IOBL_BUF_RING) {
ret = io_ring_buffers_peek(req, arg, bl); ret = io_ring_buffers_peek(req, arg, bl);
if (ret > 0) if (ret > 0)
req->flags |= REQ_F_BUFFERS_COMMIT; req->flags |= REQ_F_BUFFERS_COMMIT;
@ -340,22 +349,22 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs) if (!nbufs)
return 0; return 0;
if (bl->is_buf_ring) { if (bl->flags & IOBL_BUF_RING) {
i = bl->buf_ring->tail - bl->head; i = bl->buf_ring->tail - bl->head;
if (bl->buf_nr_pages) { if (bl->buf_nr_pages) {
int j; int j;
if (!bl->is_mmap) { if (!(bl->flags & IOBL_MMAP)) {
for (j = 0; j < bl->buf_nr_pages; j++) for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]); unpin_user_page(bl->buf_pages[j]);
} }
io_pages_unmap(bl->buf_ring, &bl->buf_pages, io_pages_unmap(bl->buf_ring, &bl->buf_pages,
&bl->buf_nr_pages, bl->is_mmap); &bl->buf_nr_pages, bl->flags & IOBL_MMAP);
bl->is_mmap = 0; bl->flags &= ~IOBL_MMAP;
} }
/* make sure it's seen as empty */ /* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list); INIT_LIST_HEAD(&bl->buf_list);
bl->is_buf_ring = 0; bl->flags &= ~IOBL_BUF_RING;
return i; return i;
} }
@ -442,7 +451,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (bl) { if (bl) {
ret = -EINVAL; ret = -EINVAL;
/* can't use provide/remove buffers command on mapped buffers */ /* can't use provide/remove buffers command on mapped buffers */
if (!bl->is_buf_ring) if (!(bl->flags & IOBL_BUF_RING))
ret = __io_remove_buffers(ctx, bl, p->nbufs); ret = __io_remove_buffers(ctx, bl, p->nbufs);
} }
io_ring_submit_unlock(ctx, issue_flags); io_ring_submit_unlock(ctx, issue_flags);
@ -589,7 +598,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
} }
} }
/* can't add buffers via this command for a mapped buffer ring */ /* can't add buffers via this command for a mapped buffer ring */
if (bl->is_buf_ring) { if (bl->flags & IOBL_BUF_RING) {
ret = -EINVAL; ret = -EINVAL;
goto err; goto err;
} }
@ -641,8 +650,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
bl->buf_pages = pages; bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages; bl->buf_nr_pages = nr_pages;
bl->buf_ring = br; bl->buf_ring = br;
bl->is_buf_ring = 1; bl->flags |= IOBL_BUF_RING;
bl->is_mmap = 0; bl->flags &= ~IOBL_MMAP;
return 0; return 0;
error_unpin: error_unpin:
unpin_user_pages(pages, nr_pages); unpin_user_pages(pages, nr_pages);
@ -665,8 +674,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
return -ENOMEM; return -ENOMEM;
} }
bl->is_buf_ring = 1; bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
bl->is_mmap = 1;
return 0; return 0;
} }
@ -683,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.resv[0] || reg.resv[1] || reg.resv[2]) if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL; return -EINVAL;
if (reg.flags & ~IOU_PBUF_RING_MMAP) if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL; return -EINVAL;
if (!(reg.flags & IOU_PBUF_RING_MMAP)) { if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (!reg.ring_addr) if (!reg.ring_addr)
@ -705,7 +713,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, reg.bgid); bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) { if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */ /* if mapped buffer ring OR classic exists, don't allow */
if (bl->is_buf_ring || !list_empty(&bl->buf_list)) if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
return -EEXIST; return -EEXIST;
} else { } else {
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
@ -721,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!ret) { if (!ret) {
bl->nr_entries = reg.ring_entries; bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1; bl->mask = reg.ring_entries - 1;
if (reg.flags & IOU_PBUF_RING_INC)
bl->flags |= IOBL_INC;
io_buffer_add_list(ctx, bl, reg.bgid); io_buffer_add_list(ctx, bl, reg.bgid);
return 0; return 0;
@ -747,7 +757,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, reg.bgid); bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl) if (!bl)
return -ENOENT; return -ENOENT;
if (!bl->is_buf_ring) if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL; return -EINVAL;
xa_erase(&ctx->io_bl_xa, bl->bgid); xa_erase(&ctx->io_bl_xa, bl->bgid);
@ -771,7 +781,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, buf_status.buf_group); bl = io_buffer_get_list(ctx, buf_status.buf_group);
if (!bl) if (!bl)
return -ENOENT; return -ENOENT;
if (!bl->is_buf_ring) if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL; return -EINVAL;
buf_status.head = bl->head; buf_status.head = bl->head;
@ -802,7 +812,7 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
bl = xa_load(&ctx->io_bl_xa, bgid); bl = xa_load(&ctx->io_bl_xa, bgid);
/* must be a mmap'able buffer ring and have pages */ /* must be a mmap'able buffer ring and have pages */
ret = false; ret = false;
if (bl && bl->is_mmap) if (bl && bl->flags & IOBL_MMAP)
ret = atomic_inc_not_zero(&bl->refs); ret = atomic_inc_not_zero(&bl->refs);
rcu_read_unlock(); rcu_read_unlock();


@ -4,6 +4,16 @@
#include <uapi/linux/io_uring.h> #include <uapi/linux/io_uring.h>
enum {
/* ring mapped provided buffers */
IOBL_BUF_RING = 1,
/* ring mapped provided buffers, but mmap'ed by application */
IOBL_MMAP = 2,
/* buffers are consumed incrementally rather than always fully */
IOBL_INC = 4,
};
struct io_buffer_list { struct io_buffer_list {
/* /*
* If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
@ -25,12 +35,9 @@ struct io_buffer_list {
__u16 head; __u16 head;
__u16 mask; __u16 mask;
atomic_t refs; __u16 flags;
/* ring mapped provided buffers */ atomic_t refs;
__u8 is_buf_ring;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
}; };
struct io_buffer { struct io_buffer {
@ -52,8 +59,8 @@ struct buf_sel_arg {
struct iovec *iovs; struct iovec *iovs;
size_t out_len; size_t out_len;
size_t max_len; size_t max_len;
int nr_iovs; unsigned short nr_iovs;
int mode; unsigned short mode;
}; };
void __user *io_buffer_select(struct io_kiocb *req, size_t *len, void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
@ -73,7 +80,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
@ -117,25 +124,55 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
return false; return false;
} }
static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) /* Mapped buffer ring, return io_uring_buf from head */
#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)]
static inline bool io_kbuf_commit(struct io_kiocb *req,
struct io_buffer_list *bl, int len, int nr)
{
if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT)))
return true;
req->flags &= ~REQ_F_BUFFERS_COMMIT;
if (unlikely(len < 0))
return true;
if (bl->flags & IOBL_INC) {
struct io_uring_buf *buf;
buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
if (WARN_ON_ONCE(len > buf->len))
len = buf->len;
buf->len -= len;
if (buf->len) {
buf->addr += len;
return false;
}
}
bl->head += nr;
return true;
}
static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
{ {
struct io_buffer_list *bl = req->buf_list; struct io_buffer_list *bl = req->buf_list;
bool ret = true;
if (bl) { if (bl) {
if (req->flags & REQ_F_BUFFERS_COMMIT) { ret = io_kbuf_commit(req, bl, len, nr);
bl->head += nr;
req->flags &= ~REQ_F_BUFFERS_COMMIT;
}
req->buf_index = bl->bgid; req->buf_index = bl->bgid;
} }
req->flags &= ~REQ_F_BUFFER_RING; req->flags &= ~REQ_F_BUFFER_RING;
return ret;
} }
static inline void __io_put_kbuf_list(struct io_kiocb *req, static inline void __io_put_kbuf_list(struct io_kiocb *req, int len,
struct list_head *list) struct list_head *list)
{ {
if (req->flags & REQ_F_BUFFER_RING) { if (req->flags & REQ_F_BUFFER_RING) {
__io_put_kbuf_ring(req, 1); __io_put_kbuf_ring(req, len, 1);
} else { } else {
req->buf_index = req->kbuf->bgid; req->buf_index = req->kbuf->bgid;
list_add(&req->kbuf->list, list); list_add(&req->kbuf->list, list);
@ -150,11 +187,12 @@ static inline void io_kbuf_drop(struct io_kiocb *req)
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return; return;
__io_put_kbuf_list(req, &req->ctx->io_buffers_comp); /* len == 0 is fine here, non-ring will always drop all of it */
__io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp);
} }
static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len,
unsigned issue_flags) int nbufs, unsigned issue_flags)
{ {
unsigned int ret; unsigned int ret;
@ -162,22 +200,24 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
return 0; return 0;
ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
if (req->flags & REQ_F_BUFFER_RING) if (req->flags & REQ_F_BUFFER_RING) {
__io_put_kbuf_ring(req, nbufs); if (!__io_put_kbuf_ring(req, len, nbufs))
else ret |= IORING_CQE_F_BUF_MORE;
__io_put_kbuf(req, issue_flags); } else {
__io_put_kbuf(req, len, issue_flags);
}
return ret; return ret;
} }
static inline unsigned int io_put_kbuf(struct io_kiocb *req, static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len,
unsigned issue_flags) unsigned issue_flags)
{ {
return __io_put_kbufs(req, 1, issue_flags); return __io_put_kbufs(req, len, 1, issue_flags);
} }
static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
unsigned issue_flags) int nbufs, unsigned issue_flags)
{ {
return __io_put_kbufs(req, nbufs, issue_flags); return __io_put_kbufs(req, len, nbufs, issue_flags);
} }
#endif #endif


@ -269,27 +269,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
return 0; return 0;
} }
/*
* __io_napi_adjust_timeout() - adjust busy loop timeout
* @ctx: pointer to io-uring context structure
* @iowq: pointer to io wait queue
* @ts: pointer to timespec or NULL
*
* Adjust the busy loop timeout according to timespec and busy poll timeout.
* If the specified NAPI timeout is bigger than the wait timeout, then adjust
* the NAPI timeout accordingly.
*/
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
ktime_t to_wait)
{
ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
if (to_wait)
poll_dt = min(poll_dt, to_wait);
iowq->napi_busy_poll_dt = poll_dt;
}
/* /*
* __io_napi_busy_loop() - execute busy poll loop * __io_napi_busy_loop() - execute busy poll loop
* @ctx: pointer to io-uring context structure * @ctx: pointer to io-uring context structure
@ -299,9 +278,17 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
*/ */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{ {
iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); if (ctx->flags & IORING_SETUP_SQPOLL)
return;
if (!(ctx->flags & IORING_SETUP_SQPOLL)) iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
if (iowq->timeout != KTIME_MAX) {
ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));
iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
}
iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
io_napi_blocking_busy_loop(ctx, iowq); io_napi_blocking_busy_loop(ctx, iowq);
} }


@ -17,8 +17,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq, ktime_t to_wait);
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
@ -27,15 +25,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
return !list_empty(&ctx->napi_list); return !list_empty(&ctx->napi_list);
} }
static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
ktime_t to_wait)
{
if (!io_napi(ctx))
return;
__io_napi_adjust_timeout(ctx, iowq, to_wait);
}
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq) struct io_wait_queue *iowq)
{ {
@ -86,11 +75,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
static inline void io_napi_add(struct io_kiocb *req) static inline void io_napi_add(struct io_kiocb *req)
{ {
} }
static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
ktime_t to_wait)
{
}
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq) struct io_wait_queue *iowq)
{ {


@ -434,8 +434,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->buf_group = req->buf_index; sr->buf_group = req->buf_index;
req->buf_list = NULL; req->buf_list = NULL;
} }
if (req->flags & REQ_F_BUFFER_SELECT && sr->len)
return -EINVAL;
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
if (req->ctx->compat) if (req->ctx->compat)
@ -499,11 +497,11 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
unsigned int cflags; unsigned int cflags;
if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
cflags = io_put_kbuf(req, issue_flags); cflags = io_put_kbuf(req, *ret, issue_flags);
goto finish; goto finish;
} }
cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);
if (bundle_finished || req->flags & REQ_F_BL_EMPTY) if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
goto finish; goto finish;
@ -599,7 +597,7 @@ retry_bundle:
if (io_do_buffer_select(req)) { if (io_do_buffer_select(req)) {
struct buf_sel_arg arg = { struct buf_sel_arg arg = {
.iovs = &kmsg->fast_iov, .iovs = &kmsg->fast_iov,
.max_len = INT_MAX, .max_len = min_not_zero(sr->len, INT_MAX),
.nr_iovs = 1, .nr_iovs = 1,
}; };
@ -618,14 +616,23 @@ retry_bundle:
if (unlikely(ret < 0)) if (unlikely(ret < 0))
return ret; return ret;
sr->len = arg.out_len;
iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
arg.out_len);
if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
kmsg->free_iov_nr = ret; kmsg->free_iov_nr = ret;
kmsg->free_iov = arg.iovs; kmsg->free_iov = arg.iovs;
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
} }
sr->len = arg.out_len;
if (ret == 1) {
sr->buf = arg.iovs[0].iov_base;
ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
&kmsg->msg.msg_iter);
if (unlikely(ret))
return ret;
} else {
iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
arg.iovs, ret, arg.out_len);
}
} }
/* /*
@ -835,13 +842,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
cflags |= IORING_CQE_F_SOCK_NONEMPTY; cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (sr->flags & IORING_RECVSEND_BUNDLE) { if (sr->flags & IORING_RECVSEND_BUNDLE) {
cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
issue_flags); issue_flags);
/* bundle with no more immediate buffers, we're done */ /* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY) if (req->flags & REQ_F_BL_EMPTY)
goto finish; goto finish;
} else { } else {
cflags |= io_put_kbuf(req, issue_flags); cflags |= io_put_kbuf(req, *ret, issue_flags);
} }
/* /*


@ -335,6 +335,31 @@ err:
return ret; return ret;
} }
static int io_register_clock(struct io_ring_ctx *ctx,
struct io_uring_clock_register __user *arg)
{
struct io_uring_clock_register reg;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
return -EINVAL;
switch (reg.clockid) {
case CLOCK_MONOTONIC:
ctx->clock_offset = 0;
break;
case CLOCK_BOOTTIME:
ctx->clock_offset = TK_OFFS_BOOT;
break;
default:
return -EINVAL;
}
ctx->clockid = reg.clockid;
return 0;
}
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args) void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock) __releases(ctx->uring_lock)
@ -511,6 +536,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break; break;
ret = io_unregister_napi(ctx, arg); ret = io_unregister_napi(ctx, arg);
break; break;
case IORING_REGISTER_CLOCK:
ret = -EINVAL;
if (!arg || nr_args)
break;
ret = io_register_clock(ctx, arg);
break;
case IORING_REGISTER_COPY_BUFFERS:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_copy_buffers(ctx, arg);
break;
default: default:
ret = -EINVAL; ret = -EINVAL;
break; break;
@ -519,6 +556,38 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return ret; return ret;
} }
/*
* Given an 'fd' value, return the ctx associated with if. If 'registered' is
* true, then the registered index is used. Otherwise, the normal fd table.
* Caller must call fput() on the returned file, unless it's an ERR_PTR.
*/
struct file *io_uring_register_get_file(int fd, bool registered)
{
struct file *file;
if (registered) {
/*
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
* need only dereference our task private array to find it.
*/
struct io_uring_task *tctx = current->io_uring;
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return ERR_PTR(-EINVAL);
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
file = tctx->registered_rings[fd];
} else {
file = fget(fd);
}
if (unlikely(!file))
return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
return file;
fput(file);
return ERR_PTR(-EOPNOTSUPP);
}
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args) void __user *, arg, unsigned int, nr_args)
{ {
@ -533,35 +602,15 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
if (opcode >= IORING_REGISTER_LAST) if (opcode >= IORING_REGISTER_LAST)
return -EINVAL; return -EINVAL;
if (use_registered_ring) { file = io_uring_register_get_file(fd, use_registered_ring);
/* if (IS_ERR(file))
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we return PTR_ERR(file);
* need only dereference our task private array to find it.
*/
struct io_uring_task *tctx = current->io_uring;
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
file = tctx->registered_rings[fd];
if (unlikely(!file))
return -EBADF;
} else {
file = fget(fd);
if (unlikely(!file))
return -EBADF;
ret = -EOPNOTSUPP;
if (!io_is_uring_fops(file))
goto out_fput;
}
ctx = file->private_data; ctx = file->private_data;
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args); ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
if (!use_registered_ring) if (!use_registered_ring)
fput(file); fput(file);
return ret; return ret;


@ -4,5 +4,6 @@
int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_eventfd_unregister(struct io_ring_ctx *ctx);
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
struct file *io_uring_register_get_file(int fd, bool registered);
#endif #endif


@ -17,6 +17,7 @@
#include "openclose.h" #include "openclose.h"
#include "rsrc.h" #include "rsrc.h"
#include "memmap.h" #include "memmap.h"
#include "register.h"
struct io_rsrc_update { struct io_rsrc_update {
struct file *file; struct file *file;
@ -114,14 +115,16 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
struct io_mapped_ubuf *imu = *slot; struct io_mapped_ubuf *imu = *slot;
unsigned int i; unsigned int i;
*slot = NULL;
if (imu != &dummy_ubuf) { if (imu != &dummy_ubuf) {
if (!refcount_dec_and_test(&imu->refs))
return;
for (i = 0; i < imu->nr_bvecs; i++) for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page); unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages) if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages); io_unaccount_mem(ctx, imu->acct_pages);
kvfree(imu); kvfree(imu);
} }
*slot = NULL;
} }
static void io_rsrc_put_work(struct io_rsrc_node *node) static void io_rsrc_put_work(struct io_rsrc_node *node)
@@ -855,6 +858,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
	return ret;
 }
 
+static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data, int nr_folios)
+{
+	struct page **page_array = *pages, **new_array = NULL;
+	int nr_pages_left = *nr_pages, i, j;
+
+	/* Store head pages only*/
+	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
+					GFP_KERNEL);
+	if (!new_array)
+		return false;
+
+	new_array[0] = compound_head(page_array[0]);
+	/*
+	 * The pages are bound to the folio, it doesn't
+	 * actually unpin them but drops all but one reference,
+	 * which is usually put down by io_buffer_unmap().
+	 * Note, needs a better helper.
+	 */
+	if (data->nr_pages_head > 1)
+		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
+
+	j = data->nr_pages_head;
+	nr_pages_left -= data->nr_pages_head;
+	for (i = 1; i < nr_folios; i++) {
+		unsigned int nr_unpin;
+
+		new_array[i] = page_array[j];
+		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
+				data->nr_pages_mid - 1);
+		if (nr_unpin)
+			unpin_user_pages(&page_array[j+1], nr_unpin);
+		j += data->nr_pages_mid;
+		nr_pages_left -= data->nr_pages_mid;
+	}
+	kvfree(page_array);
+	*pages = new_array;
+	*nr_pages = nr_folios;
+	return true;
+}
+
+static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data)
+{
+	struct page **page_array = *pages;
+	struct folio *folio = page_folio(page_array[0]);
+	unsigned int count = 1, nr_folios = 1;
+	int i;
+
+	if (*nr_pages <= 1)
+		return false;
+
+	data->nr_pages_mid = folio_nr_pages(folio);
+	if (data->nr_pages_mid == 1)
+		return false;
+
+	data->folio_shift = folio_shift(folio);
+	/*
+	 * Check if pages are contiguous inside a folio, and all folios have
+	 * the same page count except for the head and tail.
+	 */
+	for (i = 1; i < *nr_pages; i++) {
+		if (page_folio(page_array[i]) == folio &&
+		    page_array[i] == page_array[i-1] + 1) {
+			count++;
+			continue;
+		}
+
+		if (nr_folios == 1) {
+			if (folio_page_idx(folio, page_array[i-1]) !=
+			    data->nr_pages_mid - 1)
+				return false;
+
+			data->nr_pages_head = count;
+		} else if (count != data->nr_pages_mid) {
+			return false;
+		}
+
+		folio = page_folio(page_array[i]);
+		if (folio_size(folio) != (1UL << data->folio_shift) ||
+		    folio_page_idx(folio, page_array[i]) != 0)
+			return false;
+
+		count = 1;
+		nr_folios++;
+	}
+	if (nr_folios == 1)
+		data->nr_pages_head = count;
+
+	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
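To put a number on what the coalescing above buys, here is a minimal standalone sketch, not part of the patch, assuming a fixed buffer fully backed by folios of 1 << folio_shift bytes: a 4 MiB registration drops from 1024 page-sized bvec entries to 2 folio-sized ones.

#include <stdio.h>

/* bvec entries needed when each entry covers one (1 << shift)-byte chunk */
static unsigned long bvecs_needed(unsigned long len, unsigned int shift)
{
	unsigned long seg = 1UL << shift;

	return (len + seg - 1) / seg;
}

int main(void)
{
	unsigned long len = 4UL << 20;	/* 4 MiB registered buffer */

	printf("PAGE_SIZE bvecs:   %lu\n", bvecs_needed(len, 12));	/* 1024 */
	printf("2 MiB folio bvecs: %lu\n", bvecs_needed(len, 21));	/* 2 */
	return 0;
}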
@@ -864,7 +959,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
-	struct folio *folio = NULL;
+	struct io_imu_folio_data data;
+	bool coalesced;
 
	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
	if (!iov->iov_base)
@@ -879,31 +975,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
		goto done;
	}
 
-	/* If it's a huge page, try to coalesce them into a single bvec entry */
-	if (nr_pages > 1) {
-		folio = page_folio(pages[0]);
-		for (i = 1; i < nr_pages; i++) {
-			/*
-			 * Pages must be consecutive and on the same folio for
-			 * this to work
-			 */
-			if (page_folio(pages[i]) != folio ||
-			    pages[i] != pages[i - 1] + 1) {
-				folio = NULL;
-				break;
-			}
-		}
-		if (folio) {
-			/*
-			 * The pages are bound to the folio, it doesn't
-			 * actually unpin them but drops all but one reference,
-			 * which is usually put down by io_buffer_unmap().
-			 * Note, needs a better helper.
-			 */
-			unpin_user_pages(&pages[1], nr_pages - 1);
-			nr_pages = 1;
-		}
-	}
+	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
+	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
 
	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
@@ -915,23 +988,26 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
		goto done;
	}
 
-	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
+	imu->folio_shift = PAGE_SHIFT;
+	imu->folio_mask = PAGE_MASK;
+	if (coalesced) {
+		imu->folio_shift = data.folio_shift;
+		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
+	}
+	refcount_set(&imu->refs, 1);
+	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
	*pimu = imu;
	ret = 0;
 
-	if (folio) {
-		bvec_set_page(&imu->bvec[0], pages[0], size, off);
-		goto done;
-	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;
 
-		vec_len = min_t(size_t, size, PAGE_SIZE - off);
+		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
@@ -1042,23 +1118,18 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
	 * we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are PAGE_SIZE in size, except potentially the
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 *
	 * So just find our index, and adjust the iterator afterwards.
	 * If the offset is within the first bvec (or the whole first
	 * bvec, just use iov_iter_advance(). This makes it easier
	 * since we can just skip the first segment, which may not
	 * be PAGE_SIZE aligned.
	 * be folio_size aligned.
	 */
	const struct bio_vec *bvec = imu->bvec;
 
	if (offset < bvec->bv_len) {
		/*
		 * Note, huge pages buffers consists of one large
		 * bvec entry and should always go this way. The other
		 * branch doesn't expect non PAGE_SIZE'd chunks.
		 */
		iter->bvec = bvec;
		iter->count -= offset;
		iter->iov_offset = offset;
@@ -1067,14 +1138,104 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
		/* skip first vec */
		offset -= bvec->bv_len;
-		seg_skip = 1 + (offset >> PAGE_SHIFT);
+		seg_skip = 1 + (offset >> imu->folio_shift);
 
		iter->bvec = bvec + seg_skip;
		iter->nr_segs -= seg_skip;
		iter->count -= bvec->bv_len + offset;
-		iter->iov_offset = offset & ~PAGE_MASK;
+		iter->iov_offset = offset & ~imu->folio_mask;
		}
	}
 
	return 0;
 }
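To make the folio_shift/folio_mask arithmetic above concrete, a small sketch with assumed values, not kernel code: whole folios are skipped with offset >> folio_shift and the remainder inside the target folio comes from offset & ~folio_mask, where PAGE_SHIFT and PAGE_MASK used to be hard-coded.

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int folio_shift = 21;			/* 2 MiB folios */
	unsigned long folio_mask = ~((1UL << folio_shift) - 1);
	unsigned long head_len = 1UL << folio_shift;	/* length of bvec[0] */
	unsigned long offset = (5UL << 20) + 4096;	/* 5 MiB + 4 KiB into the buffer */

	/* mirror the else-branch above: the head bvec is skipped first */
	offset -= head_len;
	unsigned long seg_skip = 1 + (offset >> folio_shift);
	unsigned long iov_offset = offset & ~folio_mask;

	assert(seg_skip == 2);				/* iteration starts at bvec[2] */
	assert(iov_offset == (1UL << 20) + 4096);	/* 1 MiB + 4 KiB into that folio */
	printf("seg_skip=%lu iov_offset=%lu\n", seg_skip, iov_offset);
	return 0;
}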
+
+static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
+{
+	struct io_mapped_ubuf **user_bufs;
+	struct io_rsrc_data *data;
+	int i, ret, nbufs;
+
+	/*
+	 * Drop our own lock here. We'll setup the data we need and reference
+	 * the source buffers, then re-grab, check, and assign at the end.
+	 */
+	mutex_unlock(&ctx->uring_lock);
+
+	mutex_lock(&src_ctx->uring_lock);
+	ret = -ENXIO;
+	nbufs = src_ctx->nr_user_bufs;
+	if (!nbufs)
+		goto out_unlock;
+	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
+	if (ret)
+		goto out_unlock;
+
+	ret = -ENOMEM;
+	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
+	if (!user_bufs)
+		goto out_free_data;
+
+	for (i = 0; i < nbufs; i++) {
+		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
+
+		refcount_inc(&src->refs);
+		user_bufs[i] = src;
+	}
+
+	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
+	mutex_unlock(&src_ctx->uring_lock);
+	mutex_lock(&ctx->uring_lock);
+	if (!ctx->user_bufs) {
+		ctx->user_bufs = user_bufs;
+		ctx->buf_data = data;
+		ctx->nr_user_bufs = nbufs;
+		return 0;
+	}
+
+	/* someone raced setting up buffers, dump ours */
+	for (i = 0; i < nbufs; i++)
+		io_buffer_unmap(ctx, &user_bufs[i]);
+	io_rsrc_data_free(data);
+	kfree(user_bufs);
+	return -EBUSY;
+out_free_data:
+	io_rsrc_data_free(data);
+out_unlock:
+	mutex_unlock(&src_ctx->uring_lock);
+	mutex_lock(&ctx->uring_lock);
+	return ret;
+}
+
+/*
+ * Copy the registered buffers from the source ring whose file descriptor
+ * is given in the src_fd to the current ring. This is identical to registering
+ * the buffers with ctx, except faster as mappings already exist.
+ *
+ * Since the memory is already accounted once, don't account it again.
+ */
+int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_copy_buffers buf;
+	bool registered_src;
+	struct file *file;
+	int ret;
+
+	if (ctx->user_bufs || ctx->nr_user_bufs)
+		return -EBUSY;
+	if (copy_from_user(&buf, arg, sizeof(buf)))
+		return -EFAULT;
+	if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
+		return -EINVAL;
+	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
+		return -EINVAL;
+
+	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
+	file = io_uring_register_get_file(buf.src_fd, registered_src);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	ret = io_copy_buffers(ctx, file->private_data);
+	if (!registered_src)
+		fput(file);
+	return ret;
+}
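For context, a rough userspace sketch of how this registration might be driven, not part of the patch: the raw syscall wrapper, the nr_args value of 1, and the exact pad layout are assumptions inferred from the code above rather than documented ABI. It clones the fixed buffers of src_ring_fd into dst_ring_fd, which must not have buffers registered yet (the kernel returns -EBUSY otherwise).

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* returns 0 on success, -1 with errno set on failure */
static int clone_fixed_buffers(int dst_ring_fd, int src_ring_fd)
{
	struct io_uring_copy_buffers buf;

	memset(&buf, 0, sizeof(buf));	/* pad must be zeroed */
	buf.src_fd = src_ring_fd;	/* plain ring fd, so flags stay 0 */

	return syscall(__NR_io_uring_register, dst_ring_fd,
		       IORING_REGISTER_COPY_BUFFERS, &buf, 1);
}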

View File

@@ -22,8 +22,6 @@ struct io_rsrc_put {
	};
 };
 
-typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-
 struct io_rsrc_data {
	struct io_ring_ctx *ctx;
@@ -46,10 +44,21 @@ struct io_mapped_ubuf {
	u64		ubuf;
	u64		ubuf_end;
	unsigned int	nr_bvecs;
+	unsigned int	folio_shift;
	unsigned long	acct_pages;
+	unsigned long	folio_mask;
+	refcount_t	refs;
	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
+struct io_imu_folio_data {
+	/* Head folio can be partially included in the fixed buf */
+	unsigned int	nr_pages_head;
+	/* For non-head/tail folios, has to be fully included */
+	unsigned int	nr_pages_mid;
+	unsigned int	folio_shift;
+};
+
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
@@ -59,6 +68,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
			struct io_mapped_ubuf *imu,
			u64 buf_addr, size_t len);
 
+int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg);
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,

View File

@@ -467,8 +467,7 @@ static void io_req_io_end(struct io_kiocb *req)
 static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 {
	if (unlikely(res != req->cqe.res)) {
-		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
-		    io_rw_should_reissue(req)) {
+		if (res == -EAGAIN && io_rw_should_reissue(req)) {
			/*
			 * Reissue will start accounting again, finish the
			 * current cycle.
@@ -511,7 +510,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
	io_req_io_end(req);
 
	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
-		req->cqe.flags |= io_put_kbuf(req, 0);
+		req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0);
 
	io_req_rw_cleanup(req, 0);
	io_req_task_complete(req, ts);
@@ -593,7 +592,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
		 */
		io_req_io_end(req);
		io_req_set_res(req, final_ret,
-			       io_put_kbuf(req, issue_flags));
+			       io_put_kbuf(req, ret, issue_flags));
		io_req_rw_cleanup(req, issue_flags);
		return IOU_OK;
	}
@@ -855,6 +854,14 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 
	ret = io_iter_do_read(rw, &io->iter);
 
+	/*
+	 * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT
+	 * issue, even though they should be returning -EAGAIN. To be safe,
+	 * retry from blocking context for either.
+	 */
+	if (ret == -EOPNOTSUPP && force_nonblock)
+		ret = -EAGAIN;
+
	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		/* If we can poll, just do that. */
@@ -975,7 +982,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
		 * Put our buffer and post a CQE. If we fail to post a CQE, then
		 * jump to the termination path. This request is then done.
		 */
-		cflags = io_put_kbuf(req, issue_flags);
+		cflags = io_put_kbuf(req, ret, issue_flags);
		rw->len = 0; /* similarly to above, reset len to 0 */
 
		if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
@@ -1167,7 +1174,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		nr_events++;
-		req->cqe.flags = io_put_kbuf(req, 0);
+		req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0);
		if (req->opcode != IORING_OP_URING_CMD)
			io_req_rw_cleanup(req, 0);
	}

View File

@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/audit.h>
 #include <linux/security.h>
+#include <linux/cpuset.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
@@ -176,7 +177,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
	if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
		to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
 
-	if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
+	if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
		const struct cred *creds = NULL;
 
		if (ctx->sq_creds != current_cred())
@@ -460,10 +461,12 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
		return 0;
 
	if (p->flags & IORING_SETUP_SQ_AFF) {
+		struct cpumask allowed_mask;
		int cpu = p->sq_thread_cpu;
 
		ret = -EINVAL;
-		if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+		cpuset_cpus_allowed(current, &allowed_mask);
+		if (!cpumask_test_cpu(cpu, &allowed_mask))
			goto err_sqpoll;
		sqd->sq_cpu = cpu;
	} else {
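A hedged userspace sketch of the behaviour this hunk enforces, using liburing's io_uring_queue_init_params(); the chosen CPU number and the exact error value are assumptions for illustration. Pinning the SQPOLL thread to a CPU outside the task's cpuset now fails at ring setup instead of silently escaping the cpuset.

#include <liburing.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_params p;
	int ret;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
	p.sq_thread_idle = 1000;	/* ms before the SQPOLL thread sleeps */
	p.sq_thread_cpu = 3;		/* must lie inside this task's cpuset */

	ret = io_uring_queue_init_params(8, &ring, &p);
	if (ret < 0) {
		/* expected to be -EINVAL when CPU 3 is outside the allowed cpuset */
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}
	io_uring_queue_exit(&ring);
	return 0;
}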