mirror of
https://github.com/torvalds/linux.git
synced 2024-11-25 13:41:51 +00:00
for-5.20/io_uring-2022-07-29
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmLkm5gQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpmKMD/4l3QIrLbjYIxlfrzQcHbmYuUkbQtj3SbZg 6ejbnGVhCs1P9DdXH8MgE2BxgpiXQE0CqOK7vbSoo5ep2n2UTLI2DIxAl74SMIo7 0wmJXtUJySuViKr3NYVHqlN180MkQYddBz0nGElhkQBPBCMhW8CrtPCeURr/YyHp 2RxSYBXiUx2gRyig+klnp6oPEqelcBZJUyNHdA9yVrgl/RhB/t2rKj7D++8ukQM3 Zuyh8WIkTeTfUz9hdGG7fuCEdZN4DlO2CCEc7uy0cKi6VRCKH4hYUCqClJ+/cfd2 43dUI2O7B6D1t/ObFh8AGIDXBDqVA6ePQohQU6gooRkfQiBPKkc9d0ts4yIhRqca AjkzNM+0Eve3A01loJ8J84w8oZnvNpYEv5n8/sZVLWcyU3UIs0I88nC2OBiFtoRq d77CtFLwOTo+r3STtAhnZOqez90rhS6BqKtqlUP346PCuFItl6/MbGtwdTbLYEFj CVNIb2pERWSr2NxGv4lFyXaX/cRwruxojWH7yc3rRYjr4Ykevd1pe/fMGNiMAnKw 5em/3QU3qq0ZVcXLMihksKeHHFIQwGDRMuyuv/fktV10+yYXQ0t16WzkJT3aR8Xo cqs0r8+6Jnj3uYcOMzj/FoLcpEPr21hnwAtzLto1mG1Wh4JRn/D7Nx5zqxPLxcW+ NiU6VihPOw== =gxeV -----END PGP SIGNATURE----- Merge tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block Pull io_uring updates from Jens Axboe: - As per (valid) complaint in the last merge window, fs/io_uring.c has grown quite large these days. io_uring isn't really tied to fs either, as it supports a wide variety of functionality outside of that. Move the code to io_uring/ and split it into files that either implement a specific request type, and split some code into helpers as well. The code is organized a lot better like this, and io_uring.c is now < 4K LOC (me). - Deprecate the epoll_ctl opcode. It'll still work, just trigger a warning once if used. If we don't get any complaints on this, and I don't expect any, then we can fully remove it in a future release (me). - Improve the cancel hash locking (Hao) - kbuf cleanups (Hao) - Efficiency improvements to the task_work handling (Dylan, Pavel) - Provided buffer improvements (Dylan) - Add support for recv/recvmsg multishot support. This is similar to the accept (or poll) support for have for multishot, where a single SQE can trigger everytime data is received. For applications that expect to do more than a few receives on an instantiated socket, this greatly improves efficiency (Dylan). - Efficiency improvements for poll handling (Pavel) - Poll cancelation improvements (Pavel) - Allow specifiying a range for direct descriptor allocations (Pavel) - Cleanup the cqe32 handling (Pavel) - Move io_uring types to greatly cleanup the tracing (Pavel) - Tons of great code cleanups and improvements (Pavel) - Add a way to do sync cancelations rather than through the sqe -> cqe interface, as that's a lot easier to use for some use cases (me). - Add support to IORING_OP_MSG_RING for sending direct descriptors to a different ring. This avoids the usually problematic SCM case, as we disallow those. (me) - Make the per-command alloc cache we use for apoll generic, place limits on it, and use it for netmsg as well (me). 
- Various cleanups (me, Michal, Gustavo, Uros) * tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block: (172 commits) io_uring: ensure REQ_F_ISREG is set async offload net: fix compat pointer in get_compat_msghdr() io_uring: Don't require reinitable percpu_ref io_uring: fix types in io_recvmsg_multishot_overflow io_uring: Use atomic_long_try_cmpxchg in __io_account_mem io_uring: support multishot in recvmsg net: copy from user before calling __get_compat_msghdr net: copy from user before calling __copy_msghdr io_uring: support 0 length iov in buffer select in compat io_uring: fix multishot ending when not polled io_uring: add netmsg cache io_uring: impose max limit on apoll cache io_uring: add abstraction around apoll cache io_uring: move apoll cache to poll.c io_uring: consolidate hash_locked io-wq handling io_uring: clear REQ_F_HASH_LOCKED on hash removal io_uring: don't race double poll setting REQ_F_ASYNC_DATA io_uring: don't miss setting REQ_F_DOUBLE_POLL io_uring: disable multishot recvmsg io_uring: only trace one of complete or overflow ...
This commit is contained in:
commit
b349b1181d
@ -7811,9 +7811,6 @@ F: include/linux/fs.h
|
||||
F: include/linux/fs_types.h
|
||||
F: include/uapi/linux/fs.h
|
||||
F: include/uapi/linux/openat2.h
|
||||
X: fs/io-wq.c
|
||||
X: fs/io-wq.h
|
||||
X: fs/io_uring.c
|
||||
|
||||
FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
|
||||
M: Riku Voipio <riku.voipio@iki.fi>
|
||||
@ -10521,9 +10518,7 @@ L: io-uring@vger.kernel.org
|
||||
S: Maintained
|
||||
T: git git://git.kernel.dk/linux-block
|
||||
T: git git://git.kernel.dk/liburing
|
||||
F: fs/io-wq.c
|
||||
F: fs/io-wq.h
|
||||
F: fs/io_uring.c
|
||||
F: io_uring/
|
||||
F: include/linux/io_uring.h
|
||||
F: include/uapi/linux/io_uring.h
|
||||
F: tools/io_uring/
|
||||
|
1
Makefile
1
Makefile
@ -1097,6 +1097,7 @@ export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps
|
||||
ifeq ($(KBUILD_EXTMOD),)
|
||||
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/
|
||||
core-$(CONFIG_BLOCK) += block/
|
||||
core-$(CONFIG_IO_URING) += io_uring/
|
||||
|
||||
vmlinux-dirs := $(patsubst %/,%,$(filter %/, \
|
||||
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
|
||||
|
@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
|
||||
obj-$(CONFIG_EVENTFD) += eventfd.o
|
||||
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
|
||||
obj-$(CONFIG_AIO) += aio.o
|
||||
obj-$(CONFIG_IO_URING) += io_uring.o
|
||||
obj-$(CONFIG_IO_WQ) += io-wq.o
|
||||
obj-$(CONFIG_FS_DAX) += dax.o
|
||||
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
|
||||
obj-$(CONFIG_FS_VERITY) += verity/
|
||||
|
13273
fs/io_uring.c
13273
fs/io_uring.c
File diff suppressed because it is too large
Load Diff
544
include/linux/io_uring_types.h
Normal file
544
include/linux/io_uring_types.h
Normal file
@ -0,0 +1,544 @@
|
||||
#ifndef IO_URING_TYPES_H
|
||||
#define IO_URING_TYPES_H
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/task_work.h>
|
||||
#include <linux/bitmap.h>
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
struct io_wq_work_node {
|
||||
struct io_wq_work_node *next;
|
||||
};
|
||||
|
||||
struct io_wq_work_list {
|
||||
struct io_wq_work_node *first;
|
||||
struct io_wq_work_node *last;
|
||||
};
|
||||
|
||||
struct io_wq_work {
|
||||
struct io_wq_work_node list;
|
||||
unsigned flags;
|
||||
/* place it here instead of io_kiocb as it fills padding and saves 4B */
|
||||
int cancel_seq;
|
||||
};
|
||||
|
||||
struct io_fixed_file {
|
||||
/* file * with additional FFS_* flags */
|
||||
unsigned long file_ptr;
|
||||
};
|
||||
|
||||
struct io_file_table {
|
||||
struct io_fixed_file *files;
|
||||
unsigned long *bitmap;
|
||||
unsigned int alloc_hint;
|
||||
};
|
||||
|
||||
struct io_hash_bucket {
|
||||
spinlock_t lock;
|
||||
struct hlist_head list;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
struct io_hash_table {
|
||||
struct io_hash_bucket *hbs;
|
||||
unsigned hash_bits;
|
||||
};
|
||||
|
||||
struct io_uring {
|
||||
u32 head ____cacheline_aligned_in_smp;
|
||||
u32 tail ____cacheline_aligned_in_smp;
|
||||
};
|
||||
|
||||
/*
|
||||
* This data is shared with the application through the mmap at offsets
|
||||
* IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
|
||||
*
|
||||
* The offsets to the member fields are published through struct
|
||||
* io_sqring_offsets when calling io_uring_setup.
|
||||
*/
|
||||
struct io_rings {
|
||||
/*
|
||||
* Head and tail offsets into the ring; the offsets need to be
|
||||
* masked to get valid indices.
|
||||
*
|
||||
* The kernel controls head of the sq ring and the tail of the cq ring,
|
||||
* and the application controls tail of the sq ring and the head of the
|
||||
* cq ring.
|
||||
*/
|
||||
struct io_uring sq, cq;
|
||||
/*
|
||||
* Bitmasks to apply to head and tail offsets (constant, equals
|
||||
* ring_entries - 1)
|
||||
*/
|
||||
u32 sq_ring_mask, cq_ring_mask;
|
||||
/* Ring sizes (constant, power of 2) */
|
||||
u32 sq_ring_entries, cq_ring_entries;
|
||||
/*
|
||||
* Number of invalid entries dropped by the kernel due to
|
||||
* invalid index stored in array
|
||||
*
|
||||
* Written by the kernel, shouldn't be modified by the
|
||||
* application (i.e. get number of "new events" by comparing to
|
||||
* cached value).
|
||||
*
|
||||
* After a new SQ head value was read by the application this
|
||||
* counter includes all submissions that were dropped reaching
|
||||
* the new SQ head (and possibly more).
|
||||
*/
|
||||
u32 sq_dropped;
|
||||
/*
|
||||
* Runtime SQ flags
|
||||
*
|
||||
* Written by the kernel, shouldn't be modified by the
|
||||
* application.
|
||||
*
|
||||
* The application needs a full memory barrier before checking
|
||||
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
|
||||
*/
|
||||
atomic_t sq_flags;
|
||||
/*
|
||||
* Runtime CQ flags
|
||||
*
|
||||
* Written by the application, shouldn't be modified by the
|
||||
* kernel.
|
||||
*/
|
||||
u32 cq_flags;
|
||||
/*
|
||||
* Number of completion events lost because the queue was full;
|
||||
* this should be avoided by the application by making sure
|
||||
* there are not more requests pending than there is space in
|
||||
* the completion queue.
|
||||
*
|
||||
* Written by the kernel, shouldn't be modified by the
|
||||
* application (i.e. get number of "new events" by comparing to
|
||||
* cached value).
|
||||
*
|
||||
* As completion events come in out of order this counter is not
|
||||
* ordered with any other data.
|
||||
*/
|
||||
u32 cq_overflow;
|
||||
/*
|
||||
* Ring buffer of completion events.
|
||||
*
|
||||
* The kernel writes completion events fresh every time they are
|
||||
* produced, so the application is allowed to modify pending
|
||||
* entries.
|
||||
*/
|
||||
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
|
||||
};
|
||||
|
||||
struct io_restriction {
|
||||
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
|
||||
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
|
||||
u8 sqe_flags_allowed;
|
||||
u8 sqe_flags_required;
|
||||
bool registered;
|
||||
};
|
||||
|
||||
struct io_submit_link {
|
||||
struct io_kiocb *head;
|
||||
struct io_kiocb *last;
|
||||
};
|
||||
|
||||
struct io_submit_state {
|
||||
/* inline/task_work completion list, under ->uring_lock */
|
||||
struct io_wq_work_node free_list;
|
||||
/* batch completion logic */
|
||||
struct io_wq_work_list compl_reqs;
|
||||
struct io_submit_link link;
|
||||
|
||||
bool plug_started;
|
||||
bool need_plug;
|
||||
unsigned short submit_nr;
|
||||
struct blk_plug plug;
|
||||
};
|
||||
|
||||
struct io_ev_fd {
|
||||
struct eventfd_ctx *cq_ev_fd;
|
||||
unsigned int eventfd_async: 1;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
struct io_alloc_cache {
|
||||
struct hlist_head list;
|
||||
unsigned int nr_cached;
|
||||
};
|
||||
|
||||
struct io_ring_ctx {
|
||||
/* const or read-mostly hot data */
|
||||
struct {
|
||||
struct percpu_ref refs;
|
||||
|
||||
struct io_rings *rings;
|
||||
unsigned int flags;
|
||||
enum task_work_notify_mode notify_method;
|
||||
unsigned int compat: 1;
|
||||
unsigned int drain_next: 1;
|
||||
unsigned int restricted: 1;
|
||||
unsigned int off_timeout_used: 1;
|
||||
unsigned int drain_active: 1;
|
||||
unsigned int drain_disabled: 1;
|
||||
unsigned int has_evfd: 1;
|
||||
unsigned int syscall_iopoll: 1;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
/* submission data */
|
||||
struct {
|
||||
struct mutex uring_lock;
|
||||
|
||||
/*
|
||||
* Ring buffer of indices into array of io_uring_sqe, which is
|
||||
* mmapped by the application using the IORING_OFF_SQES offset.
|
||||
*
|
||||
* This indirection could e.g. be used to assign fixed
|
||||
* io_uring_sqe entries to operations and only submit them to
|
||||
* the queue when needed.
|
||||
*
|
||||
* The kernel modifies neither the indices array nor the entries
|
||||
* array.
|
||||
*/
|
||||
u32 *sq_array;
|
||||
struct io_uring_sqe *sq_sqes;
|
||||
unsigned cached_sq_head;
|
||||
unsigned sq_entries;
|
||||
|
||||
/*
|
||||
* Fixed resources fast path, should be accessed only under
|
||||
* uring_lock, and updated through io_uring_register(2)
|
||||
*/
|
||||
struct io_rsrc_node *rsrc_node;
|
||||
int rsrc_cached_refs;
|
||||
atomic_t cancel_seq;
|
||||
struct io_file_table file_table;
|
||||
unsigned nr_user_files;
|
||||
unsigned nr_user_bufs;
|
||||
struct io_mapped_ubuf **user_bufs;
|
||||
|
||||
struct io_submit_state submit_state;
|
||||
|
||||
struct io_buffer_list *io_bl;
|
||||
struct xarray io_bl_xa;
|
||||
struct list_head io_buffers_cache;
|
||||
|
||||
struct io_hash_table cancel_table_locked;
|
||||
struct list_head cq_overflow_list;
|
||||
struct io_alloc_cache apoll_cache;
|
||||
struct io_alloc_cache netmsg_cache;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
/* IRQ completion list, under ->completion_lock */
|
||||
struct io_wq_work_list locked_free_list;
|
||||
unsigned int locked_free_nr;
|
||||
|
||||
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
|
||||
struct io_sq_data *sq_data; /* if using sq thread polling */
|
||||
|
||||
struct wait_queue_head sqo_sq_wait;
|
||||
struct list_head sqd_list;
|
||||
|
||||
unsigned long check_cq;
|
||||
|
||||
unsigned int file_alloc_start;
|
||||
unsigned int file_alloc_end;
|
||||
|
||||
struct xarray personalities;
|
||||
u32 pers_next;
|
||||
|
||||
struct {
|
||||
/*
|
||||
* We cache a range of free CQEs we can use, once exhausted it
|
||||
* should go through a slower range setup, see __io_get_cqe()
|
||||
*/
|
||||
struct io_uring_cqe *cqe_cached;
|
||||
struct io_uring_cqe *cqe_sentinel;
|
||||
|
||||
unsigned cached_cq_tail;
|
||||
unsigned cq_entries;
|
||||
struct io_ev_fd __rcu *io_ev_fd;
|
||||
struct wait_queue_head cq_wait;
|
||||
unsigned cq_extra;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
struct {
|
||||
spinlock_t completion_lock;
|
||||
|
||||
/*
|
||||
* ->iopoll_list is protected by the ctx->uring_lock for
|
||||
* io_uring instances that don't use IORING_SETUP_SQPOLL.
|
||||
* For SQPOLL, only the single threaded io_sq_thread() will
|
||||
* manipulate the list, hence no extra locking is needed there.
|
||||
*/
|
||||
struct io_wq_work_list iopoll_list;
|
||||
struct io_hash_table cancel_table;
|
||||
bool poll_multi_queue;
|
||||
|
||||
struct list_head io_buffers_comp;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
/* timeouts */
|
||||
struct {
|
||||
spinlock_t timeout_lock;
|
||||
atomic_t cq_timeouts;
|
||||
struct list_head timeout_list;
|
||||
struct list_head ltimeout_list;
|
||||
unsigned cq_last_tm_flush;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
/* Keep this last, we don't need it for the fast path */
|
||||
|
||||
struct io_restriction restrictions;
|
||||
struct task_struct *submitter_task;
|
||||
|
||||
/* slow path rsrc auxilary data, used by update/register */
|
||||
struct io_rsrc_node *rsrc_backup_node;
|
||||
struct io_mapped_ubuf *dummy_ubuf;
|
||||
struct io_rsrc_data *file_data;
|
||||
struct io_rsrc_data *buf_data;
|
||||
|
||||
struct delayed_work rsrc_put_work;
|
||||
struct llist_head rsrc_put_llist;
|
||||
struct list_head rsrc_ref_list;
|
||||
spinlock_t rsrc_ref_lock;
|
||||
|
||||
struct list_head io_buffers_pages;
|
||||
|
||||
#if defined(CONFIG_UNIX)
|
||||
struct socket *ring_sock;
|
||||
#endif
|
||||
/* hashed buffered write serialization */
|
||||
struct io_wq_hash *hash_map;
|
||||
|
||||
/* Only used for accounting purposes */
|
||||
struct user_struct *user;
|
||||
struct mm_struct *mm_account;
|
||||
|
||||
/* ctx exit and cancelation */
|
||||
struct llist_head fallback_llist;
|
||||
struct delayed_work fallback_work;
|
||||
struct work_struct exit_work;
|
||||
struct list_head tctx_list;
|
||||
struct completion ref_comp;
|
||||
|
||||
/* io-wq management, e.g. thread count */
|
||||
u32 iowq_limits[2];
|
||||
bool iowq_limits_set;
|
||||
|
||||
struct list_head defer_list;
|
||||
unsigned sq_thread_idle;
|
||||
/* protected by ->completion_lock */
|
||||
unsigned evfd_last_cq_tail;
|
||||
};
|
||||
|
||||
enum {
|
||||
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
|
||||
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
|
||||
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
|
||||
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
|
||||
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
|
||||
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
|
||||
REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
|
||||
|
||||
/* first byte is taken by user flags, shift it to not overlap */
|
||||
REQ_F_FAIL_BIT = 8,
|
||||
REQ_F_INFLIGHT_BIT,
|
||||
REQ_F_CUR_POS_BIT,
|
||||
REQ_F_NOWAIT_BIT,
|
||||
REQ_F_LINK_TIMEOUT_BIT,
|
||||
REQ_F_NEED_CLEANUP_BIT,
|
||||
REQ_F_POLLED_BIT,
|
||||
REQ_F_BUFFER_SELECTED_BIT,
|
||||
REQ_F_BUFFER_RING_BIT,
|
||||
REQ_F_REISSUE_BIT,
|
||||
REQ_F_CREDS_BIT,
|
||||
REQ_F_REFCOUNT_BIT,
|
||||
REQ_F_ARM_LTIMEOUT_BIT,
|
||||
REQ_F_ASYNC_DATA_BIT,
|
||||
REQ_F_SKIP_LINK_CQES_BIT,
|
||||
REQ_F_SINGLE_POLL_BIT,
|
||||
REQ_F_DOUBLE_POLL_BIT,
|
||||
REQ_F_PARTIAL_IO_BIT,
|
||||
REQ_F_CQE32_INIT_BIT,
|
||||
REQ_F_APOLL_MULTISHOT_BIT,
|
||||
REQ_F_CLEAR_POLLIN_BIT,
|
||||
REQ_F_HASH_LOCKED_BIT,
|
||||
/* keep async read/write and isreg together and in order */
|
||||
REQ_F_SUPPORT_NOWAIT_BIT,
|
||||
REQ_F_ISREG_BIT,
|
||||
|
||||
/* not a real bit, just to check we're not overflowing the space */
|
||||
__REQ_F_LAST_BIT,
|
||||
};
|
||||
|
||||
enum {
|
||||
/* ctx owns file */
|
||||
REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
|
||||
/* drain existing IO first */
|
||||
REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
|
||||
/* linked sqes */
|
||||
REQ_F_LINK = BIT(REQ_F_LINK_BIT),
|
||||
/* doesn't sever on completion < 0 */
|
||||
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
|
||||
/* IOSQE_ASYNC */
|
||||
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
|
||||
/* IOSQE_BUFFER_SELECT */
|
||||
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
|
||||
/* IOSQE_CQE_SKIP_SUCCESS */
|
||||
REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
|
||||
|
||||
/* fail rest of links */
|
||||
REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
|
||||
/* on inflight list, should be cancelled and waited on exit reliably */
|
||||
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
|
||||
/* read/write uses file position */
|
||||
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
|
||||
/* must not punt to workers */
|
||||
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
|
||||
/* has or had linked timeout */
|
||||
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
|
||||
/* needs cleanup */
|
||||
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
|
||||
/* already went through poll handler */
|
||||
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
|
||||
/* buffer already selected */
|
||||
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
|
||||
/* buffer selected from ring, needs commit */
|
||||
REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
|
||||
/* caller should reissue async */
|
||||
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
|
||||
/* supports async reads/writes */
|
||||
REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
|
||||
/* regular file */
|
||||
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
|
||||
/* has creds assigned */
|
||||
REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
|
||||
/* skip refcounting if not set */
|
||||
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
|
||||
/* there is a linked timeout that has to be armed */
|
||||
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
|
||||
/* ->async_data allocated */
|
||||
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
|
||||
/* don't post CQEs while failing linked requests */
|
||||
REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
|
||||
/* single poll may be active */
|
||||
REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
|
||||
/* double poll may active */
|
||||
REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
|
||||
/* request has already done partial IO */
|
||||
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
|
||||
/* fast poll multishot mode */
|
||||
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
|
||||
/* ->extra1 and ->extra2 are initialised */
|
||||
REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
|
||||
/* recvmsg special flag, clear EPOLLIN */
|
||||
REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
|
||||
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
|
||||
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
|
||||
};
|
||||
|
||||
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
|
||||
|
||||
struct io_task_work {
|
||||
struct llist_node node;
|
||||
io_req_tw_func_t func;
|
||||
};
|
||||
|
||||
struct io_cqe {
|
||||
__u64 user_data;
|
||||
__s32 res;
|
||||
/* fd initially, then cflags for completion */
|
||||
union {
|
||||
__u32 flags;
|
||||
int fd;
|
||||
};
|
||||
};
|
||||
|
||||
/*
|
||||
* Each request type overlays its private data structure on top of this one.
|
||||
* They must not exceed this one in size.
|
||||
*/
|
||||
struct io_cmd_data {
|
||||
struct file *file;
|
||||
/* each command gets 56 bytes of data */
|
||||
__u8 data[56];
|
||||
};
|
||||
|
||||
#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd)
|
||||
#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr)
|
||||
|
||||
struct io_kiocb {
|
||||
union {
|
||||
/*
|
||||
* NOTE! Each of the io_kiocb union members has the file pointer
|
||||
* as the first entry in their struct definition. So you can
|
||||
* access the file pointer through any of the sub-structs,
|
||||
* or directly as just 'file' in this struct.
|
||||
*/
|
||||
struct file *file;
|
||||
struct io_cmd_data cmd;
|
||||
};
|
||||
|
||||
u8 opcode;
|
||||
/* polled IO has completed */
|
||||
u8 iopoll_completed;
|
||||
/*
|
||||
* Can be either a fixed buffer index, or used with provided buffers.
|
||||
* For the latter, before issue it points to the buffer group ID,
|
||||
* and after selection it points to the buffer ID itself.
|
||||
*/
|
||||
u16 buf_index;
|
||||
unsigned int flags;
|
||||
|
||||
struct io_cqe cqe;
|
||||
|
||||
struct io_ring_ctx *ctx;
|
||||
struct task_struct *task;
|
||||
|
||||
struct io_rsrc_node *rsrc_node;
|
||||
|
||||
union {
|
||||
/* store used ubuf, so we can prevent reloading */
|
||||
struct io_mapped_ubuf *imu;
|
||||
|
||||
/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
|
||||
struct io_buffer *kbuf;
|
||||
|
||||
/*
|
||||
* stores buffer ID for ring provided buffers, valid IFF
|
||||
* REQ_F_BUFFER_RING is set.
|
||||
*/
|
||||
struct io_buffer_list *buf_list;
|
||||
};
|
||||
|
||||
union {
|
||||
/* used by request caches, completion batching and iopoll */
|
||||
struct io_wq_work_node comp_list;
|
||||
/* cache ->apoll->events */
|
||||
__poll_t apoll_events;
|
||||
};
|
||||
atomic_t refs;
|
||||
atomic_t poll_refs;
|
||||
struct io_task_work io_task_work;
|
||||
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
|
||||
union {
|
||||
struct hlist_node hash_node;
|
||||
struct {
|
||||
u64 extra1;
|
||||
u64 extra2;
|
||||
};
|
||||
};
|
||||
/* internal polling, see IORING_FEAT_FAST_POLL */
|
||||
struct async_poll *apoll;
|
||||
/* opcode allocated if it needs to store data for async defer */
|
||||
void *async_data;
|
||||
/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
|
||||
struct io_kiocb *link;
|
||||
/* custom credentials, valid IFF REQ_F_CREDS is set */
|
||||
const struct cred *creds;
|
||||
struct io_wq_work work;
|
||||
};
|
||||
|
||||
struct io_overflow_cqe {
|
||||
struct list_head list;
|
||||
struct io_uring_cqe cqe;
|
||||
};
|
||||
|
||||
#endif
|
@ -416,10 +416,9 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
|
||||
struct user_msghdr __user *umsg, unsigned flags,
|
||||
struct sockaddr __user **uaddr,
|
||||
struct iovec **iov);
|
||||
extern int __copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
struct user_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
struct iovec __user **uiov, size_t *nsegs);
|
||||
extern int __copy_msghdr(struct msghdr *kmsg,
|
||||
struct user_msghdr *umsg,
|
||||
struct sockaddr __user **save_addr);
|
||||
|
||||
/* helpers which do the actual work for syscalls */
|
||||
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
|
||||
|
@ -46,9 +46,8 @@ struct compat_rtentry {
|
||||
unsigned short rt_irtt; /* Initial RTT */
|
||||
};
|
||||
|
||||
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr, compat_uptr_t *ptr,
|
||||
compat_size_t *len);
|
||||
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr *msg,
|
||||
struct sockaddr __user **save_addr);
|
||||
int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
|
||||
struct sockaddr __user **, struct iovec **);
|
||||
int put_cmsg_compat(struct msghdr*, int, int, int, void *);
|
||||
|
@ -7,6 +7,7 @@
|
||||
|
||||
#include <linux/tracepoint.h>
|
||||
#include <uapi/linux/io_uring.h>
|
||||
#include <linux/io_uring_types.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
struct io_wq_work;
|
||||
@ -97,9 +98,7 @@ TRACE_EVENT(io_uring_register,
|
||||
/**
|
||||
* io_uring_file_get - called before getting references to an SQE file
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @req: pointer to a submitted request
|
||||
* @user_data: user data associated with the request
|
||||
* @fd: SQE file descriptor
|
||||
*
|
||||
* Allows to trace out how often an SQE file reference is obtained, which can
|
||||
@ -108,9 +107,9 @@ TRACE_EVENT(io_uring_register,
|
||||
*/
|
||||
TRACE_EVENT(io_uring_file_get,
|
||||
|
||||
TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd),
|
||||
TP_PROTO(struct io_kiocb *req, int fd),
|
||||
|
||||
TP_ARGS(ctx, req, user_data, fd),
|
||||
TP_ARGS(req, fd),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -120,9 +119,9 @@ TRACE_EVENT(io_uring_file_get,
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->ctx = req->ctx;
|
||||
__entry->req = req;
|
||||
__entry->user_data = user_data;
|
||||
__entry->user_data = req->cqe.user_data;
|
||||
__entry->fd = fd;
|
||||
),
|
||||
|
||||
@ -133,22 +132,16 @@ TRACE_EVENT(io_uring_file_get,
|
||||
/**
|
||||
* io_uring_queue_async_work - called before submitting a new async work
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @req: pointer to a submitted request
|
||||
* @user_data: user data associated with the request
|
||||
* @opcode: opcode of request
|
||||
* @flags request flags
|
||||
* @work: pointer to a submitted io_wq_work
|
||||
* @rw: type of workqueue, hashed or normal
|
||||
*
|
||||
* Allows to trace asynchronous work submission.
|
||||
*/
|
||||
TRACE_EVENT(io_uring_queue_async_work,
|
||||
|
||||
TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode,
|
||||
unsigned int flags, struct io_wq_work *work, int rw),
|
||||
TP_PROTO(struct io_kiocb *req, int rw),
|
||||
|
||||
TP_ARGS(ctx, req, user_data, opcode, flags, work, rw),
|
||||
TP_ARGS(req, rw),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -159,19 +152,19 @@ TRACE_EVENT(io_uring_queue_async_work,
|
||||
__field( struct io_wq_work *, work )
|
||||
__field( int, rw )
|
||||
|
||||
__string( op_str, io_uring_get_opcode(opcode) )
|
||||
__string( op_str, io_uring_get_opcode(req->opcode) )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->ctx = req->ctx;
|
||||
__entry->req = req;
|
||||
__entry->user_data = user_data;
|
||||
__entry->flags = flags;
|
||||
__entry->opcode = opcode;
|
||||
__entry->work = work;
|
||||
__entry->user_data = req->cqe.user_data;
|
||||
__entry->flags = req->flags;
|
||||
__entry->opcode = req->opcode;
|
||||
__entry->work = &req->work;
|
||||
__entry->rw = rw;
|
||||
|
||||
__assign_str(op_str, io_uring_get_opcode(opcode));
|
||||
__assign_str(op_str, io_uring_get_opcode(req->opcode));
|
||||
),
|
||||
|
||||
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
|
||||
@ -183,19 +176,16 @@ TRACE_EVENT(io_uring_queue_async_work,
|
||||
/**
|
||||
* io_uring_defer - called when an io_uring request is deferred
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @req: pointer to a deferred request
|
||||
* @user_data: user data associated with the request
|
||||
* @opcode: opcode of request
|
||||
*
|
||||
* Allows to track deferred requests, to get an insight about what requests are
|
||||
* not started immediately.
|
||||
*/
|
||||
TRACE_EVENT(io_uring_defer,
|
||||
|
||||
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode),
|
||||
TP_PROTO(struct io_kiocb *req),
|
||||
|
||||
TP_ARGS(ctx, req, user_data, opcode),
|
||||
TP_ARGS(req),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -203,16 +193,16 @@ TRACE_EVENT(io_uring_defer,
|
||||
__field( unsigned long long, data )
|
||||
__field( u8, opcode )
|
||||
|
||||
__string( op_str, io_uring_get_opcode(opcode) )
|
||||
__string( op_str, io_uring_get_opcode(req->opcode) )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->ctx = req->ctx;
|
||||
__entry->req = req;
|
||||
__entry->data = user_data;
|
||||
__entry->opcode = opcode;
|
||||
__entry->data = req->cqe.user_data;
|
||||
__entry->opcode = req->opcode;
|
||||
|
||||
__assign_str(op_str, io_uring_get_opcode(opcode));
|
||||
__assign_str(op_str, io_uring_get_opcode(req->opcode));
|
||||
),
|
||||
|
||||
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
|
||||
@ -224,7 +214,6 @@ TRACE_EVENT(io_uring_defer,
|
||||
* io_uring_link - called before the io_uring request added into link_list of
|
||||
* another request
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @req: pointer to a linked request
|
||||
* @target_req: pointer to a previous request, that would contain @req
|
||||
*
|
||||
@ -233,9 +222,9 @@ TRACE_EVENT(io_uring_defer,
|
||||
*/
|
||||
TRACE_EVENT(io_uring_link,
|
||||
|
||||
TP_PROTO(void *ctx, void *req, void *target_req),
|
||||
TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req),
|
||||
|
||||
TP_ARGS(ctx, req, target_req),
|
||||
TP_ARGS(req, target_req),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -244,7 +233,7 @@ TRACE_EVENT(io_uring_link,
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->ctx = req->ctx;
|
||||
__entry->req = req;
|
||||
__entry->target_req = target_req;
|
||||
),
|
||||
@ -285,10 +274,7 @@ TRACE_EVENT(io_uring_cqring_wait,
|
||||
/**
|
||||
* io_uring_fail_link - called before failing a linked request
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @req: request, which links were cancelled
|
||||
* @user_data: user data associated with the request
|
||||
* @opcode: opcode of request
|
||||
* @link: cancelled link
|
||||
*
|
||||
* Allows to track linked requests cancellation, to see not only that some work
|
||||
@ -296,9 +282,9 @@ TRACE_EVENT(io_uring_cqring_wait,
|
||||
*/
|
||||
TRACE_EVENT(io_uring_fail_link,
|
||||
|
||||
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link),
|
||||
TP_PROTO(struct io_kiocb *req, struct io_kiocb *link),
|
||||
|
||||
TP_ARGS(ctx, req, user_data, opcode, link),
|
||||
TP_ARGS(req, link),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -307,17 +293,17 @@ TRACE_EVENT(io_uring_fail_link,
|
||||
__field( u8, opcode )
|
||||
__field( void *, link )
|
||||
|
||||
__string( op_str, io_uring_get_opcode(opcode) )
|
||||
__string( op_str, io_uring_get_opcode(req->opcode) )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->ctx = req->ctx;
|
||||
__entry->req = req;
|
||||
__entry->user_data = user_data;
|
||||
__entry->opcode = opcode;
|
||||
__entry->user_data = req->cqe.user_data;
|
||||
__entry->opcode = req->opcode;
|
||||
__entry->link = link;
|
||||
|
||||
__assign_str(op_str, io_uring_get_opcode(opcode));
|
||||
__assign_str(op_str, io_uring_get_opcode(req->opcode));
|
||||
),
|
||||
|
||||
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
|
||||
@ -376,23 +362,17 @@ TRACE_EVENT(io_uring_complete,
|
||||
/**
|
||||
* io_uring_submit_sqe - called before submitting one SQE
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @req: pointer to a submitted request
|
||||
* @user_data: user data associated with the request
|
||||
* @opcode: opcode of request
|
||||
* @flags request flags
|
||||
* @force_nonblock: whether a context blocking or not
|
||||
* @sq_thread: true if sq_thread has submitted this SQE
|
||||
*
|
||||
* Allows to track SQE submitting, to understand what was the source of it, SQ
|
||||
* thread or io_uring_enter call.
|
||||
*/
|
||||
TRACE_EVENT(io_uring_submit_sqe,
|
||||
|
||||
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags,
|
||||
bool force_nonblock, bool sq_thread),
|
||||
TP_PROTO(struct io_kiocb *req, bool force_nonblock),
|
||||
|
||||
TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread),
|
||||
TP_ARGS(req, force_nonblock),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -403,19 +383,19 @@ TRACE_EVENT(io_uring_submit_sqe,
|
||||
__field( bool, force_nonblock )
|
||||
__field( bool, sq_thread )
|
||||
|
||||
__string( op_str, io_uring_get_opcode(opcode) )
|
||||
__string( op_str, io_uring_get_opcode(req->opcode) )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->ctx = req->ctx;
|
||||
__entry->req = req;
|
||||
__entry->user_data = user_data;
|
||||
__entry->opcode = opcode;
|
||||
__entry->flags = flags;
|
||||
__entry->user_data = req->cqe.user_data;
|
||||
__entry->opcode = req->opcode;
|
||||
__entry->flags = req->flags;
|
||||
__entry->force_nonblock = force_nonblock;
|
||||
__entry->sq_thread = sq_thread;
|
||||
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
|
||||
|
||||
__assign_str(op_str, io_uring_get_opcode(opcode));
|
||||
__assign_str(op_str, io_uring_get_opcode(req->opcode));
|
||||
),
|
||||
|
||||
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
|
||||
@ -427,10 +407,7 @@ TRACE_EVENT(io_uring_submit_sqe,
|
||||
/*
|
||||
* io_uring_poll_arm - called after arming a poll wait if successful
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @req: pointer to the armed request
|
||||
* @user_data: user data associated with the request
|
||||
* @opcode: opcode of request
|
||||
* @mask: request poll events mask
|
||||
* @events: registered events of interest
|
||||
*
|
||||
@ -439,10 +416,9 @@ TRACE_EVENT(io_uring_submit_sqe,
|
||||
*/
|
||||
TRACE_EVENT(io_uring_poll_arm,
|
||||
|
||||
TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode,
|
||||
int mask, int events),
|
||||
TP_PROTO(struct io_kiocb *req, int mask, int events),
|
||||
|
||||
TP_ARGS(ctx, req, user_data, opcode, mask, events),
|
||||
TP_ARGS(req, mask, events),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -452,18 +428,18 @@ TRACE_EVENT(io_uring_poll_arm,
|
||||
__field( int, mask )
|
||||
__field( int, events )
|
||||
|
||||
__string( op_str, io_uring_get_opcode(opcode) )
|
||||
__string( op_str, io_uring_get_opcode(req->opcode) )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->ctx = req->ctx;
|
||||
__entry->req = req;
|
||||
__entry->user_data = user_data;
|
||||
__entry->opcode = opcode;
|
||||
__entry->user_data = req->cqe.user_data;
|
||||
__entry->opcode = req->opcode;
|
||||
__entry->mask = mask;
|
||||
__entry->events = events;
|
||||
|
||||
__assign_str(op_str, io_uring_get_opcode(opcode));
|
||||
__assign_str(op_str, io_uring_get_opcode(req->opcode));
|
||||
),
|
||||
|
||||
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
|
||||
@ -475,18 +451,15 @@ TRACE_EVENT(io_uring_poll_arm,
|
||||
/*
|
||||
* io_uring_task_add - called after adding a task
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @req: pointer to request
|
||||
* @user_data: user data associated with the request
|
||||
* @opcode: opcode of request
|
||||
* @mask: request poll events mask
|
||||
*
|
||||
*/
|
||||
TRACE_EVENT(io_uring_task_add,
|
||||
|
||||
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask),
|
||||
TP_PROTO(struct io_kiocb *req, int mask),
|
||||
|
||||
TP_ARGS(ctx, req, user_data, opcode, mask),
|
||||
TP_ARGS(req, mask),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -495,17 +468,17 @@ TRACE_EVENT(io_uring_task_add,
|
||||
__field( u8, opcode )
|
||||
__field( int, mask )
|
||||
|
||||
__string( op_str, io_uring_get_opcode(opcode) )
|
||||
__string( op_str, io_uring_get_opcode(req->opcode) )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->ctx = req->ctx;
|
||||
__entry->req = req;
|
||||
__entry->user_data = user_data;
|
||||
__entry->opcode = opcode;
|
||||
__entry->user_data = req->cqe.user_data;
|
||||
__entry->opcode = req->opcode;
|
||||
__entry->mask = mask;
|
||||
|
||||
__assign_str(op_str, io_uring_get_opcode(opcode));
|
||||
__assign_str(op_str, io_uring_get_opcode(req->opcode));
|
||||
),
|
||||
|
||||
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
|
||||
@ -518,7 +491,6 @@ TRACE_EVENT(io_uring_task_add,
|
||||
* io_uring_req_failed - called when an sqe is errored dring submission
|
||||
*
|
||||
* @sqe: pointer to the io_uring_sqe that failed
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @req: pointer to request
|
||||
* @error: error it failed with
|
||||
*
|
||||
@ -526,9 +498,9 @@ TRACE_EVENT(io_uring_task_add,
|
||||
*/
|
||||
TRACE_EVENT(io_uring_req_failed,
|
||||
|
||||
TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error),
|
||||
TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error),
|
||||
|
||||
TP_ARGS(sqe, ctx, req, error),
|
||||
TP_ARGS(sqe, req, error),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -552,7 +524,7 @@ TRACE_EVENT(io_uring_req_failed,
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->ctx = req->ctx;
|
||||
__entry->req = req;
|
||||
__entry->user_data = sqe->user_data;
|
||||
__entry->opcode = sqe->opcode;
|
||||
@ -622,12 +594,42 @@ TRACE_EVENT(io_uring_cqe_overflow,
|
||||
__entry->ocqe = ocqe;
|
||||
),
|
||||
|
||||
TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, "
|
||||
TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, "
|
||||
"overflow_cqe %p",
|
||||
__entry->ctx, __entry->user_data, __entry->res,
|
||||
__entry->cflags, __entry->ocqe)
|
||||
);
|
||||
|
||||
/*
|
||||
* io_uring_task_work_run - ran task work
|
||||
*
|
||||
* @tctx: pointer to a io_uring_task
|
||||
* @count: how many functions it ran
|
||||
* @loops: how many loops it ran
|
||||
*
|
||||
*/
|
||||
TRACE_EVENT(io_uring_task_work_run,
|
||||
|
||||
TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
|
||||
|
||||
TP_ARGS(tctx, count, loops),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, tctx )
|
||||
__field( unsigned int, count )
|
||||
__field( unsigned int, loops )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->tctx = tctx;
|
||||
__entry->count = count;
|
||||
__entry->loops = loops;
|
||||
),
|
||||
|
||||
TP_printk("tctx %p, count %u, loops %u",
|
||||
__entry->tctx, __entry->count, __entry->loops)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_IO_URING_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
@ -10,6 +10,7 @@
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/time_types.h>
|
||||
|
||||
/*
|
||||
* IO submission data structure (Submission Queue Entry)
|
||||
@ -50,6 +51,7 @@ struct io_uring_sqe {
|
||||
__u32 unlink_flags;
|
||||
__u32 hardlink_flags;
|
||||
__u32 xattr_flags;
|
||||
__u32 msg_ring_flags;
|
||||
};
|
||||
__u64 user_data; /* data to be passed back at completion time */
|
||||
/* pack this to avoid bogus arm OABI complaints */
|
||||
@ -140,9 +142,12 @@ enum {
|
||||
* IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
|
||||
*/
|
||||
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
|
||||
|
||||
#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */
|
||||
#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */
|
||||
/*
|
||||
* Only one task is allowed to submit requests
|
||||
*/
|
||||
#define IORING_SETUP_SINGLE_ISSUER (1U << 12)
|
||||
|
||||
enum io_uring_op {
|
||||
IORING_OP_NOP,
|
||||
@ -229,10 +234,13 @@ enum io_uring_op {
|
||||
*
|
||||
* IORING_POLL_UPDATE Update existing poll request, matching
|
||||
* sqe->addr as the old user_data field.
|
||||
*
|
||||
* IORING_POLL_LEVEL Level triggered poll.
|
||||
*/
|
||||
#define IORING_POLL_ADD_MULTI (1U << 0)
|
||||
#define IORING_POLL_UPDATE_EVENTS (1U << 1)
|
||||
#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
|
||||
#define IORING_POLL_ADD_LEVEL (1U << 3)
|
||||
|
||||
/*
|
||||
* ASYNC_CANCEL flags.
|
||||
@ -241,10 +249,12 @@ enum io_uring_op {
|
||||
* IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the
|
||||
* request 'user_data'
|
||||
* IORING_ASYNC_CANCEL_ANY Match any request
|
||||
* IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor
|
||||
*/
|
||||
#define IORING_ASYNC_CANCEL_ALL (1U << 0)
|
||||
#define IORING_ASYNC_CANCEL_FD (1U << 1)
|
||||
#define IORING_ASYNC_CANCEL_ANY (1U << 2)
|
||||
#define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3)
|
||||
|
||||
/*
|
||||
* send/sendmsg and recv/recvmsg flags (sqe->ioprio)
|
||||
@ -253,14 +263,35 @@ enum io_uring_op {
|
||||
* or receive and arm poll if that yields an
|
||||
* -EAGAIN result, arm poll upfront and skip
|
||||
* the initial transfer attempt.
|
||||
*
|
||||
* IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if
|
||||
* the handler will continue to report
|
||||
* CQEs on behalf of the same SQE.
|
||||
*/
|
||||
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
|
||||
#define IORING_RECV_MULTISHOT (1U << 1)
|
||||
|
||||
/*
|
||||
* accept flags stored in sqe->ioprio
|
||||
*/
|
||||
#define IORING_ACCEPT_MULTISHOT (1U << 0)
|
||||
|
||||
/*
|
||||
* IORING_OP_MSG_RING command types, stored in sqe->addr
|
||||
*/
|
||||
enum {
|
||||
IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */
|
||||
IORING_MSG_SEND_FD, /* send a registered fd to another ring */
|
||||
};
|
||||
|
||||
/*
|
||||
* IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
|
||||
*
|
||||
* IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not
|
||||
* applicable for IORING_MSG_DATA, obviously.
|
||||
*/
|
||||
#define IORING_MSG_RING_CQE_SKIP (1U << 0)
|
||||
|
||||
/*
|
||||
* IO completion data structure (Completion Queue Entry)
|
||||
*/
|
||||
@ -420,6 +451,12 @@ enum {
|
||||
IORING_REGISTER_PBUF_RING = 22,
|
||||
IORING_UNREGISTER_PBUF_RING = 23,
|
||||
|
||||
/* sync cancelation API */
|
||||
IORING_REGISTER_SYNC_CANCEL = 24,
|
||||
|
||||
/* register a range of fixed file slots for automatic slot allocation */
|
||||
IORING_REGISTER_FILE_ALLOC_RANGE = 25,
|
||||
|
||||
/* this goes last */
|
||||
IORING_REGISTER_LAST
|
||||
};
|
||||
@ -483,7 +520,7 @@ struct io_uring_probe {
|
||||
__u8 ops_len; /* length of ops[] array below */
|
||||
__u16 resv;
|
||||
__u32 resv2[3];
|
||||
struct io_uring_probe_op ops[0];
|
||||
struct io_uring_probe_op ops[];
|
||||
};
|
||||
|
||||
struct io_uring_restriction {
|
||||
@ -555,4 +592,32 @@ struct io_uring_getevents_arg {
|
||||
__u64 ts;
|
||||
};
|
||||
|
||||
/*
|
||||
* Argument for IORING_REGISTER_SYNC_CANCEL
|
||||
*/
|
||||
struct io_uring_sync_cancel_reg {
|
||||
__u64 addr;
|
||||
__s32 fd;
|
||||
__u32 flags;
|
||||
struct __kernel_timespec timeout;
|
||||
__u64 pad[4];
|
||||
};
|
||||
|
||||
/*
|
||||
* Argument for IORING_REGISTER_FILE_ALLOC_RANGE
|
||||
* The range is specified as [off, off + len)
|
||||
*/
|
||||
struct io_uring_file_index_range {
|
||||
__u32 off;
|
||||
__u32 len;
|
||||
__u64 resv;
|
||||
};
|
||||
|
||||
struct io_uring_recvmsg_out {
|
||||
__u32 namelen;
|
||||
__u32 controllen;
|
||||
__u32 payloadlen;
|
||||
__u32 flags;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
11
io_uring/Makefile
Normal file
11
io_uring/Makefile
Normal file
@ -0,0 +1,11 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# Makefile for io_uring
|
||||
|
||||
obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
|
||||
sync.o advise.o filetable.o \
|
||||
openclose.o uring_cmd.o epoll.o \
|
||||
statx.o net.o msg_ring.o timeout.o \
|
||||
sqpoll.o fdinfo.o tctx.o poll.o \
|
||||
cancel.o kbuf.o rsrc.o rw.o opdef.o
|
||||
obj-$(CONFIG_IO_WQ) += io-wq.o
|
99
io_uring/advise.c
Normal file
99
io_uring/advise.c
Normal file
@ -0,0 +1,99 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/fadvise.h>
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "advise.h"
|
||||
|
||||
struct io_fadvise {
|
||||
struct file *file;
|
||||
u64 offset;
|
||||
u32 len;
|
||||
u32 advice;
|
||||
};
|
||||
|
||||
struct io_madvise {
|
||||
struct file *file;
|
||||
u64 addr;
|
||||
u32 len;
|
||||
u32 advice;
|
||||
};
|
||||
|
||||
int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
|
||||
struct io_madvise *ma = io_kiocb_to_cmd(req);
|
||||
|
||||
if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
|
||||
ma->addr = READ_ONCE(sqe->addr);
|
||||
ma->len = READ_ONCE(sqe->len);
|
||||
ma->advice = READ_ONCE(sqe->fadvise_advice);
|
||||
return 0;
|
||||
#else
|
||||
return -EOPNOTSUPP;
|
||||
#endif
|
||||
}
|
||||
|
||||
int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
|
||||
struct io_madvise *ma = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
#else
|
||||
return -EOPNOTSUPP;
|
||||
#endif
|
||||
}
|
||||
|
||||
int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_fadvise *fa = io_kiocb_to_cmd(req);
|
||||
|
||||
if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
|
||||
fa->offset = READ_ONCE(sqe->off);
|
||||
fa->len = READ_ONCE(sqe->len);
|
||||
fa->advice = READ_ONCE(sqe->fadvise_advice);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_fadvise *fa = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK) {
|
||||
switch (fa->advice) {
|
||||
case POSIX_FADV_NORMAL:
|
||||
case POSIX_FADV_RANDOM:
|
||||
case POSIX_FADV_SEQUENTIAL:
|
||||
break;
|
||||
default:
|
||||
return -EAGAIN;
|
||||
}
|
||||
}
|
||||
|
||||
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
7
io_uring/advise.h
Normal file
7
io_uring/advise.h
Normal file
@ -0,0 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_madvise(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_fadvise(struct io_kiocb *req, unsigned int issue_flags);
|
53
io_uring/alloc_cache.h
Normal file
53
io_uring/alloc_cache.h
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef IOU_ALLOC_CACHE_H
|
||||
#define IOU_ALLOC_CACHE_H
|
||||
|
||||
/*
|
||||
* Don't allow the cache to grow beyond this size.
|
||||
*/
|
||||
#define IO_ALLOC_CACHE_MAX 512
|
||||
|
||||
struct io_cache_entry {
|
||||
struct hlist_node node;
|
||||
};
|
||||
|
||||
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
|
||||
struct io_cache_entry *entry)
|
||||
{
|
||||
if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
|
||||
cache->nr_cached++;
|
||||
hlist_add_head(&entry->node, &cache->list);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
|
||||
{
|
||||
if (!hlist_empty(&cache->list)) {
|
||||
struct hlist_node *node = cache->list.first;
|
||||
|
||||
hlist_del(node);
|
||||
return container_of(node, struct io_cache_entry, node);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
|
||||
{
|
||||
INIT_HLIST_HEAD(&cache->list);
|
||||
cache->nr_cached = 0;
|
||||
}
|
||||
|
||||
static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
|
||||
void (*free)(struct io_cache_entry *))
|
||||
{
|
||||
while (!hlist_empty(&cache->list)) {
|
||||
struct hlist_node *node = cache->list.first;
|
||||
|
||||
hlist_del(node);
|
||||
free(container_of(node, struct io_cache_entry, node));
|
||||
}
|
||||
cache->nr_cached = 0;
|
||||
}
|
||||
#endif
|
315
io_uring/cancel.c
Normal file
315
io_uring/cancel.c
Normal file
@ -0,0 +1,315 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/nospec.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "tctx.h"
|
||||
#include "poll.h"
|
||||
#include "timeout.h"
|
||||
#include "cancel.h"
|
||||
|
||||
struct io_cancel {
|
||||
struct file *file;
|
||||
u64 addr;
|
||||
u32 flags;
|
||||
s32 fd;
|
||||
};
|
||||
|
||||
#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
|
||||
IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED)
|
||||
|
||||
static bool io_cancel_cb(struct io_wq_work *work, void *data)
|
||||
{
|
||||
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
|
||||
struct io_cancel_data *cd = data;
|
||||
|
||||
if (req->ctx != cd->ctx)
|
||||
return false;
|
||||
if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
|
||||
;
|
||||
} else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
|
||||
if (req->file != cd->file)
|
||||
return false;
|
||||
} else {
|
||||
if (req->cqe.user_data != cd->data)
|
||||
return false;
|
||||
}
|
||||
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
|
||||
if (cd->seq == req->work.cancel_seq)
|
||||
return false;
|
||||
req->work.cancel_seq = cd->seq;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static int io_async_cancel_one(struct io_uring_task *tctx,
|
||||
struct io_cancel_data *cd)
|
||||
{
|
||||
enum io_wq_cancel cancel_ret;
|
||||
int ret = 0;
|
||||
bool all;
|
||||
|
||||
if (!tctx || !tctx->io_wq)
|
||||
return -ENOENT;
|
||||
|
||||
all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
|
||||
cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);
|
||||
switch (cancel_ret) {
|
||||
case IO_WQ_CANCEL_OK:
|
||||
ret = 0;
|
||||
break;
|
||||
case IO_WQ_CANCEL_RUNNING:
|
||||
ret = -EALREADY;
|
||||
break;
|
||||
case IO_WQ_CANCEL_NOTFOUND:
|
||||
ret = -ENOENT;
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
struct io_ring_ctx *ctx = cd->ctx;
|
||||
int ret;
|
||||
|
||||
WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring);
|
||||
|
||||
ret = io_async_cancel_one(tctx, cd);
|
||||
/*
|
||||
* Fall-through even for -EALREADY, as we may have poll armed
|
||||
* that need unarming.
|
||||
*/
|
||||
if (!ret)
|
||||
return 0;
|
||||
|
||||
ret = io_poll_cancel(ctx, cd, issue_flags);
|
||||
if (ret != -ENOENT)
|
||||
return ret;
|
||||
|
||||
spin_lock(&ctx->completion_lock);
|
||||
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
|
||||
ret = io_timeout_cancel(ctx, cd);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_cancel *cancel = io_kiocb_to_cmd(req);
|
||||
|
||||
if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
|
||||
return -EINVAL;
|
||||
if (sqe->off || sqe->len || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
|
||||
cancel->addr = READ_ONCE(sqe->addr);
|
||||
cancel->flags = READ_ONCE(sqe->cancel_flags);
|
||||
if (cancel->flags & ~CANCEL_FLAGS)
|
||||
return -EINVAL;
|
||||
if (cancel->flags & IORING_ASYNC_CANCEL_FD) {
|
||||
if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
|
||||
return -EINVAL;
|
||||
cancel->fd = READ_ONCE(sqe->fd);
|
||||
}
|
||||
|
||||
	return 0;
}

static int __io_async_cancel(struct io_cancel_data *cd,
			     struct io_uring_task *tctx,
			     unsigned int issue_flags)
{
	bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
	struct io_ring_ctx *ctx = cd->ctx;
	struct io_tctx_node *node;
	int ret, nr = 0;

	do {
		ret = io_try_cancel(tctx, cd, issue_flags);
		if (ret == -ENOENT)
			break;
		if (!all)
			return ret;
		nr++;
	} while (1);

	/* slow path, try all io-wq's */
	io_ring_submit_lock(ctx, issue_flags);
	ret = -ENOENT;
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		ret = io_async_cancel_one(tctx, cd);
		if (ret != -ENOENT) {
			if (!all)
				break;
			nr++;
		}
	}
	io_ring_submit_unlock(ctx, issue_flags);
	return all ? nr : ret;
}

int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_cancel *cancel = io_kiocb_to_cmd(req);
	struct io_cancel_data cd = {
		.ctx	= req->ctx,
		.data	= cancel->addr,
		.flags	= cancel->flags,
		.seq	= atomic_inc_return(&req->ctx->cancel_seq),
	};
	struct io_uring_task *tctx = req->task->io_uring;
	int ret;

	if (cd.flags & IORING_ASYNC_CANCEL_FD) {
		if (req->flags & REQ_F_FIXED_FILE ||
		    cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) {
			req->flags |= REQ_F_FIXED_FILE;
			req->file = io_file_get_fixed(req, cancel->fd,
							issue_flags);
		} else {
			req->file = io_file_get_normal(req, cancel->fd);
		}
		if (!req->file) {
			ret = -EBADF;
			goto done;
		}
		cd.file = req->file;
	}

	ret = __io_async_cancel(&cd, tctx, issue_flags);
done:
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

void init_hash_table(struct io_hash_table *table, unsigned size)
{
	unsigned int i;

	for (i = 0; i < size; i++) {
		spin_lock_init(&table->hbs[i].lock);
		INIT_HLIST_HEAD(&table->hbs[i].list);
	}
}

static int __io_sync_cancel(struct io_uring_task *tctx,
			    struct io_cancel_data *cd, int fd)
{
	struct io_ring_ctx *ctx = cd->ctx;

	/* fixed must be grabbed every time since we drop the uring_lock */
	if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
	    (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
		unsigned long file_ptr;

		if (unlikely(fd > ctx->nr_user_files))
			return -EBADF;
		fd = array_index_nospec(fd, ctx->nr_user_files);
		file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
		cd->file = (struct file *) (file_ptr & FFS_MASK);
		if (!cd->file)
			return -EBADF;
	}

	return __io_async_cancel(cd, tctx, 0);
}

int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_cancel_data cd = {
		.ctx	= ctx,
		.seq	= atomic_inc_return(&ctx->cancel_seq),
	};
	ktime_t timeout = KTIME_MAX;
	struct io_uring_sync_cancel_reg sc;
	struct fd f = { };
	DEFINE_WAIT(wait);
	int ret;

	if (copy_from_user(&sc, arg, sizeof(sc)))
		return -EFAULT;
	if (sc.flags & ~CANCEL_FLAGS)
		return -EINVAL;
	if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3])
		return -EINVAL;

	cd.data = sc.addr;
	cd.flags = sc.flags;

	/* we can grab a normal file descriptor upfront */
	if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
	   !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
		f = fdget(sc.fd);
		if (!f.file)
			return -EBADF;
		cd.file = f.file;
	}

	ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);

	/* found something, done! */
	if (ret != -EALREADY)
		goto out;

	if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) {
		struct timespec64 ts = {
			.tv_sec		= sc.timeout.tv_sec,
			.tv_nsec	= sc.timeout.tv_nsec
		};

		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
	}

	/*
	 * Keep looking until we get -ENOENT. we'll get woken every time
	 * a request completes and will retry the cancelation.
	 */
	do {
		cd.seq = atomic_inc_return(&ctx->cancel_seq);

		prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE);

		ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);

		if (ret != -EALREADY)
			break;

		mutex_unlock(&ctx->uring_lock);
		ret = io_run_task_work_sig();
		if (ret < 0) {
			mutex_lock(&ctx->uring_lock);
			break;
		}
		ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS);
		mutex_lock(&ctx->uring_lock);
		if (!ret) {
			ret = -ETIME;
			break;
		}
	} while (1);

	finish_wait(&ctx->cq_wait, &wait);

	if (ret == -ENOENT || ret > 0)
		ret = 0;
out:
	fdput(f);
	return ret;
}
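The synchronous cancel path above is not driven by an SQE: userspace hands a struct io_uring_sync_cancel_reg to io_uring_register() and io_sync_cancel() keeps retrying until nothing matches, the timeout expires, or a signal arrives. Below is a minimal userspace sketch; the field names (addr, fd, flags, timeout, pad) mirror what the kernel validates here, while the register opcode name IORING_REGISTER_SYNC_CANCEL and the exact uapi struct layout are assumptions not shown in this hunk.

/*
 * Hypothetical sketch: synchronously cancel whatever matches a user_data
 * value, waiting at most 1ms. Assumes a uapi header that already provides
 * struct io_uring_sync_cancel_reg and IORING_REGISTER_SYNC_CANCEL.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int sync_cancel_by_user_data(int ring_fd, __u64 user_data)
{
	struct io_uring_sync_cancel_reg reg;

	memset(&reg, 0, sizeof(reg));		/* pad[] must be zero */
	reg.addr = user_data;			/* match on user_data */
	reg.fd = -1;				/* unused without IORING_ASYNC_CANCEL_FD */
	reg.flags = 0;				/* or IORING_ASYNC_CANCEL_ALL etc. */
	reg.timeout.tv_sec = 0;			/* tv_sec/tv_nsec of -1 means "no timeout" */
	reg.timeout.tv_nsec = 1000000;

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_SYNC_CANCEL, &reg, 1);
}

A return of 0 means nothing matching remained pending; an expired timeout surfaces as ETIME through the usual errno path.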
io_uring/cancel.h (new file, 23 lines)
@@ -0,0 +1,23 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/io_uring_types.h>

struct io_cancel_data {
	struct io_ring_ctx *ctx;
	union {
		u64 data;
		struct file *file;
	};
	u32 flags;
	int seq;
};

int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);

int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
		  unsigned int issue_flags);
void init_hash_table(struct io_hash_table *table, unsigned size);

int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
io_uring/epoll.c (new file, 65 lines)
@@ -0,0 +1,65 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/io_uring.h>
#include <linux/eventpoll.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "epoll.h"

#if defined(CONFIG_EPOLL)
struct io_epoll {
	struct file *file;
	int epfd;
	int op;
	int fd;
	struct epoll_event event;
};

int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_epoll *epoll = io_kiocb_to_cmd(req);

	pr_warn_once("%s: epoll_ctl support in io_uring is deprecated and will "
		     "be removed in a future Linux kernel version.\n",
		     current->comm);

	if (sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;

	epoll->epfd = READ_ONCE(sqe->fd);
	epoll->op = READ_ONCE(sqe->len);
	epoll->fd = READ_ONCE(sqe->off);

	if (ep_op_has_event(epoll->op)) {
		struct epoll_event __user *ev;

		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
		if (copy_from_user(&epoll->event, ev, sizeof(*ev)))
			return -EFAULT;
	}

	return 0;
}

int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_epoll *ie = io_kiocb_to_cmd(req);
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
	if (force_nonblock && ret == -EAGAIN)
		return -EAGAIN;

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
#endif
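io_epoll_ctl_prep() above fixes the SQE layout for the (now deprecated) IORING_OP_EPOLL_CTL opcode: sqe->fd carries the epoll instance, sqe->len the EPOLL_CTL_* operation, sqe->off the target descriptor, and sqe->addr a pointer to the epoll_event. A hedged sketch of filling such an SQE by hand, purely to illustrate that mapping (the helper name is hypothetical):

#include <string.h>
#include <sys/epoll.h>
#include <linux/io_uring.h>

/* hypothetical helper: mirrors the field mapping in io_epoll_ctl_prep() */
static void prep_epoll_ctl_sqe(struct io_uring_sqe *sqe, int epfd, int op,
			       int fd, struct epoll_event *ev)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_EPOLL_CTL;
	sqe->fd = epfd;				/* read back as epoll->epfd */
	sqe->len = op;				/* read back as epoll->op */
	sqe->off = fd;				/* read back as epoll->fd */
	sqe->addr = (unsigned long) ev;		/* copied into epoll->event */
}

Since the prep path now warns once per task, new code should prefer plain epoll_ctl(2) or poll-based io_uring requests.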
io_uring/epoll.h (new file, 6 lines)
@@ -0,0 +1,6 @@
// SPDX-License-Identifier: GPL-2.0

#if defined(CONFIG_EPOLL)
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags);
#endif
io_uring/fdinfo.c (new file, 194 lines)
@@ -0,0 +1,194 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "sqpoll.h"
|
||||
#include "fdinfo.h"
|
||||
#include "cancel.h"
|
||||
#include "rsrc.h"
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
|
||||
const struct cred *cred)
|
||||
{
|
||||
struct user_namespace *uns = seq_user_ns(m);
|
||||
struct group_info *gi;
|
||||
kernel_cap_t cap;
|
||||
unsigned __capi;
|
||||
int g;
|
||||
|
||||
seq_printf(m, "%5d\n", id);
|
||||
seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
|
||||
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
|
||||
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
|
||||
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
|
||||
seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
|
||||
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
|
||||
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
|
||||
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
|
||||
seq_puts(m, "\n\tGroups:\t");
|
||||
gi = cred->group_info;
|
||||
for (g = 0; g < gi->ngroups; g++) {
|
||||
seq_put_decimal_ull(m, g ? " " : "",
|
||||
from_kgid_munged(uns, gi->gid[g]));
|
||||
}
|
||||
seq_puts(m, "\n\tCapEff:\t");
|
||||
cap = cred->cap_effective;
|
||||
CAP_FOR_EACH_U32(__capi)
|
||||
seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
|
||||
seq_putc(m, '\n');
|
||||
return 0;
|
||||
}
|
||||
|
||||
static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
|
||||
struct seq_file *m)
|
||||
{
|
||||
struct io_sq_data *sq = NULL;
|
||||
struct io_overflow_cqe *ocqe;
|
||||
struct io_rings *r = ctx->rings;
|
||||
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
|
||||
unsigned int sq_head = READ_ONCE(r->sq.head);
|
||||
unsigned int sq_tail = READ_ONCE(r->sq.tail);
|
||||
unsigned int cq_head = READ_ONCE(r->cq.head);
|
||||
unsigned int cq_tail = READ_ONCE(r->cq.tail);
|
||||
unsigned int cq_shift = 0;
|
||||
unsigned int sq_entries, cq_entries;
|
||||
bool has_lock;
|
||||
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
|
||||
unsigned int i;
|
||||
|
||||
if (is_cqe32)
|
||||
cq_shift = 1;
|
||||
|
||||
/*
|
||||
* we may get imprecise sqe and cqe info if uring is actively running
|
||||
* since we get cached_sq_head and cached_cq_tail without uring_lock
|
||||
* and sq_tail and cq_head are changed by userspace. But it's ok since
|
||||
* we usually use these info when it is stuck.
|
||||
*/
|
||||
seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
|
||||
seq_printf(m, "SqHead:\t%u\n", sq_head);
|
||||
seq_printf(m, "SqTail:\t%u\n", sq_tail);
|
||||
seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
|
||||
seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
|
||||
seq_printf(m, "CqHead:\t%u\n", cq_head);
|
||||
seq_printf(m, "CqTail:\t%u\n", cq_tail);
|
||||
seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
|
||||
seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
|
||||
sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
|
||||
for (i = 0; i < sq_entries; i++) {
|
||||
unsigned int entry = i + sq_head;
|
||||
unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
|
||||
struct io_uring_sqe *sqe;
|
||||
|
||||
if (sq_idx > sq_mask)
|
||||
continue;
|
||||
sqe = &ctx->sq_sqes[sq_idx];
|
||||
seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
|
||||
sq_idx, sqe->opcode, sqe->fd, sqe->flags,
|
||||
sqe->user_data);
|
||||
}
|
||||
seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
|
||||
cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
|
||||
for (i = 0; i < cq_entries; i++) {
|
||||
unsigned int entry = i + cq_head;
|
||||
struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
|
||||
|
||||
if (!is_cqe32) {
|
||||
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
|
||||
entry & cq_mask, cqe->user_data, cqe->res,
|
||||
cqe->flags);
|
||||
} else {
|
||||
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
|
||||
"extra1:%llu, extra2:%llu\n",
|
||||
entry & cq_mask, cqe->user_data, cqe->res,
|
||||
cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
|
||||
* since fdinfo case grabs it in the opposite direction of normal use
|
||||
* cases. If we fail to get the lock, we just don't iterate any
|
||||
* structures that could be going away outside the io_uring mutex.
|
||||
*/
|
||||
has_lock = mutex_trylock(&ctx->uring_lock);
|
||||
|
||||
if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
|
||||
sq = ctx->sq_data;
|
||||
if (!sq->thread)
|
||||
sq = NULL;
|
||||
}
|
||||
|
||||
seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
|
||||
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
|
||||
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
|
||||
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
|
||||
struct file *f = io_file_from_index(&ctx->file_table, i);
|
||||
|
||||
if (f)
|
||||
seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
|
||||
else
|
||||
seq_printf(m, "%5u: <none>\n", i);
|
||||
}
|
||||
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
|
||||
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
|
||||
struct io_mapped_ubuf *buf = ctx->user_bufs[i];
|
||||
unsigned int len = buf->ubuf_end - buf->ubuf;
|
||||
|
||||
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
|
||||
}
|
||||
if (has_lock && !xa_empty(&ctx->personalities)) {
|
||||
unsigned long index;
|
||||
const struct cred *cred;
|
||||
|
||||
seq_printf(m, "Personalities:\n");
|
||||
xa_for_each(&ctx->personalities, index, cred)
|
||||
io_uring_show_cred(m, index, cred);
|
||||
}
|
||||
if (has_lock)
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
|
||||
seq_puts(m, "PollList:\n");
|
||||
for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
|
||||
struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
|
||||
struct io_kiocb *req;
|
||||
|
||||
spin_lock(&hb->lock);
|
||||
hlist_for_each_entry(req, &hb->list, hash_node)
|
||||
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
|
||||
task_work_pending(req->task));
|
||||
spin_unlock(&hb->lock);
|
||||
}
|
||||
|
||||
seq_puts(m, "CqOverflowList:\n");
|
||||
spin_lock(&ctx->completion_lock);
|
||||
list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
|
||||
struct io_uring_cqe *cqe = &ocqe->cqe;
|
||||
|
||||
seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
|
||||
cqe->user_data, cqe->res, cqe->flags);
|
||||
|
||||
}
|
||||
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
}
|
||||
|
||||
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
|
||||
{
|
||||
struct io_ring_ctx *ctx = f->private_data;
|
||||
|
||||
if (percpu_ref_tryget(&ctx->refs)) {
|
||||
__io_uring_show_fdinfo(ctx, m);
|
||||
percpu_ref_put(&ctx->refs);
|
||||
}
|
||||
}
|
||||
#endif
|
io_uring/fdinfo.h (new file, 3 lines)
@@ -0,0 +1,3 @@
// SPDX-License-Identifier: GPL-2.0

void io_uring_show_fdinfo(struct seq_file *m, struct file *f);
io_uring/filetable.c (new file, 193 lines)
@@ -0,0 +1,193 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/nospec.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "rsrc.h"
|
||||
#include "filetable.h"
|
||||
|
||||
static int io_file_bitmap_get(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_file_table *table = &ctx->file_table;
|
||||
unsigned long nr = ctx->file_alloc_end;
|
||||
int ret;
|
||||
|
||||
do {
|
||||
ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
|
||||
if (ret != nr)
|
||||
return ret;
|
||||
|
||||
if (table->alloc_hint == ctx->file_alloc_start)
|
||||
break;
|
||||
nr = table->alloc_hint;
|
||||
table->alloc_hint = ctx->file_alloc_start;
|
||||
} while (1);
|
||||
|
||||
return -ENFILE;
|
||||
}
|
||||
|
||||
bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
|
||||
{
|
||||
table->files = kvcalloc(nr_files, sizeof(table->files[0]),
|
||||
GFP_KERNEL_ACCOUNT);
|
||||
if (unlikely(!table->files))
|
||||
return false;
|
||||
|
||||
table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
|
||||
if (unlikely(!table->bitmap)) {
|
||||
kvfree(table->files);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void io_free_file_tables(struct io_file_table *table)
|
||||
{
|
||||
kvfree(table->files);
|
||||
bitmap_free(table->bitmap);
|
||||
table->files = NULL;
|
||||
table->bitmap = NULL;
|
||||
}
|
||||
|
||||
static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
|
||||
u32 slot_index)
|
||||
__must_hold(&req->ctx->uring_lock)
|
||||
{
|
||||
bool needs_switch = false;
|
||||
struct io_fixed_file *file_slot;
|
||||
int ret;
|
||||
|
||||
if (io_is_uring_fops(file))
|
||||
return -EBADF;
|
||||
if (!ctx->file_data)
|
||||
return -ENXIO;
|
||||
if (slot_index >= ctx->nr_user_files)
|
||||
return -EINVAL;
|
||||
|
||||
slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
|
||||
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
|
||||
|
||||
if (file_slot->file_ptr) {
|
||||
struct file *old_file;
|
||||
|
||||
ret = io_rsrc_node_switch_start(ctx);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
|
||||
ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
|
||||
ctx->rsrc_node, old_file);
|
||||
if (ret)
|
||||
goto err;
|
||||
file_slot->file_ptr = 0;
|
||||
io_file_bitmap_clear(&ctx->file_table, slot_index);
|
||||
needs_switch = true;
|
||||
}
|
||||
|
||||
ret = io_scm_file_account(ctx, file);
|
||||
if (!ret) {
|
||||
*io_get_tag_slot(ctx->file_data, slot_index) = 0;
|
||||
io_fixed_file_set(file_slot, file);
|
||||
io_file_bitmap_set(&ctx->file_table, slot_index);
|
||||
}
|
||||
err:
|
||||
if (needs_switch)
|
||||
io_rsrc_node_switch(ctx, ctx->file_data);
|
||||
if (ret)
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
|
||||
unsigned int file_slot)
|
||||
{
|
||||
bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
|
||||
int ret;
|
||||
|
||||
if (alloc_slot) {
|
||||
ret = io_file_bitmap_get(ctx);
|
||||
if (unlikely(ret < 0))
|
||||
return ret;
|
||||
file_slot = ret;
|
||||
} else {
|
||||
file_slot--;
|
||||
}
|
||||
|
||||
ret = io_install_fixed_file(ctx, file, file_slot);
|
||||
if (!ret && alloc_slot)
|
||||
ret = file_slot;
|
||||
return ret;
|
||||
}
|
||||
/*
|
||||
* Note when io_fixed_fd_install() returns error value, it will ensure
|
||||
* fput() is called correspondingly.
|
||||
*/
|
||||
int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
|
||||
struct file *file, unsigned int file_slot)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
int ret;
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
ret = __io_fixed_fd_install(ctx, file, file_slot);
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
|
||||
if (unlikely(ret < 0))
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
|
||||
{
|
||||
struct io_fixed_file *file_slot;
|
||||
struct file *file;
|
||||
int ret;
|
||||
|
||||
if (unlikely(!ctx->file_data))
|
||||
return -ENXIO;
|
||||
if (offset >= ctx->nr_user_files)
|
||||
return -EINVAL;
|
||||
ret = io_rsrc_node_switch_start(ctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
offset = array_index_nospec(offset, ctx->nr_user_files);
|
||||
file_slot = io_fixed_file_slot(&ctx->file_table, offset);
|
||||
if (!file_slot->file_ptr)
|
||||
return -EBADF;
|
||||
|
||||
file = (struct file *)(file_slot->file_ptr & FFS_MASK);
|
||||
ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
file_slot->file_ptr = 0;
|
||||
io_file_bitmap_clear(&ctx->file_table, offset);
|
||||
io_rsrc_node_switch(ctx, ctx->file_data);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_register_file_alloc_range(struct io_ring_ctx *ctx,
|
||||
struct io_uring_file_index_range __user *arg)
|
||||
{
|
||||
struct io_uring_file_index_range range;
|
||||
u32 end;
|
||||
|
||||
if (copy_from_user(&range, arg, sizeof(range)))
|
||||
return -EFAULT;
|
||||
if (check_add_overflow(range.off, range.len, &end))
|
||||
return -EOVERFLOW;
|
||||
if (range.resv || end > ctx->nr_user_files)
|
||||
return -EINVAL;
|
||||
|
||||
io_file_table_set_alloc_range(ctx, range.off, range.len);
|
||||
return 0;
|
||||
}
|
io_uring/filetable.h (new file, 88 lines)
@@ -0,0 +1,88 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_FILE_TABLE_H
#define IOU_FILE_TABLE_H

#include <linux/file.h>
#include <linux/io_uring_types.h>

/*
 * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
 * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we
 * can't safely always dereference the file when the task has exited and ring
 * cleanup is done. If a file is tracked and part of SCM, then unix gc on
 * process exit may reap it before __io_sqe_files_unregister() is run.
 */
#define FFS_NOWAIT		0x1UL
#define FFS_ISREG		0x2UL
#if defined(CONFIG_64BIT)
#define FFS_SCM			0x4UL
#else
#define IO_URING_SCM_ALL
#define FFS_SCM			0x0UL
#endif
#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)

bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
void io_free_file_tables(struct io_file_table *table);

int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
			struct file *file, unsigned int file_slot);
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
			  unsigned int file_slot);
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);

int io_register_file_alloc_range(struct io_ring_ctx *ctx,
				 struct io_uring_file_index_range __user *arg);

unsigned int io_file_get_flags(struct file *file);

static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
{
	__clear_bit(bit, table->bitmap);
	table->alloc_hint = bit;
}

static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
{
	WARN_ON_ONCE(test_bit(bit, table->bitmap));
	__set_bit(bit, table->bitmap);
	table->alloc_hint = bit + 1;
}

static inline struct io_fixed_file *
io_fixed_file_slot(struct io_file_table *table, unsigned i)
{
	return &table->files[i];
}

static inline struct file *io_file_from_index(struct io_file_table *table,
					      int index)
{
	struct io_fixed_file *slot = io_fixed_file_slot(table, index);

	return (struct file *) (slot->file_ptr & FFS_MASK);
}

static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
				     struct file *file)
{
	unsigned long file_ptr = (unsigned long) file;

	file_ptr |= io_file_get_flags(file);
	file_slot->file_ptr = file_ptr;
}

static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx)
{
	ctx->file_table.alloc_hint = ctx->file_alloc_start;
}

static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx,
						 unsigned off, unsigned len)
{
	ctx->file_alloc_start = off;
	ctx->file_alloc_end = off + len;
	io_reset_alloc_hint(ctx);
}

#endif
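io_register_file_alloc_range() at the end of filetable.c feeds io_file_table_set_alloc_range() above, constraining which fixed-file slots IORING_FILE_INDEX_ALLOC may hand out. A hedged userspace sketch follows; the off/len/resv field names come from the kernel code shown here, while the opcode name IORING_REGISTER_FILE_ALLOC_RANGE and the exact uapi struct layout are assumptions.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* hypothetical helper: let automatic slot allocation use only [off, off+len) */
static int set_file_alloc_range(int ring_fd, __u32 off, __u32 len)
{
	struct io_uring_file_index_range range;

	memset(&range, 0, sizeof(range));	/* resv must be zero */
	range.off = off;			/* first slot the allocator may pick */
	range.len = len;			/* number of slots in the window */

	/* off + len must not exceed the registered file table size */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILE_ALLOC_RANGE, &range, 0);
}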
io_uring/fs.c (new file, 293 lines)
@@ -0,0 +1,293 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "../fs/internal.h"
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "fs.h"
|
||||
|
||||
struct io_rename {
|
||||
struct file *file;
|
||||
int old_dfd;
|
||||
int new_dfd;
|
||||
struct filename *oldpath;
|
||||
struct filename *newpath;
|
||||
int flags;
|
||||
};
|
||||
|
||||
struct io_unlink {
|
||||
struct file *file;
|
||||
int dfd;
|
||||
int flags;
|
||||
struct filename *filename;
|
||||
};
|
||||
|
||||
struct io_mkdir {
|
||||
struct file *file;
|
||||
int dfd;
|
||||
umode_t mode;
|
||||
struct filename *filename;
|
||||
};
|
||||
|
||||
struct io_link {
|
||||
struct file *file;
|
||||
int old_dfd;
|
||||
int new_dfd;
|
||||
struct filename *oldpath;
|
||||
struct filename *newpath;
|
||||
int flags;
|
||||
};
|
||||
|
||||
int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_rename *ren = io_kiocb_to_cmd(req);
|
||||
const char __user *oldf, *newf;
|
||||
|
||||
if (sqe->buf_index || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
ren->old_dfd = READ_ONCE(sqe->fd);
|
||||
oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
|
||||
ren->new_dfd = READ_ONCE(sqe->len);
|
||||
ren->flags = READ_ONCE(sqe->rename_flags);
|
||||
|
||||
ren->oldpath = getname(oldf);
|
||||
if (IS_ERR(ren->oldpath))
|
||||
return PTR_ERR(ren->oldpath);
|
||||
|
||||
ren->newpath = getname(newf);
|
||||
if (IS_ERR(ren->newpath)) {
|
||||
putname(ren->oldpath);
|
||||
return PTR_ERR(ren->newpath);
|
||||
}
|
||||
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_rename *ren = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
|
||||
ren->newpath, ren->flags);
|
||||
|
||||
req->flags &= ~REQ_F_NEED_CLEANUP;
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
void io_renameat_cleanup(struct io_kiocb *req)
|
||||
{
|
||||
struct io_rename *ren = io_kiocb_to_cmd(req);
|
||||
|
||||
putname(ren->oldpath);
|
||||
putname(ren->newpath);
|
||||
}
|
||||
|
||||
int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_unlink *un = io_kiocb_to_cmd(req);
|
||||
const char __user *fname;
|
||||
|
||||
if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
un->dfd = READ_ONCE(sqe->fd);
|
||||
|
||||
un->flags = READ_ONCE(sqe->unlink_flags);
|
||||
if (un->flags & ~AT_REMOVEDIR)
|
||||
return -EINVAL;
|
||||
|
||||
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
un->filename = getname(fname);
|
||||
if (IS_ERR(un->filename))
|
||||
return PTR_ERR(un->filename);
|
||||
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_unlink *un = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
if (un->flags & AT_REMOVEDIR)
|
||||
ret = do_rmdir(un->dfd, un->filename);
|
||||
else
|
||||
ret = do_unlinkat(un->dfd, un->filename);
|
||||
|
||||
req->flags &= ~REQ_F_NEED_CLEANUP;
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
void io_unlinkat_cleanup(struct io_kiocb *req)
|
||||
{
|
||||
struct io_unlink *ul = io_kiocb_to_cmd(req);
|
||||
|
||||
putname(ul->filename);
|
||||
}
|
||||
|
||||
int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_mkdir *mkd = io_kiocb_to_cmd(req);
|
||||
const char __user *fname;
|
||||
|
||||
if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
mkd->dfd = READ_ONCE(sqe->fd);
|
||||
mkd->mode = READ_ONCE(sqe->len);
|
||||
|
||||
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
mkd->filename = getname(fname);
|
||||
if (IS_ERR(mkd->filename))
|
||||
return PTR_ERR(mkd->filename);
|
||||
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_mkdir *mkd = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
|
||||
|
||||
req->flags &= ~REQ_F_NEED_CLEANUP;
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
void io_mkdirat_cleanup(struct io_kiocb *req)
|
||||
{
|
||||
struct io_mkdir *md = io_kiocb_to_cmd(req);
|
||||
|
||||
putname(md->filename);
|
||||
}
|
||||
|
||||
int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_link *sl = io_kiocb_to_cmd(req);
|
||||
const char __user *oldpath, *newpath;
|
||||
|
||||
if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
sl->new_dfd = READ_ONCE(sqe->fd);
|
||||
oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
|
||||
|
||||
sl->oldpath = getname(oldpath);
|
||||
if (IS_ERR(sl->oldpath))
|
||||
return PTR_ERR(sl->oldpath);
|
||||
|
||||
sl->newpath = getname(newpath);
|
||||
if (IS_ERR(sl->newpath)) {
|
||||
putname(sl->oldpath);
|
||||
return PTR_ERR(sl->newpath);
|
||||
}
|
||||
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_link *sl = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
|
||||
|
||||
req->flags &= ~REQ_F_NEED_CLEANUP;
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_link *lnk = io_kiocb_to_cmd(req);
|
||||
const char __user *oldf, *newf;
|
||||
|
||||
if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
lnk->old_dfd = READ_ONCE(sqe->fd);
|
||||
lnk->new_dfd = READ_ONCE(sqe->len);
|
||||
oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
|
||||
lnk->flags = READ_ONCE(sqe->hardlink_flags);
|
||||
|
||||
lnk->oldpath = getname(oldf);
|
||||
if (IS_ERR(lnk->oldpath))
|
||||
return PTR_ERR(lnk->oldpath);
|
||||
|
||||
lnk->newpath = getname(newf);
|
||||
if (IS_ERR(lnk->newpath)) {
|
||||
putname(lnk->oldpath);
|
||||
return PTR_ERR(lnk->newpath);
|
||||
}
|
||||
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_link *lnk = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
|
||||
lnk->newpath, lnk->flags);
|
||||
|
||||
req->flags &= ~REQ_F_NEED_CLEANUP;
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
void io_link_cleanup(struct io_kiocb *req)
|
||||
{
|
||||
struct io_link *sl = io_kiocb_to_cmd(req);
|
||||
|
||||
putname(sl->oldpath);
|
||||
putname(sl->newpath);
|
||||
}
|
io_uring/fs.h (new file, 20 lines)
@@ -0,0 +1,20 @@
// SPDX-License-Identifier: GPL-2.0

int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_renameat(struct io_kiocb *req, unsigned int issue_flags);
void io_renameat_cleanup(struct io_kiocb *req);

int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags);
void io_unlinkat_cleanup(struct io_kiocb *req);

int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags);
void io_mkdirat_cleanup(struct io_kiocb *req);

int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags);

int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_linkat(struct io_kiocb *req, unsigned int issue_flags);
void io_link_cleanup(struct io_kiocb *req);
io_uring/io-wq.c (changed)
@@ -18,6 +18,8 @@
 #include <uapi/linux/io_uring.h>
 
 #include "io-wq.h"
+#include "slist.h"
+#include "io_uring.h"
 
 #define WORKER_IDLE_TIMEOUT	(5 * HZ)
 
@@ -518,23 +520,11 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
 	return NULL;
 }
 
-static bool io_flush_signals(void)
-{
-	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
-		__set_current_state(TASK_RUNNING);
-		clear_notify_signal();
-		if (task_work_pending(current))
-			task_work_run();
-		return true;
-	}
-	return false;
-}
-
 static void io_assign_current_work(struct io_worker *worker,
 				   struct io_wq_work *work)
 {
 	if (work) {
-		io_flush_signals();
+		io_run_task_work();
 		cond_resched();
 	}
 
@@ -654,7 +644,7 @@ static int io_wqe_worker(void *data)
 		last_timeout = false;
 		__io_worker_idle(wqe, worker);
 		raw_spin_unlock(&wqe->lock);
-		if (io_flush_signals())
+		if (io_run_task_work())
 			continue;
 		ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
 		if (signal_pending(current)) {
io_uring/io-wq.h (new file, 83 lines)
@@ -0,0 +1,83 @@
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H

#include <linux/refcount.h>
#include <linux/io_uring_types.h>

struct io_wq;

enum {
	IO_WQ_WORK_CANCEL	= 1,
	IO_WQ_WORK_HASHED	= 2,
	IO_WQ_WORK_UNBOUND	= 4,
	IO_WQ_WORK_CONCURRENT	= 16,

	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
};

enum io_wq_cancel {
	IO_WQ_CANCEL_OK,	/* cancelled before started */
	IO_WQ_CANCEL_RUNNING,	/* found, running, and attempted cancelled */
	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
};

typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);

struct io_wq_hash {
	refcount_t refs;
	unsigned long map;
	struct wait_queue_head wait;
};

static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
	if (refcount_dec_and_test(&hash->refs))
		kfree(hash);
}

struct io_wq_data {
	struct io_wq_hash *hash;
	struct task_struct *task;
	io_wq_work_fn *do_work;
	free_work_fn *free_work;
};

struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);

void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);

int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);

static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
	return work->flags & IO_WQ_WORK_HASHED;
}

typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
				  void *data, bool cancel_all);

#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif

static inline bool io_wq_current_is_worker(void)
{
	return in_task() && (current->flags & PF_IO_WORKER) &&
		current->worker_private;
}
#endif
io_uring/io_uring.c (new file, 3980 lines)
(file diff suppressed because it is too large)
io_uring/io_uring.h (new file, 261 lines)
@@ -0,0 +1,261 @@
|
||||
#ifndef IOU_CORE_H
|
||||
#define IOU_CORE_H
|
||||
|
||||
#include <linux/errno.h>
|
||||
#include <linux/lockdep.h>
|
||||
#include <linux/io_uring_types.h>
|
||||
#include "io-wq.h"
|
||||
#include "slist.h"
|
||||
#include "filetable.h"
|
||||
|
||||
#ifndef CREATE_TRACE_POINTS
|
||||
#include <trace/events/io_uring.h>
|
||||
#endif
|
||||
|
||||
enum {
|
||||
IOU_OK = 0,
|
||||
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
|
||||
|
||||
/*
|
||||
* Intended only when both REQ_F_POLLED and REQ_F_APOLL_MULTISHOT
|
||||
* are set to indicate to the poll runner that multishot should be
|
||||
* removed and the result is set on req->cqe.res.
|
||||
*/
|
||||
IOU_STOP_MULTISHOT = -ECANCELED,
|
||||
};
|
||||
|
||||
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
|
||||
bool io_req_cqe_overflow(struct io_kiocb *req);
|
||||
int io_run_task_work_sig(void);
|
||||
void io_req_complete_failed(struct io_kiocb *req, s32 res);
|
||||
void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
|
||||
void io_req_complete_post(struct io_kiocb *req);
|
||||
void __io_req_complete_post(struct io_kiocb *req);
|
||||
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
|
||||
bool allow_overflow);
|
||||
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
|
||||
|
||||
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
|
||||
|
||||
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
|
||||
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
|
||||
unsigned issue_flags);
|
||||
|
||||
static inline bool io_req_ffs_set(struct io_kiocb *req)
|
||||
{
|
||||
return req->flags & REQ_F_FIXED_FILE;
|
||||
}
|
||||
|
||||
bool io_is_uring_fops(struct file *file);
|
||||
bool io_alloc_async_data(struct io_kiocb *req);
|
||||
void io_req_task_work_add(struct io_kiocb *req);
|
||||
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
|
||||
void io_req_task_queue(struct io_kiocb *req);
|
||||
void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
|
||||
void io_req_task_complete(struct io_kiocb *req, bool *locked);
|
||||
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
|
||||
void io_req_task_submit(struct io_kiocb *req, bool *locked);
|
||||
void tctx_task_work(struct callback_head *cb);
|
||||
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
|
||||
int io_uring_alloc_task_context(struct task_struct *task,
|
||||
struct io_ring_ctx *ctx);
|
||||
|
||||
int io_poll_issue(struct io_kiocb *req, bool *locked);
|
||||
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
|
||||
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
|
||||
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
|
||||
int io_req_prep_async(struct io_kiocb *req);
|
||||
|
||||
struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
|
||||
void io_wq_submit_work(struct io_wq_work *work);
|
||||
|
||||
void io_free_req(struct io_kiocb *req);
|
||||
void io_queue_next(struct io_kiocb *req);
|
||||
|
||||
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
|
||||
bool cancel_all);
|
||||
|
||||
#define io_for_each_link(pos, head) \
|
||||
for (pos = (head); pos; pos = pos->link)
|
||||
|
||||
static inline void io_cq_lock(struct io_ring_ctx *ctx)
|
||||
__acquires(ctx->completion_lock)
|
||||
{
|
||||
spin_lock(&ctx->completion_lock);
|
||||
}
|
||||
|
||||
void io_cq_unlock_post(struct io_ring_ctx *ctx);
|
||||
|
||||
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
|
||||
struct io_uring_cqe *cqe = ctx->cqe_cached;
|
||||
|
||||
ctx->cached_cq_tail++;
|
||||
ctx->cqe_cached++;
|
||||
if (ctx->flags & IORING_SETUP_CQE32)
|
||||
ctx->cqe_cached++;
|
||||
return cqe;
|
||||
}
|
||||
|
||||
return __io_get_cqe(ctx);
|
||||
}
|
||||
|
||||
static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
|
||||
struct io_kiocb *req)
|
||||
{
|
||||
struct io_uring_cqe *cqe;
|
||||
|
||||
/*
|
||||
* If we can't get a cq entry, userspace overflowed the
|
||||
* submission (by quite a lot). Increment the overflow count in
|
||||
* the ring.
|
||||
*/
|
||||
cqe = io_get_cqe(ctx);
|
||||
if (unlikely(!cqe))
|
||||
return io_req_cqe_overflow(req);
|
||||
|
||||
trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
|
||||
req->cqe.res, req->cqe.flags,
|
||||
(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0,
|
||||
(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0);
|
||||
|
||||
memcpy(cqe, &req->cqe, sizeof(*cqe));
|
||||
|
||||
if (ctx->flags & IORING_SETUP_CQE32) {
|
||||
u64 extra1 = 0, extra2 = 0;
|
||||
|
||||
if (req->flags & REQ_F_CQE32_INIT) {
|
||||
extra1 = req->extra1;
|
||||
extra2 = req->extra2;
|
||||
}
|
||||
|
||||
WRITE_ONCE(cqe->big_cqe[0], extra1);
|
||||
WRITE_ONCE(cqe->big_cqe[1], extra2);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void req_set_fail(struct io_kiocb *req)
|
||||
{
|
||||
req->flags |= REQ_F_FAIL;
|
||||
if (req->flags & REQ_F_CQE_SKIP) {
|
||||
req->flags &= ~REQ_F_CQE_SKIP;
|
||||
req->flags |= REQ_F_SKIP_LINK_CQES;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
|
||||
{
|
||||
req->cqe.res = res;
|
||||
req->cqe.flags = cflags;
|
||||
}
|
||||
|
||||
static inline bool req_has_async_data(struct io_kiocb *req)
|
||||
{
|
||||
return req->flags & REQ_F_ASYNC_DATA;
|
||||
}
|
||||
|
||||
static inline void io_put_file(struct file *file)
|
||||
{
|
||||
if (file)
|
||||
fput(file);
|
||||
}
|
||||
|
||||
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
if (issue_flags & IO_URING_F_UNLOCKED)
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
}
|
||||
|
||||
static inline void io_ring_submit_lock(struct io_ring_ctx *ctx,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
/*
|
||||
* "Normal" inline submissions always hold the uring_lock, since we
|
||||
* grab it from the system call. Same is true for the SQPOLL offload.
|
||||
* The only exception is when we've detached the request and issue it
|
||||
* from an async worker thread, grab the lock for that case.
|
||||
*/
|
||||
if (issue_flags & IO_URING_F_UNLOCKED)
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
}
|
||||
|
||||
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
|
||||
{
|
||||
/* order cqe stores with ring update */
|
||||
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
|
||||
}
|
||||
|
||||
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
|
||||
{
|
||||
/*
|
||||
* wake_up_all() may seem excessive, but io_wake_function() and
|
||||
* io_should_wake() handle the termination of the loop and only
|
||||
* wake as many waiters as we need to.
|
||||
*/
|
||||
if (wq_has_sleeper(&ctx->cq_wait))
|
||||
wake_up_all(&ctx->cq_wait);
|
||||
}
|
||||
|
||||
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_rings *r = ctx->rings;
|
||||
|
||||
return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
|
||||
}
|
||||
|
||||
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_rings *rings = ctx->rings;
|
||||
|
||||
/* make sure SQ entry isn't read before tail */
|
||||
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
|
||||
}
|
||||
|
||||
static inline bool io_run_task_work(void)
|
||||
{
|
||||
if (test_thread_flag(TIF_NOTIFY_SIGNAL)) {
|
||||
__set_current_state(TASK_RUNNING);
|
||||
clear_notify_signal();
|
||||
if (task_work_pending(current))
|
||||
task_work_run();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
|
||||
{
|
||||
if (!*locked) {
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
*locked = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Don't complete immediately but use deferred completion infrastructure.
|
||||
* Protected by ->uring_lock and can only be used either with
|
||||
* IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex.
|
||||
*/
|
||||
static inline void io_req_complete_defer(struct io_kiocb *req)
|
||||
__must_hold(&req->ctx->uring_lock)
|
||||
{
|
||||
struct io_submit_state *state = &req->ctx->submit_state;
|
||||
|
||||
lockdep_assert_held(&req->ctx->uring_lock);
|
||||
|
||||
wq_list_add_tail(&req->comp_list, &state->compl_reqs);
|
||||
}
|
||||
|
||||
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd))
|
||||
__io_commit_cqring_flush(ctx);
|
||||
}
|
||||
|
||||
#endif
|
io_uring/kbuf.c (new file, 549 lines)
@@ -0,0 +1,549 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/poll.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "opdef.h"
|
||||
#include "kbuf.h"
|
||||
|
||||
#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
|
||||
|
||||
#define BGID_ARRAY 64
|
||||
|
||||
struct io_provide_buf {
|
||||
struct file *file;
|
||||
__u64 addr;
|
||||
__u32 len;
|
||||
__u32 bgid;
|
||||
__u16 nbufs;
|
||||
__u16 bid;
|
||||
};
|
||||
|
||||
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
|
||||
unsigned int bgid)
|
||||
{
|
||||
if (ctx->io_bl && bgid < BGID_ARRAY)
|
||||
return &ctx->io_bl[bgid];
|
||||
|
||||
return xa_load(&ctx->io_bl_xa, bgid);
|
||||
}
|
||||
|
||||
static int io_buffer_add_list(struct io_ring_ctx *ctx,
|
||||
struct io_buffer_list *bl, unsigned int bgid)
|
||||
{
|
||||
bl->bgid = bgid;
|
||||
if (bgid < BGID_ARRAY)
|
||||
return 0;
|
||||
|
||||
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
|
||||
}
|
||||
|
||||
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_buffer_list *bl;
|
||||
struct io_buffer *buf;
|
||||
|
||||
/*
|
||||
* For legacy provided buffer mode, don't recycle if we already did
|
||||
* IO to this buffer. For ring-mapped provided buffer mode, we should
|
||||
* increment ring->head to explicitly monopolize the buffer to avoid
|
||||
* multiple use.
|
||||
*/
|
||||
if (req->flags & REQ_F_PARTIAL_IO)
|
||||
return;
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
|
||||
buf = req->kbuf;
|
||||
bl = io_buffer_get_list(ctx, buf->bgid);
|
||||
list_add(&buf->list, &bl->buf_list);
|
||||
req->flags &= ~REQ_F_BUFFER_SELECTED;
|
||||
req->buf_index = buf->bgid;
|
||||
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
|
||||
{
|
||||
unsigned int cflags;
|
||||
|
||||
/*
|
||||
* We can add this buffer back to two lists:
|
||||
*
|
||||
* 1) The io_buffers_cache list. This one is protected by the
|
||||
* ctx->uring_lock. If we already hold this lock, add back to this
|
||||
* list as we can grab it from issue as well.
|
||||
* 2) The io_buffers_comp list. This one is protected by the
|
||||
* ctx->completion_lock.
|
||||
*
|
||||
* We migrate buffers from the comp_list to the issue cache list
|
||||
* when we need one.
|
||||
*/
|
||||
if (req->flags & REQ_F_BUFFER_RING) {
|
||||
/* no buffers to recycle for this case */
|
||||
cflags = __io_put_kbuf_list(req, NULL);
|
||||
} else if (issue_flags & IO_URING_F_UNLOCKED) {
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
spin_lock(&ctx->completion_lock);
|
||||
cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
} else {
|
||||
lockdep_assert_held(&req->ctx->uring_lock);
|
||||
|
||||
cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
|
||||
}
|
||||
return cflags;
|
||||
}
|
||||
|
||||
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
struct io_buffer_list *bl)
|
||||
{
|
||||
if (!list_empty(&bl->buf_list)) {
|
||||
struct io_buffer *kbuf;
|
||||
|
||||
kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
|
||||
list_del(&kbuf->list);
|
||||
if (*len == 0 || *len > kbuf->len)
|
||||
*len = kbuf->len;
|
||||
req->flags |= REQ_F_BUFFER_SELECTED;
|
||||
req->kbuf = kbuf;
|
||||
req->buf_index = kbuf->bid;
|
||||
return u64_to_user_ptr(kbuf->addr);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
struct io_buffer_list *bl,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct io_uring_buf_ring *br = bl->buf_ring;
|
||||
struct io_uring_buf *buf;
|
||||
__u16 head = bl->head;
|
||||
|
||||
if (unlikely(smp_load_acquire(&br->tail) == head))
|
||||
return NULL;
|
||||
|
||||
head &= bl->mask;
|
||||
if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
|
||||
buf = &br->bufs[head];
|
||||
} else {
|
||||
int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
|
||||
int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
|
||||
buf = page_address(bl->buf_pages[index]);
|
||||
buf += off;
|
||||
}
|
||||
if (*len == 0 || *len > buf->len)
|
||||
*len = buf->len;
|
||||
req->flags |= REQ_F_BUFFER_RING;
|
||||
req->buf_list = bl;
|
||||
req->buf_index = buf->bid;
|
||||
|
||||
if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
|
||||
/*
|
||||
* If we came in unlocked, we have no choice but to consume the
|
||||
* buffer here, otherwise nothing ensures that the buffer won't
|
||||
* get used by others. This does mean it'll be pinned until the
|
||||
* IO completes, coming in unlocked means we're being called from
|
||||
* io-wq context and there may be further retries in async hybrid
|
||||
* mode. For the locked case, the caller must call commit when
|
||||
* the transfer completes (or if we get -EAGAIN and must poll of
|
||||
* retry).
|
||||
*/
|
||||
req->buf_list = NULL;
|
||||
bl->head++;
|
||||
}
|
||||
return u64_to_user_ptr(buf->addr);
|
||||
}
|
||||
|
||||
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_buffer_list *bl;
|
||||
void __user *ret = NULL;
|
||||
|
||||
io_ring_submit_lock(req->ctx, issue_flags);
|
||||
|
||||
bl = io_buffer_get_list(ctx, req->buf_index);
|
||||
if (likely(bl)) {
|
||||
if (bl->buf_nr_pages)
|
||||
ret = io_ring_buffer_select(req, len, bl, issue_flags);
|
||||
else
|
||||
ret = io_provided_buffer_select(req, len, bl);
|
||||
}
|
||||
io_ring_submit_unlock(req->ctx, issue_flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
|
||||
{
|
||||
int i;
|
||||
|
||||
ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
|
||||
GFP_KERNEL);
|
||||
if (!ctx->io_bl)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < BGID_ARRAY; i++) {
|
||||
INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
|
||||
ctx->io_bl[i].bgid = i;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __io_remove_buffers(struct io_ring_ctx *ctx,
|
||||
struct io_buffer_list *bl, unsigned nbufs)
|
||||
{
|
||||
unsigned i = 0;
|
||||
|
||||
/* shouldn't happen */
|
||||
if (!nbufs)
|
||||
return 0;
|
||||
|
||||
if (bl->buf_nr_pages) {
|
||||
int j;
|
||||
|
||||
i = bl->buf_ring->tail - bl->head;
|
||||
for (j = 0; j < bl->buf_nr_pages; j++)
|
||||
unpin_user_page(bl->buf_pages[j]);
|
||||
kvfree(bl->buf_pages);
|
||||
bl->buf_pages = NULL;
|
||||
bl->buf_nr_pages = 0;
|
||||
/* make sure it's seen as empty */
|
||||
INIT_LIST_HEAD(&bl->buf_list);
|
||||
return i;
|
||||
}
|
||||
|
||||
/* the head kbuf is the list itself */
|
||||
while (!list_empty(&bl->buf_list)) {
|
||||
struct io_buffer *nxt;
|
||||
|
||||
nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
|
||||
list_del(&nxt->list);
|
||||
if (++i == nbufs)
|
||||
return i;
|
||||
cond_resched();
|
||||
}
|
||||
i++;
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
void io_destroy_buffers(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_buffer_list *bl;
|
||||
unsigned long index;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < BGID_ARRAY; i++) {
|
||||
if (!ctx->io_bl)
|
||||
break;
|
||||
__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
|
||||
}
|
||||
|
||||
xa_for_each(&ctx->io_bl_xa, index, bl) {
|
||||
xa_erase(&ctx->io_bl_xa, bl->bgid);
|
||||
__io_remove_buffers(ctx, bl, -1U);
|
||||
kfree(bl);
|
||||
}
|
||||
|
||||
while (!list_empty(&ctx->io_buffers_pages)) {
|
||||
struct page *page;
|
||||
|
||||
page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
|
||||
list_del_init(&page->lru);
|
||||
__free_page(page);
|
||||
}
|
||||
}
|
||||
|
||||
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_provide_buf *p = io_kiocb_to_cmd(req);
|
||||
u64 tmp;
|
||||
|
||||
if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
|
||||
sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
|
||||
tmp = READ_ONCE(sqe->fd);
|
||||
if (!tmp || tmp > USHRT_MAX)
|
||||
return -EINVAL;
|
||||
|
||||
memset(p, 0, sizeof(*p));
|
||||
p->nbufs = tmp;
|
||||
p->bgid = READ_ONCE(sqe->buf_group);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_provide_buf *p = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_buffer_list *bl;
|
||||
int ret = 0;
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
|
||||
ret = -ENOENT;
|
||||
bl = io_buffer_get_list(ctx, p->bgid);
|
||||
if (bl) {
|
||||
ret = -EINVAL;
|
||||
/* can't use provide/remove buffers command on mapped buffers */
|
||||
if (!bl->buf_nr_pages)
|
||||
ret = __io_remove_buffers(ctx, bl, p->nbufs);
|
||||
}
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
|
||||
/* complete before unlock, IOPOLL may need the lock */
|
||||
io_req_set_res(req, ret, 0);
|
||||
__io_req_complete(req, issue_flags);
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
}
|
||||
|
||||
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
unsigned long size, tmp_check;
|
||||
struct io_provide_buf *p = io_kiocb_to_cmd(req);
|
||||
u64 tmp;
|
||||
|
||||
if (sqe->rw_flags || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
|
||||
tmp = READ_ONCE(sqe->fd);
|
||||
if (!tmp || tmp > USHRT_MAX)
|
||||
return -E2BIG;
|
||||
p->nbufs = tmp;
|
||||
p->addr = READ_ONCE(sqe->addr);
|
||||
p->len = READ_ONCE(sqe->len);
|
||||
|
||||
if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
|
||||
&size))
|
||||
return -EOVERFLOW;
|
||||
if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
|
||||
return -EOVERFLOW;
|
||||
|
||||
size = (unsigned long)p->len * p->nbufs;
|
||||
if (!access_ok(u64_to_user_ptr(p->addr), size))
|
||||
return -EFAULT;
|
||||
|
||||
p->bgid = READ_ONCE(sqe->buf_group);
|
||||
tmp = READ_ONCE(sqe->off);
|
||||
if (tmp > USHRT_MAX)
|
||||
return -E2BIG;
|
||||
p->bid = tmp;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_buffer *buf;
|
||||
struct page *page;
|
||||
int bufs_in_page;
|
||||
|
||||
/*
|
||||
* Completions that don't happen inline (eg not under uring_lock) will
|
||||
* add to ->io_buffers_comp. If we don't have any free buffers, check
|
||||
* the completion list and splice those entries first.
|
||||
*/
|
||||
if (!list_empty_careful(&ctx->io_buffers_comp)) {
|
||||
spin_lock(&ctx->completion_lock);
|
||||
if (!list_empty(&ctx->io_buffers_comp)) {
|
||||
list_splice_init(&ctx->io_buffers_comp,
|
||||
&ctx->io_buffers_cache);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
return 0;
|
||||
}
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* No free buffers and no completion entries either. Allocate a new
|
||||
* page worth of buffer entries and add those to our freelist.
|
||||
*/
|
||||
page = alloc_page(GFP_KERNEL_ACCOUNT);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
|
||||
list_add(&page->lru, &ctx->io_buffers_pages);
|
||||
|
||||
buf = page_address(page);
|
||||
bufs_in_page = PAGE_SIZE / sizeof(*buf);
|
||||
while (bufs_in_page) {
|
||||
list_add_tail(&buf->list, &ctx->io_buffers_cache);
|
||||
buf++;
|
||||
bufs_in_page--;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
|
||||
struct io_buffer_list *bl)
|
||||
{
|
||||
struct io_buffer *buf;
|
||||
u64 addr = pbuf->addr;
|
||||
int i, bid = pbuf->bid;
|
||||
|
||||
for (i = 0; i < pbuf->nbufs; i++) {
|
||||
if (list_empty(&ctx->io_buffers_cache) &&
|
||||
io_refill_buffer_cache(ctx))
|
||||
break;
|
||||
buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
|
||||
list);
|
||||
list_move_tail(&buf->list, &bl->buf_list);
|
||||
buf->addr = addr;
|
||||
buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
|
||||
buf->bid = bid;
|
||||
buf->bgid = pbuf->bgid;
|
||||
addr += pbuf->len;
|
||||
bid++;
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
return i ? 0 : -ENOMEM;
|
||||
}
|
||||
|
||||
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_provide_buf *p = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_buffer_list *bl;
|
||||
int ret = 0;
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
|
||||
if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
|
||||
ret = io_init_bl_list(ctx);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
bl = io_buffer_get_list(ctx, p->bgid);
|
||||
if (unlikely(!bl)) {
|
||||
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
|
||||
if (!bl) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
INIT_LIST_HEAD(&bl->buf_list);
|
||||
ret = io_buffer_add_list(ctx, bl, p->bgid);
|
||||
if (ret) {
|
||||
kfree(bl);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
/* can't add buffers via this command for a mapped buffer ring */
|
||||
if (bl->buf_nr_pages) {
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = io_add_buffers(ctx, p, bl);
|
||||
err:
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
/* complete before unlock, IOPOLL may need the lock */
|
||||
io_req_set_res(req, ret, 0);
|
||||
__io_req_complete(req, issue_flags);
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
}
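Illustration, not part of the diff: io_provide_buffers_prep() above reads the buffer count from sqe->fd, the base address from sqe->addr, the per-buffer length from sqe->len, the group id from sqe->buf_group and the starting bid from sqe->off. A minimal userspace sketch, assuming liburing's io_uring_prep_provide_buffers() helper is available; the sizes and ids here are made up:

/* sketch: hand 8 buffers of 4KiB each to buffer group 0, starting at bid 0 */
#include <errno.h>
#include <stdlib.h>
#include <liburing.h>

static int provide_buffers(struct io_uring *ring)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	void *base;
	int ret;

	if (posix_memalign(&base, 4096, 8 * 4096))
		return -ENOMEM;

	sqe = io_uring_get_sqe(ring);
	/* addr, per-buffer len, nr buffers, buffer group id, starting bid */
	io_uring_prep_provide_buffers(sqe, base, 4096, 8, 0, 0);

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* 0 on success, negative errno on failure */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}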
|
||||
|
||||
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
{
|
||||
struct io_uring_buf_ring *br;
|
||||
struct io_uring_buf_reg reg;
|
||||
struct io_buffer_list *bl, *free_bl = NULL;
|
||||
struct page **pages;
|
||||
int nr_pages;
|
||||
|
||||
if (copy_from_user(&reg, arg, sizeof(reg)))
|
||||
return -EFAULT;
|
||||
|
||||
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
|
||||
return -EINVAL;
|
||||
if (!reg.ring_addr)
|
||||
return -EFAULT;
|
||||
if (reg.ring_addr & ~PAGE_MASK)
|
||||
return -EINVAL;
|
||||
if (!is_power_of_2(reg.ring_entries))
|
||||
return -EINVAL;
|
||||
|
||||
/* cannot disambiguate full vs empty due to head/tail size */
|
||||
if (reg.ring_entries >= 65536)
|
||||
return -EINVAL;
|
||||
|
||||
if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
|
||||
int ret = io_init_bl_list(ctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
bl = io_buffer_get_list(ctx, reg.bgid);
|
||||
if (bl) {
|
||||
/* if mapped buffer ring OR classic exists, don't allow */
|
||||
if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
|
||||
return -EEXIST;
|
||||
} else {
|
||||
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
|
||||
if (!bl)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
pages = io_pin_pages(reg.ring_addr,
|
||||
struct_size(br, bufs, reg.ring_entries),
|
||||
&nr_pages);
|
||||
if (IS_ERR(pages)) {
|
||||
kfree(free_bl);
|
||||
return PTR_ERR(pages);
|
||||
}
|
||||
|
||||
br = page_address(pages[0]);
|
||||
bl->buf_pages = pages;
|
||||
bl->buf_nr_pages = nr_pages;
|
||||
bl->nr_entries = reg.ring_entries;
|
||||
bl->buf_ring = br;
|
||||
bl->mask = reg.ring_entries - 1;
|
||||
io_buffer_add_list(ctx, bl, reg.bgid);
|
||||
return 0;
|
||||
}
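Illustration, not part of the diff: a hedged userspace sketch of the registration path above, assuming liburing's io_uring_register_buf_ring() and io_uring_buf_ring_init() helpers. It mirrors the kernel-side checks: page-aligned ring_addr, power-of-two ring_entries below 65536, and zeroed pad/resv fields.

/* sketch: register a 16-entry mapped buffer ring for buffer group id 1 */
#include <string.h>
#include <sys/mman.h>
#include <liburing.h>

static struct io_uring_buf_ring *setup_buf_ring(struct io_uring *ring)
{
	struct io_uring_buf_reg reg;
	struct io_uring_buf_ring *br;
	size_t size = 16 * sizeof(struct io_uring_buf);

	/* ring_addr must be page aligned; ring_entries a power of two < 65536 */
	br = mmap(NULL, size, PROT_READ | PROT_WRITE,
		  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (br == MAP_FAILED)
		return NULL;

	memset(&reg, 0, sizeof(reg));	/* pad and resv[] must be zero */
	reg.ring_addr = (unsigned long) br;
	reg.ring_entries = 16;
	reg.bgid = 1;

	if (io_uring_register_buf_ring(ring, &reg, 0) < 0) {
		munmap(br, size);
		return NULL;
	}
	io_uring_buf_ring_init(br);
	return br;
}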
|
||||
|
||||
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
{
|
||||
struct io_uring_buf_reg reg;
|
||||
struct io_buffer_list *bl;
|
||||
|
||||
if (copy_from_user(&reg, arg, sizeof(reg)))
|
||||
return -EFAULT;
|
||||
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
|
||||
return -EINVAL;
|
||||
|
||||
bl = io_buffer_get_list(ctx, reg.bgid);
|
||||
if (!bl)
|
||||
return -ENOENT;
|
||||
if (!bl->buf_nr_pages)
|
||||
return -EINVAL;
|
||||
|
||||
__io_remove_buffers(ctx, bl, -1U);
|
||||
if (bl->bgid >= BGID_ARRAY) {
|
||||
xa_erase(&ctx->io_bl_xa, bl->bgid);
|
||||
kfree(bl);
|
||||
}
|
||||
return 0;
|
||||
}
|
io_uring/kbuf.h | 140 (new file)
@@ -0,0 +1,140 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#ifndef IOU_KBUF_H
|
||||
#define IOU_KBUF_H
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
struct io_buffer_list {
|
||||
/*
|
||||
* If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
|
||||
* then these are classic provided buffers and ->buf_list is used.
|
||||
*/
|
||||
union {
|
||||
struct list_head buf_list;
|
||||
struct {
|
||||
struct page **buf_pages;
|
||||
struct io_uring_buf_ring *buf_ring;
|
||||
};
|
||||
};
|
||||
__u16 bgid;
|
||||
|
||||
/* below is for ring provided buffers */
|
||||
__u16 buf_nr_pages;
|
||||
__u16 nr_entries;
|
||||
__u16 head;
|
||||
__u16 mask;
|
||||
};
|
||||
|
||||
struct io_buffer {
|
||||
struct list_head list;
|
||||
__u64 addr;
|
||||
__u32 len;
|
||||
__u16 bid;
|
||||
__u16 bgid;
|
||||
};
|
||||
|
||||
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
unsigned int issue_flags);
|
||||
void io_destroy_buffers(struct io_ring_ctx *ctx);
|
||||
|
||||
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
|
||||
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
|
||||
|
||||
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
|
||||
|
||||
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
|
||||
|
||||
static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
|
||||
{
|
||||
/*
|
||||
* We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
|
||||
* the flag and hence ensure that bl->head doesn't get incremented.
|
||||
* If the tail has already been incremented, hang on to it.
|
||||
* The exception is partial io, in which case we should increment bl->head
|
||||
* to monopolize the buffer.
|
||||
*/
|
||||
if (req->buf_list) {
|
||||
if (req->flags & REQ_F_PARTIAL_IO) {
|
||||
/*
|
||||
* If we end up here, then the io_uring_lock has
|
||||
* been kept held since we retrieved the buffer.
|
||||
* For the io-wq case, we already cleared
|
||||
* req->buf_list when the buffer was retrieved,
|
||||
* hence it cannot be set here for that case.
|
||||
*/
|
||||
req->buf_list->head++;
|
||||
req->buf_list = NULL;
|
||||
} else {
|
||||
req->buf_index = req->buf_list->bgid;
|
||||
req->flags &= ~REQ_F_BUFFER_RING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool io_do_buffer_select(struct io_kiocb *req)
|
||||
{
|
||||
if (!(req->flags & REQ_F_BUFFER_SELECT))
|
||||
return false;
|
||||
return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
|
||||
}
|
||||
|
||||
static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
|
||||
{
|
||||
/*
|
||||
* READV uses fields in `struct io_rw` (len/addr) to stash the selected
|
||||
* buffer data. However if that buffer is recycled the original request
|
||||
* data stored in addr is lost. Therefore forbid recycling for now.
|
||||
*/
|
||||
if (req->opcode == IORING_OP_READV)
|
||||
return;
|
||||
|
||||
if (req->flags & REQ_F_BUFFER_SELECTED)
|
||||
io_kbuf_recycle_legacy(req, issue_flags);
|
||||
if (req->flags & REQ_F_BUFFER_RING)
|
||||
io_kbuf_recycle_ring(req);
|
||||
}
|
||||
|
||||
static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req,
|
||||
struct list_head *list)
|
||||
{
|
||||
unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
|
||||
|
||||
if (req->flags & REQ_F_BUFFER_RING) {
|
||||
if (req->buf_list) {
|
||||
req->buf_index = req->buf_list->bgid;
|
||||
req->buf_list->head++;
|
||||
}
|
||||
req->flags &= ~REQ_F_BUFFER_RING;
|
||||
} else {
|
||||
req->buf_index = req->kbuf->bgid;
|
||||
list_add(&req->kbuf->list, list);
|
||||
req->flags &= ~REQ_F_BUFFER_SELECTED;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
|
||||
{
|
||||
lockdep_assert_held(&req->ctx->completion_lock);
|
||||
|
||||
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
|
||||
return 0;
|
||||
return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
|
||||
}
|
||||
|
||||
static inline unsigned int io_put_kbuf(struct io_kiocb *req,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
|
||||
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
|
||||
return 0;
|
||||
return __io_put_kbuf(req, issue_flags);
|
||||
}
|
||||
#endif
|
171
io_uring/msg_ring.c
Normal file
171
io_uring/msg_ring.c
Normal file
@ -0,0 +1,171 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/nospec.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "rsrc.h"
|
||||
#include "filetable.h"
|
||||
#include "msg_ring.h"
|
||||
|
||||
struct io_msg {
|
||||
struct file *file;
|
||||
u64 user_data;
|
||||
u32 len;
|
||||
u32 cmd;
|
||||
u32 src_fd;
|
||||
u32 dst_fd;
|
||||
u32 flags;
|
||||
};
|
||||
|
||||
static int io_msg_ring_data(struct io_kiocb *req)
|
||||
{
|
||||
struct io_ring_ctx *target_ctx = req->file->private_data;
|
||||
struct io_msg *msg = io_kiocb_to_cmd(req);
|
||||
|
||||
if (msg->src_fd || msg->dst_fd || msg->flags)
|
||||
return -EINVAL;
|
||||
|
||||
if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
|
||||
return 0;
|
||||
|
||||
return -EOVERFLOW;
|
||||
}
|
||||
|
||||
static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
|
||||
struct io_ring_ctx *octx,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
if (issue_flags & IO_URING_F_UNLOCKED)
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
mutex_unlock(&octx->uring_lock);
|
||||
}
|
||||
|
||||
static int io_double_lock_ctx(struct io_ring_ctx *ctx,
|
||||
struct io_ring_ctx *octx,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
/*
|
||||
* To ensure proper ordering between the two ctxs, we can only
|
||||
* attempt a trylock on the target. If that fails and we already have
|
||||
* the source ctx lock, punt to io-wq.
|
||||
*/
|
||||
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
|
||||
if (!mutex_trylock(&octx->uring_lock))
|
||||
return -EAGAIN;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Always grab smallest value ctx first. We know ctx != octx. */
|
||||
if (ctx < octx) {
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
mutex_lock(&octx->uring_lock);
|
||||
} else {
|
||||
mutex_lock(&octx->uring_lock);
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_ring_ctx *target_ctx = req->file->private_data;
|
||||
struct io_msg *msg = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
unsigned long file_ptr;
|
||||
struct file *src_file;
|
||||
int ret;
|
||||
|
||||
if (target_ctx == ctx)
|
||||
return -EINVAL;
|
||||
|
||||
ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
ret = -EBADF;
|
||||
if (unlikely(msg->src_fd >= ctx->nr_user_files))
|
||||
goto out_unlock;
|
||||
|
||||
msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
|
||||
file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
|
||||
src_file = (struct file *) (file_ptr & FFS_MASK);
|
||||
get_file(src_file);
|
||||
|
||||
ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
|
||||
if (ret < 0) {
|
||||
fput(src_file);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
if (msg->flags & IORING_MSG_RING_CQE_SKIP)
|
||||
goto out_unlock;
|
||||
|
||||
/*
|
||||
* If this fails, the target still received the file descriptor but
|
||||
* wasn't notified of the fact. This means that if this request
|
||||
* completes with -EOVERFLOW, then the sender must ensure that a
|
||||
* later IORING_OP_MSG_RING delivers the message.
|
||||
*/
|
||||
if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
|
||||
ret = -EOVERFLOW;
|
||||
out_unlock:
|
||||
io_double_unlock_ctx(ctx, target_ctx, issue_flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_msg *msg = io_kiocb_to_cmd(req);
|
||||
|
||||
if (unlikely(sqe->buf_index || sqe->personality))
|
||||
return -EINVAL;
|
||||
|
||||
msg->user_data = READ_ONCE(sqe->off);
|
||||
msg->len = READ_ONCE(sqe->len);
|
||||
msg->cmd = READ_ONCE(sqe->addr);
|
||||
msg->src_fd = READ_ONCE(sqe->addr3);
|
||||
msg->dst_fd = READ_ONCE(sqe->file_index);
|
||||
msg->flags = READ_ONCE(sqe->msg_ring_flags);
|
||||
if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_msg *msg = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
ret = -EBADFD;
|
||||
if (!io_is_uring_fops(req->file))
|
||||
goto done;
|
||||
|
||||
switch (msg->cmd) {
|
||||
case IORING_MSG_DATA:
|
||||
ret = io_msg_ring_data(req);
|
||||
break;
|
||||
case IORING_MSG_SEND_FD:
|
||||
ret = io_msg_send_fd(req, issue_flags);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
done:
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
/* put file to avoid an attempt to IOPOLL the req */
|
||||
io_put_file(req->file);
|
||||
req->file = NULL;
|
||||
return IOU_OK;
|
||||
}
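Illustration, not part of the diff: a minimal sender-side sketch for the IORING_MSG_DATA case above, assuming liburing's io_uring_prep_msg_ring() helper; the user_data and length values are arbitrary.

/* sketch: post a CQE with user_data 0xcafe and res 0 into another ring */
#include <liburing.h>

static int msg_other_ring(struct io_uring *src, int target_ring_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(src);

	/* fd of the target ring, cqe->res value, cqe->user_data, flags */
	io_uring_prep_msg_ring(sqe, target_ring_fd, 0, 0xcafe, 0);
	return io_uring_submit(src);
}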
|
io_uring/msg_ring.h | 4 (new file)
@@ -0,0 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
|
io_uring/net.c | 1047 (new file)
(diff suppressed because it is too large)
io_uring/net.h | 60 (new file)
@@ -0,0 +1,60 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include <linux/net.h>
|
||||
#include <linux/uio.h>
|
||||
|
||||
#include "alloc_cache.h"
|
||||
|
||||
#if defined(CONFIG_NET)
|
||||
struct io_async_msghdr {
|
||||
union {
|
||||
struct iovec fast_iov[UIO_FASTIOV];
|
||||
struct {
|
||||
struct iovec fast_iov_one;
|
||||
__kernel_size_t controllen;
|
||||
int namelen;
|
||||
__kernel_size_t payloadlen;
|
||||
};
|
||||
struct io_cache_entry cache;
|
||||
};
|
||||
/* points to an allocated iov, if NULL we use fast_iov instead */
|
||||
struct iovec *free_iov;
|
||||
struct sockaddr __user *uaddr;
|
||||
struct msghdr msg;
|
||||
struct sockaddr_storage addr;
|
||||
};
|
||||
|
||||
struct io_async_connect {
|
||||
struct sockaddr_storage address;
|
||||
};
|
||||
|
||||
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_shutdown(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_sendmsg_prep_async(struct io_kiocb *req);
|
||||
void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req);
|
||||
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_send(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_recvmsg_prep_async(struct io_kiocb *req);
|
||||
int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_recv(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_accept(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_socket(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_connect_prep_async(struct io_kiocb *req);
|
||||
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
void io_netmsg_cache_free(struct io_cache_entry *entry);
|
||||
#else
|
||||
static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
|
||||
{
|
||||
}
|
||||
#endif
|
io_uring/nop.c | 25 (new file)
@@ -0,0 +1,25 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "nop.h"
|
||||
|
||||
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* IORING_OP_NOP just posts a completion event, nothing else.
|
||||
*/
|
||||
int io_nop(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
io_req_set_res(req, 0, 0);
|
||||
return IOU_OK;
|
||||
}
|
io_uring/nop.h | 4 (new file)
@@ -0,0 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_nop(struct io_kiocb *req, unsigned int issue_flags);
|
io_uring/opdef.c | 494 (new file)
@@ -0,0 +1,494 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* io_uring opcode handling table
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "opdef.h"
|
||||
#include "refs.h"
|
||||
#include "tctx.h"
|
||||
#include "sqpoll.h"
|
||||
#include "fdinfo.h"
|
||||
#include "kbuf.h"
|
||||
#include "rsrc.h"
|
||||
|
||||
#include "xattr.h"
|
||||
#include "nop.h"
|
||||
#include "fs.h"
|
||||
#include "splice.h"
|
||||
#include "sync.h"
|
||||
#include "advise.h"
|
||||
#include "openclose.h"
|
||||
#include "uring_cmd.h"
|
||||
#include "epoll.h"
|
||||
#include "statx.h"
|
||||
#include "net.h"
|
||||
#include "msg_ring.h"
|
||||
#include "timeout.h"
|
||||
#include "poll.h"
|
||||
#include "cancel.h"
|
||||
#include "rw.h"
|
||||
|
||||
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
WARN_ON_ONCE(1);
|
||||
return -ECANCELED;
|
||||
}
|
||||
|
||||
static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb,
|
||||
const struct io_uring_sqe *sqe)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
const struct io_op_def io_op_defs[] = {
|
||||
[IORING_OP_NOP] = {
|
||||
.audit_skip = 1,
|
||||
.iopoll = 1,
|
||||
.name = "NOP",
|
||||
.prep = io_nop_prep,
|
||||
.issue = io_nop,
|
||||
},
|
||||
[IORING_OP_READV] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollin = 1,
|
||||
.buffer_select = 1,
|
||||
.plug = 1,
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.name = "READV",
|
||||
.prep = io_prep_rw,
|
||||
.issue = io_read,
|
||||
.prep_async = io_readv_prep_async,
|
||||
.cleanup = io_readv_writev_cleanup,
|
||||
},
|
||||
[IORING_OP_WRITEV] = {
|
||||
.needs_file = 1,
|
||||
.hash_reg_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollout = 1,
|
||||
.plug = 1,
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.name = "WRITEV",
|
||||
.prep = io_prep_rw,
|
||||
.issue = io_write,
|
||||
.prep_async = io_writev_prep_async,
|
||||
.cleanup = io_readv_writev_cleanup,
|
||||
},
|
||||
[IORING_OP_FSYNC] = {
|
||||
.needs_file = 1,
|
||||
.audit_skip = 1,
|
||||
.name = "FSYNC",
|
||||
.prep = io_fsync_prep,
|
||||
.issue = io_fsync,
|
||||
},
|
||||
[IORING_OP_READ_FIXED] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollin = 1,
|
||||
.plug = 1,
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.name = "READ_FIXED",
|
||||
.prep = io_prep_rw,
|
||||
.issue = io_read,
|
||||
},
|
||||
[IORING_OP_WRITE_FIXED] = {
|
||||
.needs_file = 1,
|
||||
.hash_reg_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollout = 1,
|
||||
.plug = 1,
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.name = "WRITE_FIXED",
|
||||
.prep = io_prep_rw,
|
||||
.issue = io_write,
|
||||
},
|
||||
[IORING_OP_POLL_ADD] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.audit_skip = 1,
|
||||
.name = "POLL_ADD",
|
||||
.prep = io_poll_add_prep,
|
||||
.issue = io_poll_add,
|
||||
},
|
||||
[IORING_OP_POLL_REMOVE] = {
|
||||
.audit_skip = 1,
|
||||
.name = "POLL_REMOVE",
|
||||
.prep = io_poll_remove_prep,
|
||||
.issue = io_poll_remove,
|
||||
},
|
||||
[IORING_OP_SYNC_FILE_RANGE] = {
|
||||
.needs_file = 1,
|
||||
.audit_skip = 1,
|
||||
.name = "SYNC_FILE_RANGE",
|
||||
.prep = io_sfr_prep,
|
||||
.issue = io_sync_file_range,
|
||||
},
|
||||
[IORING_OP_SENDMSG] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollout = 1,
|
||||
.ioprio = 1,
|
||||
.name = "SENDMSG",
|
||||
#if defined(CONFIG_NET)
|
||||
.async_size = sizeof(struct io_async_msghdr),
|
||||
.prep = io_sendmsg_prep,
|
||||
.issue = io_sendmsg,
|
||||
.prep_async = io_sendmsg_prep_async,
|
||||
.cleanup = io_sendmsg_recvmsg_cleanup,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_RECVMSG] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollin = 1,
|
||||
.buffer_select = 1,
|
||||
.ioprio = 1,
|
||||
.name = "RECVMSG",
|
||||
#if defined(CONFIG_NET)
|
||||
.async_size = sizeof(struct io_async_msghdr),
|
||||
.prep = io_recvmsg_prep,
|
||||
.issue = io_recvmsg,
|
||||
.prep_async = io_recvmsg_prep_async,
|
||||
.cleanup = io_sendmsg_recvmsg_cleanup,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_TIMEOUT] = {
|
||||
.audit_skip = 1,
|
||||
.async_size = sizeof(struct io_timeout_data),
|
||||
.name = "TIMEOUT",
|
||||
.prep = io_timeout_prep,
|
||||
.issue = io_timeout,
|
||||
},
|
||||
[IORING_OP_TIMEOUT_REMOVE] = {
|
||||
/* used by timeout updates' prep() */
|
||||
.audit_skip = 1,
|
||||
.name = "TIMEOUT_REMOVE",
|
||||
.prep = io_timeout_remove_prep,
|
||||
.issue = io_timeout_remove,
|
||||
},
|
||||
[IORING_OP_ACCEPT] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollin = 1,
|
||||
.poll_exclusive = 1,
|
||||
.ioprio = 1, /* used for flags */
|
||||
.name = "ACCEPT",
|
||||
#if defined(CONFIG_NET)
|
||||
.prep = io_accept_prep,
|
||||
.issue = io_accept,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_ASYNC_CANCEL] = {
|
||||
.audit_skip = 1,
|
||||
.name = "ASYNC_CANCEL",
|
||||
.prep = io_async_cancel_prep,
|
||||
.issue = io_async_cancel,
|
||||
},
|
||||
[IORING_OP_LINK_TIMEOUT] = {
|
||||
.audit_skip = 1,
|
||||
.async_size = sizeof(struct io_timeout_data),
|
||||
.name = "LINK_TIMEOUT",
|
||||
.prep = io_link_timeout_prep,
|
||||
.issue = io_no_issue,
|
||||
},
|
||||
[IORING_OP_CONNECT] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollout = 1,
|
||||
.name = "CONNECT",
|
||||
#if defined(CONFIG_NET)
|
||||
.async_size = sizeof(struct io_async_connect),
|
||||
.prep = io_connect_prep,
|
||||
.issue = io_connect,
|
||||
.prep_async = io_connect_prep_async,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_FALLOCATE] = {
|
||||
.needs_file = 1,
|
||||
.name = "FALLOCATE",
|
||||
.prep = io_fallocate_prep,
|
||||
.issue = io_fallocate,
|
||||
},
|
||||
[IORING_OP_OPENAT] = {
|
||||
.name = "OPENAT",
|
||||
.prep = io_openat_prep,
|
||||
.issue = io_openat,
|
||||
.cleanup = io_open_cleanup,
|
||||
},
|
||||
[IORING_OP_CLOSE] = {
|
||||
.name = "CLOSE",
|
||||
.prep = io_close_prep,
|
||||
.issue = io_close,
|
||||
},
|
||||
[IORING_OP_FILES_UPDATE] = {
|
||||
.audit_skip = 1,
|
||||
.iopoll = 1,
|
||||
.name = "FILES_UPDATE",
|
||||
.prep = io_files_update_prep,
|
||||
.issue = io_files_update,
|
||||
},
|
||||
[IORING_OP_STATX] = {
|
||||
.audit_skip = 1,
|
||||
.name = "STATX",
|
||||
.prep = io_statx_prep,
|
||||
.issue = io_statx,
|
||||
.cleanup = io_statx_cleanup,
|
||||
},
|
||||
[IORING_OP_READ] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollin = 1,
|
||||
.buffer_select = 1,
|
||||
.plug = 1,
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.name = "READ",
|
||||
.prep = io_prep_rw,
|
||||
.issue = io_read,
|
||||
},
|
||||
[IORING_OP_WRITE] = {
|
||||
.needs_file = 1,
|
||||
.hash_reg_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollout = 1,
|
||||
.plug = 1,
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.name = "WRITE",
|
||||
.prep = io_prep_rw,
|
||||
.issue = io_write,
|
||||
},
|
||||
[IORING_OP_FADVISE] = {
|
||||
.needs_file = 1,
|
||||
.audit_skip = 1,
|
||||
.name = "FADVISE",
|
||||
.prep = io_fadvise_prep,
|
||||
.issue = io_fadvise,
|
||||
},
|
||||
[IORING_OP_MADVISE] = {
|
||||
.name = "MADVISE",
|
||||
.prep = io_madvise_prep,
|
||||
.issue = io_madvise,
|
||||
},
|
||||
[IORING_OP_SEND] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollout = 1,
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.name = "SEND",
|
||||
#if defined(CONFIG_NET)
|
||||
.prep = io_sendmsg_prep,
|
||||
.issue = io_send,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_RECV] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollin = 1,
|
||||
.buffer_select = 1,
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.name = "RECV",
|
||||
#if defined(CONFIG_NET)
|
||||
.prep = io_recvmsg_prep,
|
||||
.issue = io_recv,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_OPENAT2] = {
|
||||
.name = "OPENAT2",
|
||||
.prep = io_openat2_prep,
|
||||
.issue = io_openat2,
|
||||
.cleanup = io_open_cleanup,
|
||||
},
|
||||
[IORING_OP_EPOLL_CTL] = {
|
||||
.unbound_nonreg_file = 1,
|
||||
.audit_skip = 1,
|
||||
.name = "EPOLL",
|
||||
#if defined(CONFIG_EPOLL)
|
||||
.prep = io_epoll_ctl_prep,
|
||||
.issue = io_epoll_ctl,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_SPLICE] = {
|
||||
.needs_file = 1,
|
||||
.hash_reg_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.audit_skip = 1,
|
||||
.name = "SPLICE",
|
||||
.prep = io_splice_prep,
|
||||
.issue = io_splice,
|
||||
},
|
||||
[IORING_OP_PROVIDE_BUFFERS] = {
|
||||
.audit_skip = 1,
|
||||
.iopoll = 1,
|
||||
.name = "PROVIDE_BUFFERS",
|
||||
.prep = io_provide_buffers_prep,
|
||||
.issue = io_provide_buffers,
|
||||
},
|
||||
[IORING_OP_REMOVE_BUFFERS] = {
|
||||
.audit_skip = 1,
|
||||
.iopoll = 1,
|
||||
.name = "REMOVE_BUFFERS",
|
||||
.prep = io_remove_buffers_prep,
|
||||
.issue = io_remove_buffers,
|
||||
},
|
||||
[IORING_OP_TEE] = {
|
||||
.needs_file = 1,
|
||||
.hash_reg_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.audit_skip = 1,
|
||||
.name = "TEE",
|
||||
.prep = io_tee_prep,
|
||||
.issue = io_tee,
|
||||
},
|
||||
[IORING_OP_SHUTDOWN] = {
|
||||
.needs_file = 1,
|
||||
.name = "SHUTDOWN",
|
||||
#if defined(CONFIG_NET)
|
||||
.prep = io_shutdown_prep,
|
||||
.issue = io_shutdown,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_RENAMEAT] = {
|
||||
.name = "RENAMEAT",
|
||||
.prep = io_renameat_prep,
|
||||
.issue = io_renameat,
|
||||
.cleanup = io_renameat_cleanup,
|
||||
},
|
||||
[IORING_OP_UNLINKAT] = {
|
||||
.name = "UNLINKAT",
|
||||
.prep = io_unlinkat_prep,
|
||||
.issue = io_unlinkat,
|
||||
.cleanup = io_unlinkat_cleanup,
|
||||
},
|
||||
[IORING_OP_MKDIRAT] = {
|
||||
.name = "MKDIRAT",
|
||||
.prep = io_mkdirat_prep,
|
||||
.issue = io_mkdirat,
|
||||
.cleanup = io_mkdirat_cleanup,
|
||||
},
|
||||
[IORING_OP_SYMLINKAT] = {
|
||||
.name = "SYMLINKAT",
|
||||
.prep = io_symlinkat_prep,
|
||||
.issue = io_symlinkat,
|
||||
.cleanup = io_link_cleanup,
|
||||
},
|
||||
[IORING_OP_LINKAT] = {
|
||||
.name = "LINKAT",
|
||||
.prep = io_linkat_prep,
|
||||
.issue = io_linkat,
|
||||
.cleanup = io_link_cleanup,
|
||||
},
|
||||
[IORING_OP_MSG_RING] = {
|
||||
.needs_file = 1,
|
||||
.iopoll = 1,
|
||||
.name = "MSG_RING",
|
||||
.prep = io_msg_ring_prep,
|
||||
.issue = io_msg_ring,
|
||||
},
|
||||
[IORING_OP_FSETXATTR] = {
|
||||
.needs_file = 1,
|
||||
.name = "FSETXATTR",
|
||||
.prep = io_fsetxattr_prep,
|
||||
.issue = io_fsetxattr,
|
||||
.cleanup = io_xattr_cleanup,
|
||||
},
|
||||
[IORING_OP_SETXATTR] = {
|
||||
.name = "SETXATTR",
|
||||
.prep = io_setxattr_prep,
|
||||
.issue = io_setxattr,
|
||||
.cleanup = io_xattr_cleanup,
|
||||
},
|
||||
[IORING_OP_FGETXATTR] = {
|
||||
.needs_file = 1,
|
||||
.name = "FGETXATTR",
|
||||
.prep = io_fgetxattr_prep,
|
||||
.issue = io_fgetxattr,
|
||||
.cleanup = io_xattr_cleanup,
|
||||
},
|
||||
[IORING_OP_GETXATTR] = {
|
||||
.name = "GETXATTR",
|
||||
.prep = io_getxattr_prep,
|
||||
.issue = io_getxattr,
|
||||
.cleanup = io_xattr_cleanup,
|
||||
},
|
||||
[IORING_OP_SOCKET] = {
|
||||
.audit_skip = 1,
|
||||
.name = "SOCKET",
|
||||
#if defined(CONFIG_NET)
|
||||
.prep = io_socket_prep,
|
||||
.issue = io_socket,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_URING_CMD] = {
|
||||
.needs_file = 1,
|
||||
.plug = 1,
|
||||
.name = "URING_CMD",
|
||||
.async_size = uring_cmd_pdu_size(1),
|
||||
.prep = io_uring_cmd_prep,
|
||||
.issue = io_uring_cmd,
|
||||
.prep_async = io_uring_cmd_prep_async,
|
||||
},
|
||||
};
|
||||
|
||||
const char *io_uring_get_opcode(u8 opcode)
|
||||
{
|
||||
if (opcode < IORING_OP_LAST)
|
||||
return io_op_defs[opcode].name;
|
||||
return "INVALID";
|
||||
}
|
||||
|
||||
void __init io_uring_optable_init(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) {
|
||||
BUG_ON(!io_op_defs[i].prep);
|
||||
if (io_op_defs[i].prep != io_eopnotsupp_prep)
|
||||
BUG_ON(!io_op_defs[i].issue);
|
||||
WARN_ON_ONCE(!io_op_defs[i].name);
|
||||
}
|
||||
}
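Illustration, not part of the diff: a simplified sketch of how the core is expected to consult this table. The real submission path in io_uring.c adds validation, deferral and async punting that are omitted here; only the table lookup, ->prep() and ->issue() flow is shown.

/*
 * Sketch only: look the opcode up once, prep while decoding the SQE,
 * then issue the request.
 */
static int example_dispatch(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			    unsigned int issue_flags)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	int ret;

	ret = def->prep(req, sqe);
	if (ret)
		return ret;
	return def->issue(req, issue_flags);
}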
|
io_uring/opdef.h | 42 (new file)
@@ -0,0 +1,42 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#ifndef IOU_OP_DEF_H
|
||||
#define IOU_OP_DEF_H
|
||||
|
||||
struct io_op_def {
|
||||
/* needs req->file assigned */
|
||||
unsigned needs_file : 1;
|
||||
/* should block plug */
|
||||
unsigned plug : 1;
|
||||
/* hash wq insertion if file is a regular file */
|
||||
unsigned hash_reg_file : 1;
|
||||
/* unbound wq insertion if file is a non-regular file */
|
||||
unsigned unbound_nonreg_file : 1;
|
||||
/* set if opcode supports polled "wait" */
|
||||
unsigned pollin : 1;
|
||||
unsigned pollout : 1;
|
||||
unsigned poll_exclusive : 1;
|
||||
/* op supports buffer selection */
|
||||
unsigned buffer_select : 1;
|
||||
/* opcode is not supported by this kernel */
|
||||
unsigned not_supported : 1;
|
||||
/* skip auditing */
|
||||
unsigned audit_skip : 1;
|
||||
/* supports ioprio */
|
||||
unsigned ioprio : 1;
|
||||
/* supports iopoll */
|
||||
unsigned iopoll : 1;
|
||||
/* size of async data needed, if any */
|
||||
unsigned short async_size;
|
||||
|
||||
const char *name;
|
||||
|
||||
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
|
||||
int (*issue)(struct io_kiocb *, unsigned int);
|
||||
int (*prep_async)(struct io_kiocb *);
|
||||
void (*cleanup)(struct io_kiocb *);
|
||||
};
|
||||
|
||||
extern const struct io_op_def io_op_defs[];
|
||||
|
||||
void io_uring_optable_init(void);
|
||||
#endif
|
io_uring/openclose.c | 256 (new file)
@@ -0,0 +1,256 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/fdtable.h>
|
||||
#include <linux/fsnotify.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "../fs/internal.h"
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "rsrc.h"
|
||||
#include "openclose.h"
|
||||
|
||||
struct io_open {
|
||||
struct file *file;
|
||||
int dfd;
|
||||
u32 file_slot;
|
||||
struct filename *filename;
|
||||
struct open_how how;
|
||||
unsigned long nofile;
|
||||
};
|
||||
|
||||
struct io_close {
|
||||
struct file *file;
|
||||
int fd;
|
||||
u32 file_slot;
|
||||
};
|
||||
|
||||
static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_open *open = io_kiocb_to_cmd(req);
|
||||
const char __user *fname;
|
||||
int ret;
|
||||
|
||||
if (unlikely(sqe->buf_index))
|
||||
return -EINVAL;
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
/* open.how should be already initialised */
|
||||
if (!(open->how.flags & O_PATH) && force_o_largefile())
|
||||
open->how.flags |= O_LARGEFILE;
|
||||
|
||||
open->dfd = READ_ONCE(sqe->fd);
|
||||
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
open->filename = getname(fname);
|
||||
if (IS_ERR(open->filename)) {
|
||||
ret = PTR_ERR(open->filename);
|
||||
open->filename = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
open->file_slot = READ_ONCE(sqe->file_index);
|
||||
if (open->file_slot && (open->how.flags & O_CLOEXEC))
|
||||
return -EINVAL;
|
||||
|
||||
open->nofile = rlimit(RLIMIT_NOFILE);
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_open *open = io_kiocb_to_cmd(req);
|
||||
u64 mode = READ_ONCE(sqe->len);
|
||||
u64 flags = READ_ONCE(sqe->open_flags);
|
||||
|
||||
open->how = build_open_how(flags, mode);
|
||||
return __io_openat_prep(req, sqe);
|
||||
}
|
||||
|
||||
int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_open *open = io_kiocb_to_cmd(req);
|
||||
struct open_how __user *how;
|
||||
size_t len;
|
||||
int ret;
|
||||
|
||||
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
|
||||
len = READ_ONCE(sqe->len);
|
||||
if (len < OPEN_HOW_SIZE_VER0)
|
||||
return -EINVAL;
|
||||
|
||||
ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return __io_openat_prep(req, sqe);
|
||||
}
|
||||
|
||||
int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_open *open = io_kiocb_to_cmd(req);
|
||||
struct open_flags op;
|
||||
struct file *file;
|
||||
bool resolve_nonblock, nonblock_set;
|
||||
bool fixed = !!open->file_slot;
|
||||
int ret;
|
||||
|
||||
ret = build_open_flags(&open->how, &op);
|
||||
if (ret)
|
||||
goto err;
|
||||
nonblock_set = op.open_flag & O_NONBLOCK;
|
||||
resolve_nonblock = open->how.resolve & RESOLVE_CACHED;
|
||||
if (issue_flags & IO_URING_F_NONBLOCK) {
|
||||
/*
|
||||
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
|
||||
* as it'll always return -EAGAIN
|
||||
*/
|
||||
if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
|
||||
return -EAGAIN;
|
||||
op.lookup_flags |= LOOKUP_CACHED;
|
||||
op.open_flag |= O_NONBLOCK;
|
||||
}
|
||||
|
||||
if (!fixed) {
|
||||
ret = __get_unused_fd_flags(open->how.flags, open->nofile);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
}
|
||||
|
||||
file = do_filp_open(open->dfd, open->filename, &op);
|
||||
if (IS_ERR(file)) {
|
||||
/*
|
||||
* We could hang on to this 'fd' on retrying, but seems like
|
||||
* marginal gain for something that is now known to be a slower
|
||||
* path. So just put it, and we'll get a new one when we retry.
|
||||
*/
|
||||
if (!fixed)
|
||||
put_unused_fd(ret);
|
||||
|
||||
ret = PTR_ERR(file);
|
||||
/* only retry if RESOLVE_CACHED wasn't already set by application */
|
||||
if (ret == -EAGAIN &&
|
||||
(!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
|
||||
return -EAGAIN;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
|
||||
file->f_flags &= ~O_NONBLOCK;
|
||||
fsnotify_open(file);
|
||||
|
||||
if (!fixed)
|
||||
fd_install(ret, file);
|
||||
else
|
||||
ret = io_fixed_fd_install(req, issue_flags, file,
|
||||
open->file_slot);
|
||||
err:
|
||||
putname(open->filename);
|
||||
req->flags &= ~REQ_F_NEED_CLEANUP;
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
int io_openat(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
return io_openat2(req, issue_flags);
|
||||
}
|
||||
|
||||
void io_open_cleanup(struct io_kiocb *req)
|
||||
{
|
||||
struct io_open *open = io_kiocb_to_cmd(req);
|
||||
|
||||
if (open->filename)
|
||||
putname(open->filename);
|
||||
}
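Illustration, not part of the diff: a userspace sketch of driving io_openat2() above, assuming liburing's io_uring_prep_openat2() helper. It places the open_how pointer in sqe->addr2 and its size in sqe->len, matching what the prep function reads.

/* sketch: open a file read-only via IORING_OP_OPENAT2 */
#include <fcntl.h>
#include <string.h>
#include <linux/openat2.h>
#include <liburing.h>

static int open_via_ring(struct io_uring *ring, const char *path)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	struct open_how how;
	int fd;

	memset(&how, 0, sizeof(how));
	how.flags = O_RDONLY;

	/* open_how pointer ends up in sqe->addr2, its size in sqe->len */
	io_uring_prep_openat2(sqe, AT_FDCWD, path, &how);
	io_uring_submit(ring);
	if (io_uring_wait_cqe(ring, &cqe))
		return -1;
	fd = cqe->res;		/* new descriptor, or negative errno */
	io_uring_cqe_seen(ring, cqe);
	return fd;
}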
|
||||
|
||||
int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
|
||||
unsigned int offset)
|
||||
{
|
||||
int ret;
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
ret = io_fixed_fd_remove(ctx, offset);
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_close *close = io_kiocb_to_cmd(req);
|
||||
|
||||
return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1);
|
||||
}
|
||||
|
||||
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_close *close = io_kiocb_to_cmd(req);
|
||||
|
||||
if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
|
||||
return -EINVAL;
|
||||
if (req->flags & REQ_F_FIXED_FILE)
|
||||
return -EBADF;
|
||||
|
||||
close->fd = READ_ONCE(sqe->fd);
|
||||
close->file_slot = READ_ONCE(sqe->file_index);
|
||||
if (close->file_slot && close->fd)
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_close(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct files_struct *files = current->files;
|
||||
struct io_close *close = io_kiocb_to_cmd(req);
|
||||
struct fdtable *fdt;
|
||||
struct file *file;
|
||||
int ret = -EBADF;
|
||||
|
||||
if (close->file_slot) {
|
||||
ret = io_close_fixed(req, issue_flags);
|
||||
goto err;
|
||||
}
|
||||
|
||||
spin_lock(&files->file_lock);
|
||||
fdt = files_fdtable(files);
|
||||
if (close->fd >= fdt->max_fds) {
|
||||
spin_unlock(&files->file_lock);
|
||||
goto err;
|
||||
}
|
||||
file = rcu_dereference_protected(fdt->fd[close->fd],
|
||||
lockdep_is_held(&files->file_lock));
|
||||
if (!file || io_is_uring_fops(file)) {
|
||||
spin_unlock(&files->file_lock);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* if the file has a flush method, be safe and punt to async */
|
||||
if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
|
||||
spin_unlock(&files->file_lock);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
file = __close_fd_get_file(close->fd);
|
||||
spin_unlock(&files->file_lock);
|
||||
if (!file)
|
||||
goto err;
|
||||
|
||||
/* No ->flush() or already async, safely close from here */
|
||||
ret = filp_close(file, current->files);
|
||||
err:
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
io_uring/openclose.h | 14 (new file)
@@ -0,0 +1,14 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
|
||||
unsigned int offset);
|
||||
|
||||
int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_openat(struct io_kiocb *req, unsigned int issue_flags);
|
||||
void io_open_cleanup(struct io_kiocb *req);
|
||||
|
||||
int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_close(struct io_kiocb *req, unsigned int issue_flags);
|
io_uring/poll.c | 965 (new file)
@@ -0,0 +1,965 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/poll.h>
|
||||
#include <linux/hashtable.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <trace/events/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "refs.h"
|
||||
#include "opdef.h"
|
||||
#include "kbuf.h"
|
||||
#include "poll.h"
|
||||
#include "cancel.h"
|
||||
|
||||
struct io_poll_update {
|
||||
struct file *file;
|
||||
u64 old_user_data;
|
||||
u64 new_user_data;
|
||||
__poll_t events;
|
||||
bool update_events;
|
||||
bool update_user_data;
|
||||
};
|
||||
|
||||
struct io_poll_table {
|
||||
struct poll_table_struct pt;
|
||||
struct io_kiocb *req;
|
||||
int nr_entries;
|
||||
int error;
|
||||
bool owning;
|
||||
/* output value, set only if arm poll returns >0 */
|
||||
__poll_t result_mask;
|
||||
};
|
||||
|
||||
#define IO_POLL_CANCEL_FLAG BIT(31)
|
||||
#define IO_POLL_REF_MASK GENMASK(30, 0)
|
||||
|
||||
#define IO_WQE_F_DOUBLE 1
|
||||
|
||||
static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
|
||||
{
|
||||
unsigned long priv = (unsigned long)wqe->private;
|
||||
|
||||
return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE);
|
||||
}
|
||||
|
||||
static inline bool wqe_is_double(struct wait_queue_entry *wqe)
|
||||
{
|
||||
unsigned long priv = (unsigned long)wqe->private;
|
||||
|
||||
return priv & IO_WQE_F_DOUBLE;
|
||||
}
|
||||
|
||||
/*
|
||||
* If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
|
||||
* bump it and acquire ownership. It's disallowed to modify requests while not
|
||||
* owning it, which prevents races when enqueueing task_work and between
|
||||
* arming poll and wakeups.
|
||||
*/
|
||||
static inline bool io_poll_get_ownership(struct io_kiocb *req)
|
||||
{
|
||||
return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
|
||||
}
|
||||
|
||||
static void io_poll_mark_cancelled(struct io_kiocb *req)
|
||||
{
|
||||
atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
|
||||
}
|
||||
|
||||
static struct io_poll *io_poll_get_double(struct io_kiocb *req)
|
||||
{
|
||||
/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
|
||||
if (req->opcode == IORING_OP_POLL_ADD)
|
||||
return req->async_data;
|
||||
return req->apoll->double_poll;
|
||||
}
|
||||
|
||||
static struct io_poll *io_poll_get_single(struct io_kiocb *req)
|
||||
{
|
||||
if (req->opcode == IORING_OP_POLL_ADD)
|
||||
return io_kiocb_to_cmd(req);
|
||||
return &req->apoll->poll;
|
||||
}
|
||||
|
||||
static void io_poll_req_insert(struct io_kiocb *req)
|
||||
{
|
||||
struct io_hash_table *table = &req->ctx->cancel_table;
|
||||
u32 index = hash_long(req->cqe.user_data, table->hash_bits);
|
||||
struct io_hash_bucket *hb = &table->hbs[index];
|
||||
|
||||
spin_lock(&hb->lock);
|
||||
hlist_add_head(&req->hash_node, &hb->list);
|
||||
spin_unlock(&hb->lock);
|
||||
}
|
||||
|
||||
static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_hash_table *table = &req->ctx->cancel_table;
|
||||
u32 index = hash_long(req->cqe.user_data, table->hash_bits);
|
||||
spinlock_t *lock = &table->hbs[index].lock;
|
||||
|
||||
spin_lock(lock);
|
||||
hash_del(&req->hash_node);
|
||||
spin_unlock(lock);
|
||||
}
|
||||
|
||||
static void io_poll_req_insert_locked(struct io_kiocb *req)
|
||||
{
|
||||
struct io_hash_table *table = &req->ctx->cancel_table_locked;
|
||||
u32 index = hash_long(req->cqe.user_data, table->hash_bits);
|
||||
|
||||
hlist_add_head(&req->hash_node, &table->hbs[index].list);
|
||||
}
|
||||
|
||||
static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
if (req->flags & REQ_F_HASH_LOCKED) {
|
||||
/*
|
||||
* ->cancel_table_locked is protected by ->uring_lock in
|
||||
* contrast to per bucket spinlocks. Likely, tctx_task_work()
|
||||
* already grabbed the mutex for us, but there is a chance it
|
||||
* failed.
|
||||
*/
|
||||
io_tw_lock(ctx, locked);
|
||||
hash_del(&req->hash_node);
|
||||
req->flags &= ~REQ_F_HASH_LOCKED;
|
||||
} else {
|
||||
io_poll_req_delete(req, ctx);
|
||||
}
|
||||
}
|
||||
|
||||
static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
|
||||
wait_queue_func_t wake_func)
|
||||
{
|
||||
poll->head = NULL;
|
||||
#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
|
||||
/* mask in events that we always want/need */
|
||||
poll->events = events | IO_POLL_UNMASK;
|
||||
INIT_LIST_HEAD(&poll->wait.entry);
|
||||
init_waitqueue_func_entry(&poll->wait, wake_func);
|
||||
}
|
||||
|
||||
static inline void io_poll_remove_entry(struct io_poll *poll)
|
||||
{
|
||||
struct wait_queue_head *head = smp_load_acquire(&poll->head);
|
||||
|
||||
if (head) {
|
||||
spin_lock_irq(&head->lock);
|
||||
list_del_init(&poll->wait.entry);
|
||||
poll->head = NULL;
|
||||
spin_unlock_irq(&head->lock);
|
||||
}
|
||||
}
|
||||
|
||||
static void io_poll_remove_entries(struct io_kiocb *req)
|
||||
{
|
||||
/*
|
||||
* Nothing to do if neither of those flags are set. Avoid dipping
|
||||
* into the poll/apoll/double cachelines if we can.
|
||||
*/
|
||||
if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
|
||||
return;
|
||||
|
||||
/*
|
||||
* While we hold the waitqueue lock and the waitqueue is nonempty,
|
||||
* wake_up_pollfree() will wait for us. However, taking the waitqueue
|
||||
* lock in the first place can race with the waitqueue being freed.
|
||||
*
|
||||
* We solve this as eventpoll does: by taking advantage of the fact that
|
||||
* all users of wake_up_pollfree() will RCU-delay the actual free. If
|
||||
* we enter rcu_read_lock() and see that the pointer to the queue is
|
||||
* non-NULL, we can then lock it without the memory being freed out from
|
||||
* under us.
|
||||
*
|
||||
* Keep holding rcu_read_lock() as long as we hold the queue lock, in
|
||||
* case the caller deletes the entry from the queue, leaving it empty.
|
||||
* In that case, only RCU prevents the queue memory from being freed.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
if (req->flags & REQ_F_SINGLE_POLL)
|
||||
io_poll_remove_entry(io_poll_get_single(req));
|
||||
if (req->flags & REQ_F_DOUBLE_POLL)
|
||||
io_poll_remove_entry(io_poll_get_double(req));
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
enum {
|
||||
IOU_POLL_DONE = 0,
|
||||
IOU_POLL_NO_ACTION = 1,
|
||||
IOU_POLL_REMOVE_POLL_USE_RES = 2,
|
||||
};
|
||||
|
||||
/*
|
||||
* All poll tw should go through this. Checks for poll events, manages
|
||||
* references, does rewait, etc.
|
||||
*
|
||||
* Returns a negative error on failure. IOU_POLL_NO_ACTION when no action is required,
|
||||
* which means either a spurious wakeup or a multishot CQE was served.
|
||||
* IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res.
|
||||
* IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot poll and that the result
|
||||
* is stored in req->cqe.
|
||||
*/
|
||||
static int io_poll_check_events(struct io_kiocb *req, bool *locked)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
int v, ret;
|
||||
|
||||
/* req->task == current here, checking PF_EXITING is safe */
|
||||
if (unlikely(req->task->flags & PF_EXITING))
|
||||
return -ECANCELED;
|
||||
|
||||
do {
|
||||
v = atomic_read(&req->poll_refs);
|
||||
|
||||
/* tw handler should be the owner, and so have some references */
|
||||
if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
|
||||
return IOU_POLL_DONE;
|
||||
if (v & IO_POLL_CANCEL_FLAG)
|
||||
return -ECANCELED;
|
||||
|
||||
/* the mask was stashed in __io_poll_execute */
|
||||
if (!req->cqe.res) {
|
||||
struct poll_table_struct pt = { ._key = req->apoll_events };
|
||||
req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
|
||||
}
|
||||
|
||||
if ((unlikely(!req->cqe.res)))
|
||||
continue;
|
||||
if (req->apoll_events & EPOLLONESHOT)
|
||||
return IOU_POLL_DONE;
|
||||
|
||||
/* multishot, just fill a CQE and proceed */
|
||||
if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
|
||||
__poll_t mask = mangle_poll(req->cqe.res &
|
||||
req->apoll_events);
|
||||
|
||||
if (!io_post_aux_cqe(ctx, req->cqe.user_data,
|
||||
mask, IORING_CQE_F_MORE, false)) {
|
||||
io_req_set_res(req, mask, 0);
|
||||
return IOU_POLL_REMOVE_POLL_USE_RES;
|
||||
}
|
||||
} else {
|
||||
ret = io_poll_issue(req, locked);
|
||||
if (ret == IOU_STOP_MULTISHOT)
|
||||
return IOU_POLL_REMOVE_POLL_USE_RES;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Release all references, retry if someone tried to restart
|
||||
* task_work while we were executing it.
|
||||
*/
|
||||
} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
|
||||
|
||||
return IOU_POLL_NO_ACTION;
|
||||
}
|
||||
|
||||
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = io_poll_check_events(req, locked);
|
||||
if (ret == IOU_POLL_NO_ACTION)
|
||||
return;
|
||||
|
||||
if (ret == IOU_POLL_DONE) {
|
||||
struct io_poll *poll = io_kiocb_to_cmd(req);
|
||||
req->cqe.res = mangle_poll(req->cqe.res & poll->events);
|
||||
} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
|
||||
req->cqe.res = ret;
|
||||
req_set_fail(req);
|
||||
}
|
||||
|
||||
io_poll_remove_entries(req);
|
||||
io_poll_tw_hash_eject(req, locked);
|
||||
|
||||
io_req_set_res(req, req->cqe.res, 0);
|
||||
io_req_task_complete(req, locked);
|
||||
}
|
||||
|
||||
static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = io_poll_check_events(req, locked);
|
||||
if (ret == IOU_POLL_NO_ACTION)
|
||||
return;
|
||||
|
||||
io_poll_remove_entries(req);
|
||||
io_poll_tw_hash_eject(req, locked);
|
||||
|
||||
if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
|
||||
io_req_complete_post(req);
|
||||
else if (ret == IOU_POLL_DONE)
|
||||
io_req_task_submit(req, locked);
|
||||
else
|
||||
io_req_complete_failed(req, ret);
|
||||
}
|
||||
|
||||
static void __io_poll_execute(struct io_kiocb *req, int mask)
|
||||
{
|
||||
io_req_set_res(req, mask, 0);
|
||||
/*
|
||||
* This is useful for poll that is armed on behalf of another
|
||||
* request, and where the wakeup path could be on a different
|
||||
* CPU. We want to avoid pulling in req->apoll->events for that
|
||||
* case.
|
||||
*/
|
||||
if (req->opcode == IORING_OP_POLL_ADD)
|
||||
req->io_task_work.func = io_poll_task_func;
|
||||
else
|
||||
req->io_task_work.func = io_apoll_task_func;
|
||||
|
||||
trace_io_uring_task_add(req, mask);
|
||||
io_req_task_work_add(req);
|
||||
}
|
||||
|
||||
static inline void io_poll_execute(struct io_kiocb *req, int res)
|
||||
{
|
||||
if (io_poll_get_ownership(req))
|
||||
__io_poll_execute(req, res);
|
||||
}
|
||||
|
||||
static void io_poll_cancel_req(struct io_kiocb *req)
|
||||
{
|
||||
io_poll_mark_cancelled(req);
|
||||
/* kick tw, which should complete the request */
|
||||
io_poll_execute(req, 0);
|
||||
}
|
||||
|
||||
#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI)
|
||||
|
||||
static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
|
||||
{
|
||||
io_poll_mark_cancelled(req);
|
||||
/* we have to kick tw in case it's not already */
|
||||
io_poll_execute(req, 0);
|
||||
|
||||
/*
|
||||
* If the waitqueue is being freed early but someone already
|
||||
* holds ownership over it, we have to tear down the request as
|
||||
* best we can. That means immediately removing the request from
|
||||
* its waitqueue and preventing all further accesses to the
|
||||
* waitqueue via the request.
|
||||
*/
|
||||
list_del_init(&poll->wait.entry);
|
||||
|
||||
/*
|
||||
* Careful: this *must* be the last step, since as soon
|
||||
* as req->head is NULL'ed out, the request can be
|
||||
* completed and freed, since aio_poll_complete_work()
|
||||
* will no longer need to take the waitqueue lock.
|
||||
*/
|
||||
smp_store_release(&poll->head, NULL);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
|
||||
void *key)
|
||||
{
|
||||
struct io_kiocb *req = wqe_to_req(wait);
|
||||
struct io_poll *poll = container_of(wait, struct io_poll, wait);
|
||||
__poll_t mask = key_to_poll(key);
|
||||
|
||||
if (unlikely(mask & POLLFREE))
|
||||
return io_pollfree_wake(req, poll);
|
||||
|
||||
/* for instances that support it check for an event match first */
|
||||
if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
|
||||
return 0;
|
||||
|
||||
if (io_poll_get_ownership(req)) {
|
||||
/* optional, saves extra locking for removal in tw handler */
|
||||
if (mask && poll->events & EPOLLONESHOT) {
|
||||
list_del_init(&poll->wait.entry);
|
||||
poll->head = NULL;
|
||||
if (wqe_is_double(wait))
|
||||
req->flags &= ~REQ_F_DOUBLE_POLL;
|
||||
else
|
||||
req->flags &= ~REQ_F_SINGLE_POLL;
|
||||
}
|
||||
__io_poll_execute(req, mask);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void io_poll_double_prepare(struct io_kiocb *req)
|
||||
{
|
||||
struct wait_queue_head *head;
|
||||
struct io_poll *poll = io_poll_get_single(req);
|
||||
|
||||
/* head is RCU protected, see io_poll_remove_entries() comments */
|
||||
rcu_read_lock();
|
||||
head = smp_load_acquire(&poll->head);
|
||||
/*
|
||||
* poll arm may not hold ownership and so race with
|
||||
* io_poll_wake() by modifying req->flags. There is only one
|
||||
* poll entry queued, serialise with it by taking its head lock.
|
||||
*/
|
||||
if (head)
|
||||
spin_lock_irq(&head->lock);
|
||||
|
||||
req->flags |= REQ_F_DOUBLE_POLL;
|
||||
if (req->opcode == IORING_OP_POLL_ADD)
|
||||
req->flags |= REQ_F_ASYNC_DATA;
|
||||
|
||||
if (head)
|
||||
spin_unlock_irq(&head->lock);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
|
||||
struct wait_queue_head *head,
|
||||
struct io_poll **poll_ptr)
|
||||
{
|
||||
struct io_kiocb *req = pt->req;
|
||||
unsigned long wqe_private = (unsigned long) req;
|
||||
|
||||
/*
|
||||
* The file being polled uses multiple waitqueues for poll handling
|
||||
* (e.g. one for read, one for write). Setup a separate io_poll
|
||||
* if this happens.
|
||||
*/
|
||||
if (unlikely(pt->nr_entries)) {
|
||||
struct io_poll *first = poll;
|
||||
|
||||
/* double add on the same waitqueue head, ignore */
|
||||
if (first->head == head)
|
||||
return;
|
||||
/* already have a 2nd entry, fail a third attempt */
|
||||
if (*poll_ptr) {
|
||||
if ((*poll_ptr)->head == head)
|
||||
return;
|
||||
pt->error = -EINVAL;
|
||||
return;
|
||||
}
|
||||
|
||||
poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
|
||||
if (!poll) {
|
||||
pt->error = -ENOMEM;
|
||||
return;
|
||||
}
|
||||
|
||||
/* mark as double wq entry */
|
||||
wqe_private |= IO_WQE_F_DOUBLE;
|
||||
io_init_poll_iocb(poll, first->events, first->wait.func);
|
||||
io_poll_double_prepare(req);
|
||||
*poll_ptr = poll;
|
||||
} else {
|
||||
/* fine to modify, there is no poll queued to race with us */
|
||||
req->flags |= REQ_F_SINGLE_POLL;
|
||||
}
|
||||
|
||||
pt->nr_entries++;
|
||||
poll->head = head;
|
||||
poll->wait.private = (void *) wqe_private;
|
||||
|
||||
if (poll->events & EPOLLEXCLUSIVE)
|
||||
add_wait_queue_exclusive(head, &poll->wait);
|
||||
else
|
||||
add_wait_queue(head, &poll->wait);
|
||||
}
|
||||
|
||||
static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
|
||||
struct poll_table_struct *p)
|
||||
{
|
||||
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
|
||||
struct io_poll *poll = io_kiocb_to_cmd(pt->req);
|
||||
|
||||
__io_queue_proc(poll, pt, head,
|
||||
(struct io_poll **) &pt->req->async_data);
|
||||
}
|
||||
|
||||
static bool io_poll_can_finish_inline(struct io_kiocb *req,
|
||||
struct io_poll_table *pt)
|
||||
{
|
||||
return pt->owning || io_poll_get_ownership(req);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns 0 when it's handed over for polling. The caller owns the request if
|
||||
* it returns non-zero, but otherwise should not touch it. Negative values
|
||||
* contain an error code. When the result is >0, the polling has completed
|
||||
* inline and ipt.result_mask is set to the mask.
|
||||
*/
|
||||
static int __io_arm_poll_handler(struct io_kiocb *req,
|
||||
struct io_poll *poll,
|
||||
struct io_poll_table *ipt, __poll_t mask,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
int v;
|
||||
|
||||
INIT_HLIST_NODE(&req->hash_node);
|
||||
req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
|
||||
io_init_poll_iocb(poll, mask, io_poll_wake);
|
||||
poll->file = req->file;
|
||||
req->apoll_events = poll->events;
|
||||
|
||||
ipt->pt._key = mask;
|
||||
ipt->req = req;
|
||||
ipt->error = 0;
|
||||
ipt->nr_entries = 0;
|
||||
/*
|
||||
* Polling is either completed here or via task_work, so if we're in the
|
||||
* task context we're naturally serialised with tw by merit of running
|
||||
* the same task. When it's io-wq, take the ownership to prevent tw
|
||||
* from running. However, when we're in the task context, skip taking
|
||||
* it as an optimisation.
|
||||
*
|
||||
* Note: even though the request won't be completed/freed, without
|
||||
* ownership we still can race with io_poll_wake().
|
||||
* io_poll_can_finish_inline() tries to deal with that.
|
||||
*/
|
||||
ipt->owning = issue_flags & IO_URING_F_UNLOCKED;
|
||||
atomic_set(&req->poll_refs, (int)ipt->owning);
|
||||
|
||||
/* io-wq doesn't hold uring_lock */
|
||||
if (issue_flags & IO_URING_F_UNLOCKED)
|
||||
req->flags &= ~REQ_F_HASH_LOCKED;
|
||||
|
||||
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
|
||||
|
||||
if (unlikely(ipt->error || !ipt->nr_entries)) {
|
||||
io_poll_remove_entries(req);
|
||||
|
||||
if (!io_poll_can_finish_inline(req, ipt)) {
|
||||
io_poll_mark_cancelled(req);
|
||||
return 0;
|
||||
} else if (mask && (poll->events & EPOLLET)) {
|
||||
ipt->result_mask = mask;
|
||||
return 1;
|
||||
}
|
||||
return ipt->error ?: -EINVAL;
|
||||
}
|
||||
|
||||
if (mask &&
|
||||
((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
|
||||
if (!io_poll_can_finish_inline(req, ipt))
|
||||
return 0;
|
||||
io_poll_remove_entries(req);
|
||||
ipt->result_mask = mask;
|
||||
/* no one else has access to the req, forget about the ref */
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (req->flags & REQ_F_HASH_LOCKED)
|
||||
io_poll_req_insert_locked(req);
|
||||
else
|
||||
io_poll_req_insert(req);
|
||||
|
||||
if (mask && (poll->events & EPOLLET) &&
|
||||
io_poll_can_finish_inline(req, ipt)) {
|
||||
__io_poll_execute(req, mask);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ipt->owning) {
|
||||
/*
|
||||
* Release ownership. If someone tried to queue a tw while it was
|
||||
* locked, kick it off for them.
|
||||
*/
|
||||
v = atomic_dec_return(&req->poll_refs);
|
||||
if (unlikely(v & IO_POLL_REF_MASK))
|
||||
__io_poll_execute(req, 0);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
|
||||
struct poll_table_struct *p)
|
||||
{
|
||||
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
|
||||
struct async_poll *apoll = pt->req->apoll;
|
||||
|
||||
__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
|
||||
}
|
||||
|
||||
static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_cache_entry *entry;
|
||||
struct async_poll *apoll;
|
||||
|
||||
if (req->flags & REQ_F_POLLED) {
|
||||
apoll = req->apoll;
|
||||
kfree(apoll->double_poll);
|
||||
} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
|
||||
(entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) {
|
||||
apoll = container_of(entry, struct async_poll, cache);
|
||||
} else {
|
||||
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
|
||||
if (unlikely(!apoll))
|
||||
return NULL;
|
||||
}
|
||||
apoll->double_poll = NULL;
|
||||
req->apoll = apoll;
|
||||
return apoll;
|
||||
}
|
||||
|
||||
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
|
||||
{
|
||||
const struct io_op_def *def = &io_op_defs[req->opcode];
|
||||
struct async_poll *apoll;
|
||||
struct io_poll_table ipt;
|
||||
__poll_t mask = POLLPRI | POLLERR | EPOLLET;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* apoll requests already grab the mutex to complete in the tw handler,
|
||||
* so removal from the mutex-backed hash is free, use it by default.
|
||||
*/
|
||||
req->flags |= REQ_F_HASH_LOCKED;
|
||||
|
||||
if (!def->pollin && !def->pollout)
|
||||
return IO_APOLL_ABORTED;
|
||||
if (!file_can_poll(req->file))
|
||||
return IO_APOLL_ABORTED;
|
||||
if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
|
||||
return IO_APOLL_ABORTED;
|
||||
if (!(req->flags & REQ_F_APOLL_MULTISHOT))
|
||||
mask |= EPOLLONESHOT;
|
||||
|
||||
if (def->pollin) {
|
||||
mask |= EPOLLIN | EPOLLRDNORM;
|
||||
|
||||
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
|
||||
if (req->flags & REQ_F_CLEAR_POLLIN)
|
||||
mask &= ~EPOLLIN;
|
||||
} else {
|
||||
mask |= EPOLLOUT | EPOLLWRNORM;
|
||||
}
|
||||
if (def->poll_exclusive)
|
||||
mask |= EPOLLEXCLUSIVE;
|
||||
|
||||
apoll = io_req_alloc_apoll(req, issue_flags);
|
||||
if (!apoll)
|
||||
return IO_APOLL_ABORTED;
|
||||
req->flags |= REQ_F_POLLED;
|
||||
ipt.pt._qproc = io_async_queue_proc;
|
||||
|
||||
io_kbuf_recycle(req, issue_flags);
|
||||
|
||||
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags);
|
||||
if (ret)
|
||||
return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED;
|
||||
trace_io_uring_poll_arm(req, mask, apoll->poll.events);
|
||||
return IO_APOLL_OK;
|
||||
}
|
||||
|
||||
static __cold bool io_poll_remove_all_table(struct task_struct *tsk,
|
||||
struct io_hash_table *table,
|
||||
bool cancel_all)
|
||||
{
|
||||
unsigned nr_buckets = 1U << table->hash_bits;
|
||||
struct hlist_node *tmp;
|
||||
struct io_kiocb *req;
|
||||
bool found = false;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nr_buckets; i++) {
|
||||
struct io_hash_bucket *hb = &table->hbs[i];
|
||||
|
||||
spin_lock(&hb->lock);
|
||||
hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
|
||||
if (io_match_task_safe(req, tsk, cancel_all)) {
|
||||
hlist_del_init(&req->hash_node);
|
||||
io_poll_cancel_req(req);
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
spin_unlock(&hb->lock);
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if we found and killed one or more poll requests
|
||||
*/
|
||||
__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
|
||||
bool cancel_all)
|
||||
__must_hold(&ctx->uring_lock)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all);
|
||||
ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
|
||||
struct io_cancel_data *cd,
|
||||
struct io_hash_table *table,
|
||||
struct io_hash_bucket **out_bucket)
|
||||
{
|
||||
struct io_kiocb *req;
|
||||
u32 index = hash_long(cd->data, table->hash_bits);
|
||||
struct io_hash_bucket *hb = &table->hbs[index];
|
||||
|
||||
*out_bucket = NULL;
|
||||
|
||||
spin_lock(&hb->lock);
|
||||
hlist_for_each_entry(req, &hb->list, hash_node) {
|
||||
if (cd->data != req->cqe.user_data)
|
||||
continue;
|
||||
if (poll_only && req->opcode != IORING_OP_POLL_ADD)
|
||||
continue;
|
||||
if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
|
||||
if (cd->seq == req->work.cancel_seq)
|
||||
continue;
|
||||
req->work.cancel_seq = cd->seq;
|
||||
}
|
||||
*out_bucket = hb;
|
||||
return req;
|
||||
}
|
||||
spin_unlock(&hb->lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
|
||||
struct io_cancel_data *cd,
|
||||
struct io_hash_table *table,
|
||||
struct io_hash_bucket **out_bucket)
|
||||
{
|
||||
unsigned nr_buckets = 1U << table->hash_bits;
|
||||
struct io_kiocb *req;
|
||||
int i;
|
||||
|
||||
*out_bucket = NULL;
|
||||
|
||||
for (i = 0; i < nr_buckets; i++) {
|
||||
struct io_hash_bucket *hb = &table->hbs[i];
|
||||
|
||||
spin_lock(&hb->lock);
|
||||
hlist_for_each_entry(req, &hb->list, hash_node) {
|
||||
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
|
||||
req->file != cd->file)
|
||||
continue;
|
||||
if (cd->seq == req->work.cancel_seq)
|
||||
continue;
|
||||
req->work.cancel_seq = cd->seq;
|
||||
*out_bucket = hb;
|
||||
return req;
|
||||
}
|
||||
spin_unlock(&hb->lock);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int io_poll_disarm(struct io_kiocb *req)
|
||||
{
|
||||
if (!req)
|
||||
return -ENOENT;
|
||||
if (!io_poll_get_ownership(req))
|
||||
return -EALREADY;
|
||||
io_poll_remove_entries(req);
|
||||
hash_del(&req->hash_node);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
|
||||
struct io_hash_table *table)
|
||||
{
|
||||
struct io_hash_bucket *bucket;
|
||||
struct io_kiocb *req;
|
||||
|
||||
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
|
||||
req = io_poll_file_find(ctx, cd, table, &bucket);
|
||||
else
|
||||
req = io_poll_find(ctx, false, cd, table, &bucket);
|
||||
|
||||
if (req)
|
||||
io_poll_cancel_req(req);
|
||||
if (bucket)
|
||||
spin_unlock(&bucket->lock);
|
||||
return req ? 0 : -ENOENT;
|
||||
}
|
||||
|
||||
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table);
|
||||
if (ret != -ENOENT)
|
||||
return ret;
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked);
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
|
||||
unsigned int flags)
|
||||
{
|
||||
u32 events;
|
||||
|
||||
events = READ_ONCE(sqe->poll32_events);
|
||||
#ifdef __BIG_ENDIAN
|
||||
events = swahw32(events);
|
||||
#endif
|
||||
if (!(flags & IORING_POLL_ADD_MULTI))
|
||||
events |= EPOLLONESHOT;
|
||||
if (!(flags & IORING_POLL_ADD_LEVEL))
|
||||
events |= EPOLLET;
|
||||
return demangle_poll(events) |
|
||||
(events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET));
|
||||
}
|
||||
|
||||
int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_poll_update *upd = io_kiocb_to_cmd(req);
|
||||
u32 flags;
|
||||
|
||||
if (sqe->buf_index || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
flags = READ_ONCE(sqe->len);
|
||||
if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
|
||||
IORING_POLL_ADD_MULTI))
|
||||
return -EINVAL;
|
||||
/* meaningless without update */
|
||||
if (flags == IORING_POLL_ADD_MULTI)
|
||||
return -EINVAL;
|
||||
|
||||
upd->old_user_data = READ_ONCE(sqe->addr);
|
||||
upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
|
||||
upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
|
||||
|
||||
upd->new_user_data = READ_ONCE(sqe->off);
|
||||
if (!upd->update_user_data && upd->new_user_data)
|
||||
return -EINVAL;
|
||||
if (upd->update_events)
|
||||
upd->events = io_poll_parse_events(sqe, flags);
|
||||
else if (sqe->poll32_events)
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_poll *poll = io_kiocb_to_cmd(req);
|
||||
u32 flags;
|
||||
|
||||
if (sqe->buf_index || sqe->off || sqe->addr)
|
||||
return -EINVAL;
|
||||
flags = READ_ONCE(sqe->len);
|
||||
if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_ADD_LEVEL))
|
||||
return -EINVAL;
|
||||
if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
|
||||
return -EINVAL;
|
||||
|
||||
poll->events = io_poll_parse_events(sqe, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_poll *poll = io_kiocb_to_cmd(req);
|
||||
struct io_poll_table ipt;
|
||||
int ret;
|
||||
|
||||
ipt.pt._qproc = io_poll_queue_proc;
|
||||
|
||||
/*
|
||||
* If sqpoll or single issuer, there is no contention for ->uring_lock
|
||||
* and we'll end up holding it in tw handlers anyway.
|
||||
*/
|
||||
if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER))
|
||||
req->flags |= REQ_F_HASH_LOCKED;
|
||||
|
||||
ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
|
||||
if (ret > 0) {
|
||||
io_req_set_res(req, ipt.result_mask, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
return ret ?: IOU_ISSUE_SKIP_COMPLETE;
|
||||
}
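
The prep/issue pair above is driven entirely by SQE fields: io_poll_add_prep() takes the event mask from sqe->poll32_events and the IORING_POLL_* flags from sqe->len. As a hedged illustration (not part of this patch), a raw multishot poll SQE could be filled in as below; field names come from the uapi header and a zeroed SQE is assumed.

/*
 * Illustrative sketch only: a raw SQE for IORING_OP_POLL_ADD in multishot
 * mode, matching the fields io_poll_add_prep() reads above.  Assumes the
 * SQE has been zeroed; big-endian callers additionally need the halfword
 * swap that io_poll_parse_events() undoes with swahw32().
 */
#include <linux/io_uring.h>

static void prep_poll_multishot(struct io_uring_sqe *sqe, int fd,
				unsigned int poll_mask)
{
	sqe->opcode = IORING_OP_POLL_ADD;
	sqe->fd = fd;
	sqe->poll32_events = poll_mask;		/* e.g. POLLIN */
	sqe->len = IORING_POLL_ADD_MULTI;	/* rearm after every completion */
}
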
|
||||
|
||||
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_poll_update *poll_update = io_kiocb_to_cmd(req);
|
||||
struct io_cancel_data cd = { .data = poll_update->old_user_data, };
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_hash_bucket *bucket;
|
||||
struct io_kiocb *preq;
|
||||
int ret2, ret = 0;
|
||||
bool locked;
|
||||
|
||||
preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
|
||||
ret2 = io_poll_disarm(preq);
|
||||
if (bucket)
|
||||
spin_unlock(&bucket->lock);
|
||||
if (!ret2)
|
||||
goto found;
|
||||
if (ret2 != -ENOENT) {
|
||||
ret = ret2;
|
||||
goto out;
|
||||
}
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket);
|
||||
ret2 = io_poll_disarm(preq);
|
||||
if (bucket)
|
||||
spin_unlock(&bucket->lock);
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
if (ret2) {
|
||||
ret = ret2;
|
||||
goto out;
|
||||
}
|
||||
|
||||
found:
|
||||
if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (poll_update->update_events || poll_update->update_user_data) {
|
||||
/* only mask one event flags, keep behavior flags */
|
||||
if (poll_update->update_events) {
|
||||
struct io_poll *poll = io_kiocb_to_cmd(preq);
|
||||
|
||||
poll->events &= ~0xffff;
|
||||
poll->events |= poll_update->events & 0xffff;
|
||||
poll->events |= IO_POLL_UNMASK;
|
||||
}
|
||||
if (poll_update->update_user_data)
|
||||
preq->cqe.user_data = poll_update->new_user_data;
|
||||
|
||||
ret2 = io_poll_add(preq, issue_flags);
|
||||
/* successfully updated, don't complete poll request */
|
||||
if (!ret2 || ret2 == -EIOCBQUEUED)
|
||||
goto out;
|
||||
}
|
||||
|
||||
req_set_fail(preq);
|
||||
io_req_set_res(preq, -ECANCELED, 0);
|
||||
locked = !(issue_flags & IO_URING_F_UNLOCKED);
|
||||
io_req_task_complete(preq, &locked);
|
||||
out:
|
||||
if (ret < 0) {
|
||||
req_set_fail(req);
|
||||
return ret;
|
||||
}
|
||||
/* complete update request, we're done with it */
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
void io_apoll_cache_free(struct io_cache_entry *entry)
|
||||
{
|
||||
kfree(container_of(entry, struct async_poll, cache));
|
||||
}
|
io_uring/poll.h (new file, 39 lines)
@@ -0,0 +1,39 @@
// SPDX-License-Identifier: GPL-2.0

#include "alloc_cache.h"

enum {
	IO_APOLL_OK,
	IO_APOLL_ABORTED,
	IO_APOLL_READY
};

struct io_poll {
	struct file *file;
	struct wait_queue_head *head;
	__poll_t events;
	struct wait_queue_entry wait;
};

struct async_poll {
	union {
		struct io_poll poll;
		struct io_cache_entry cache;
	};
	struct io_poll *double_poll;
};

int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_poll_add(struct io_kiocb *req, unsigned int issue_flags);

int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags);

struct io_cancel_data;
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
		   unsigned issue_flags);
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
			bool cancel_all);

void io_apoll_cache_free(struct io_cache_entry *entry);
io_uring/refs.h (new file, 48 lines)
@@ -0,0 +1,48 @@
#ifndef IOU_REQ_REF_H
#define IOU_REQ_REF_H

#include <linux/atomic.h>
#include <linux/io_uring_types.h>

/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 */
#define req_ref_zero_or_close_to_overflow(req)	\
	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)

static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
{
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
	return atomic_inc_not_zero(&req->refs);
}

static inline bool req_ref_put_and_test(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_REFCOUNT)))
		return true;

	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	return atomic_dec_and_test(&req->refs);
}

static inline void req_ref_get(struct io_kiocb *req)
{
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	atomic_inc(&req->refs);
}

static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
	if (!(req->flags & REQ_F_REFCOUNT)) {
		req->flags |= REQ_F_REFCOUNT;
		atomic_set(&req->refs, nr);
	}
}

static inline void io_req_set_refcount(struct io_kiocb *req)
{
	__io_req_set_refcount(req, 1);
}
#endif
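
The overflow guard above leans on unsigned wraparound: any refcount in the range [-127, 0] trips the check, while healthy positive counts pass. A small standalone demonstration of the arithmetic (plain userspace C, not kernel code):

/* Standalone demo of the refs.h underflow guard above; not kernel code. */
#include <stdio.h>

static int close_to_overflow(int refs)
{
	/* same expression as req_ref_zero_or_close_to_overflow() */
	return (unsigned int)refs + 127u <= 127u;
}

int main(void)
{
	int samples[] = { 2, 1, 0, -1, -127, -128 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("refs=%4d -> caught=%d\n", samples[i],
		       close_to_overflow(samples[i]));
	return 0;
}
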
io_uring/rsrc.c (new file, 1373 lines; diff suppressed because it is too large)
io_uring/rsrc.h (new file, 166 lines)
@@ -0,0 +1,166 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#ifndef IOU_RSRC_H
|
||||
#define IOU_RSRC_H
|
||||
|
||||
#include <net/af_unix.h>
|
||||
|
||||
#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
|
||||
#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
|
||||
#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
|
||||
|
||||
enum {
|
||||
IORING_RSRC_FILE = 0,
|
||||
IORING_RSRC_BUFFER = 1,
|
||||
};
|
||||
|
||||
struct io_rsrc_put {
|
||||
struct list_head list;
|
||||
u64 tag;
|
||||
union {
|
||||
void *rsrc;
|
||||
struct file *file;
|
||||
struct io_mapped_ubuf *buf;
|
||||
};
|
||||
};
|
||||
|
||||
typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
|
||||
|
||||
struct io_rsrc_data {
|
||||
struct io_ring_ctx *ctx;
|
||||
|
||||
u64 **tags;
|
||||
unsigned int nr;
|
||||
rsrc_put_fn *do_put;
|
||||
atomic_t refs;
|
||||
struct completion done;
|
||||
bool quiesce;
|
||||
};
|
||||
|
||||
struct io_rsrc_node {
|
||||
struct percpu_ref refs;
|
||||
struct list_head node;
|
||||
struct list_head rsrc_list;
|
||||
struct io_rsrc_data *rsrc_data;
|
||||
struct llist_node llist;
|
||||
bool done;
|
||||
};
|
||||
|
||||
struct io_mapped_ubuf {
|
||||
u64 ubuf;
|
||||
u64 ubuf_end;
|
||||
unsigned int nr_bvecs;
|
||||
unsigned long acct_pages;
|
||||
struct bio_vec bvec[];
|
||||
};
|
||||
|
||||
void io_rsrc_put_work(struct work_struct *work);
|
||||
void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
|
||||
void io_wait_rsrc_data(struct io_rsrc_data *data);
|
||||
void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
|
||||
void io_rsrc_refs_drop(struct io_ring_ctx *ctx);
|
||||
int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
|
||||
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
|
||||
struct io_rsrc_node *node, void *rsrc);
|
||||
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
|
||||
struct io_rsrc_data *data_to_kill);
|
||||
|
||||
int io_import_fixed(int ddir, struct iov_iter *iter,
|
||||
struct io_mapped_ubuf *imu,
|
||||
u64 buf_addr, size_t len);
|
||||
|
||||
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
|
||||
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
|
||||
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
unsigned int nr_args, u64 __user *tags);
|
||||
void __io_sqe_files_unregister(struct io_ring_ctx *ctx);
|
||||
int io_sqe_files_unregister(struct io_ring_ctx *ctx);
|
||||
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
unsigned nr_args, u64 __user *tags);
|
||||
|
||||
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file);
|
||||
|
||||
#if defined(CONFIG_UNIX)
|
||||
static inline bool io_file_need_scm(struct file *filp)
|
||||
{
|
||||
#if defined(IO_URING_SCM_ALL)
|
||||
return true;
|
||||
#else
|
||||
return !!unix_get_socket(filp);
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
static inline bool io_file_need_scm(struct file *filp)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int io_scm_file_account(struct io_ring_ctx *ctx,
|
||||
struct file *file)
|
||||
{
|
||||
if (likely(!io_file_need_scm(file)))
|
||||
return 0;
|
||||
return __io_scm_file_account(ctx, file);
|
||||
}
|
||||
|
||||
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
|
||||
unsigned nr_args);
|
||||
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
|
||||
unsigned size, unsigned type);
|
||||
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
|
||||
unsigned int size, unsigned int type);
|
||||
|
||||
static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
|
||||
{
|
||||
percpu_ref_put_many(&node->refs, nr);
|
||||
}
|
||||
|
||||
static inline void io_req_put_rsrc(struct io_kiocb *req)
|
||||
{
|
||||
if (req->rsrc_node)
|
||||
io_rsrc_put_node(req->rsrc_node, 1);
|
||||
}
|
||||
|
||||
static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
|
||||
struct io_ring_ctx *ctx)
|
||||
__must_hold(&ctx->uring_lock)
|
||||
{
|
||||
struct io_rsrc_node *node = req->rsrc_node;
|
||||
|
||||
if (node) {
|
||||
if (node == ctx->rsrc_node)
|
||||
ctx->rsrc_cached_refs++;
|
||||
else
|
||||
io_rsrc_put_node(node, 1);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void io_req_set_rsrc_node(struct io_kiocb *req,
|
||||
struct io_ring_ctx *ctx,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
if (!req->rsrc_node) {
|
||||
req->rsrc_node = ctx->rsrc_node;
|
||||
|
||||
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
ctx->rsrc_cached_refs--;
|
||||
if (unlikely(ctx->rsrc_cached_refs < 0))
|
||||
io_rsrc_refs_refill(ctx);
|
||||
} else {
|
||||
percpu_ref_get(&req->rsrc_node->refs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
|
||||
{
|
||||
unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
|
||||
unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
|
||||
|
||||
return &data->tags[table_idx][off];
|
||||
}
|
||||
|
||||
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
#endif
|
io_uring/rw.c (new file, 1020 lines; diff suppressed because it is too large)
io_uring/rw.h (new file, 23 lines)
@@ -0,0 +1,23 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/pagemap.h>

struct io_rw_state {
	struct iov_iter iter;
	struct iov_iter_state iter_state;
	struct iovec fast_iov[UIO_FASTIOV];
};

struct io_async_rw {
	struct io_rw_state s;
	const struct iovec *free_iovec;
	size_t bytes_done;
	struct wait_page_queue wpq;
};

int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_read(struct io_kiocb *req, unsigned int issue_flags);
int io_readv_prep_async(struct io_kiocb *req);
int io_write(struct io_kiocb *req, unsigned int issue_flags);
int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);
@ -1,33 +1,7 @@
|
||||
#ifndef INTERNAL_IO_WQ_H
|
||||
#define INTERNAL_IO_WQ_H
|
||||
#ifndef INTERNAL_IO_SLIST_H
|
||||
#define INTERNAL_IO_SLIST_H
|
||||
|
||||
#include <linux/refcount.h>
|
||||
|
||||
struct io_wq;
|
||||
|
||||
enum {
|
||||
IO_WQ_WORK_CANCEL = 1,
|
||||
IO_WQ_WORK_HASHED = 2,
|
||||
IO_WQ_WORK_UNBOUND = 4,
|
||||
IO_WQ_WORK_CONCURRENT = 16,
|
||||
|
||||
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
|
||||
};
|
||||
|
||||
enum io_wq_cancel {
|
||||
IO_WQ_CANCEL_OK, /* cancelled before started */
|
||||
IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */
|
||||
IO_WQ_CANCEL_NOTFOUND, /* work not found */
|
||||
};
|
||||
|
||||
struct io_wq_work_node {
|
||||
struct io_wq_work_node *next;
|
||||
};
|
||||
|
||||
struct io_wq_work_list {
|
||||
struct io_wq_work_node *first;
|
||||
struct io_wq_work_node *last;
|
||||
};
|
||||
#include <linux/io_uring_types.h>
|
||||
|
||||
#define wq_list_for_each(pos, prv, head) \
|
||||
for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
|
||||
@ -36,6 +10,7 @@ struct io_wq_work_list {
|
||||
for (; pos; prv = pos, pos = (pos)->next)
|
||||
|
||||
#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
|
||||
|
||||
#define INIT_WQ_LIST(list) do { \
|
||||
(list)->first = NULL; \
|
||||
} while (0)
|
||||
@ -152,12 +127,6 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
|
||||
return node;
|
||||
}
|
||||
|
||||
struct io_wq_work {
|
||||
struct io_wq_work_node list;
|
||||
unsigned flags;
|
||||
int cancel_seq;
|
||||
};
|
||||
|
||||
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
|
||||
{
|
||||
if (!work->list.next)
|
||||
@ -166,63 +135,4 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
|
||||
return container_of(work->list.next, struct io_wq_work, list);
|
||||
}
|
||||
|
||||
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
|
||||
typedef void (io_wq_work_fn)(struct io_wq_work *);
|
||||
|
||||
struct io_wq_hash {
|
||||
refcount_t refs;
|
||||
unsigned long map;
|
||||
struct wait_queue_head wait;
|
||||
};
|
||||
|
||||
static inline void io_wq_put_hash(struct io_wq_hash *hash)
|
||||
{
|
||||
if (refcount_dec_and_test(&hash->refs))
|
||||
kfree(hash);
|
||||
}
|
||||
|
||||
struct io_wq_data {
|
||||
struct io_wq_hash *hash;
|
||||
struct task_struct *task;
|
||||
io_wq_work_fn *do_work;
|
||||
free_work_fn *free_work;
|
||||
};
|
||||
|
||||
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
|
||||
void io_wq_exit_start(struct io_wq *wq);
|
||||
void io_wq_put_and_exit(struct io_wq *wq);
|
||||
|
||||
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
|
||||
void io_wq_hash_work(struct io_wq_work *work, void *val);
|
||||
|
||||
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
|
||||
int io_wq_max_workers(struct io_wq *wq, int *new_count);
|
||||
|
||||
static inline bool io_wq_is_hashed(struct io_wq_work *work)
|
||||
{
|
||||
return work->flags & IO_WQ_WORK_HASHED;
|
||||
}
|
||||
|
||||
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
|
||||
|
||||
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
|
||||
void *data, bool cancel_all);
|
||||
|
||||
#if defined(CONFIG_IO_WQ)
|
||||
extern void io_wq_worker_sleeping(struct task_struct *);
|
||||
extern void io_wq_worker_running(struct task_struct *);
|
||||
#else
|
||||
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
|
||||
{
|
||||
}
|
||||
static inline void io_wq_worker_running(struct task_struct *tsk)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline bool io_wq_current_is_worker(void)
|
||||
{
|
||||
return in_task() && (current->flags & PF_IO_WORKER) &&
|
||||
current->worker_private;
|
||||
}
|
||||
#endif
|
||||
#endif // INTERNAL_IO_SLIST_H
|
io_uring/splice.c (new file, 122 lines)
@@ -0,0 +1,122 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/io_uring.h>
|
||||
#include <linux/splice.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "splice.h"
|
||||
|
||||
struct io_splice {
|
||||
struct file *file_out;
|
||||
loff_t off_out;
|
||||
loff_t off_in;
|
||||
u64 len;
|
||||
int splice_fd_in;
|
||||
unsigned int flags;
|
||||
};
|
||||
|
||||
static int __io_splice_prep(struct io_kiocb *req,
|
||||
const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_splice *sp = io_kiocb_to_cmd(req);
|
||||
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
|
||||
|
||||
sp->len = READ_ONCE(sqe->len);
|
||||
sp->flags = READ_ONCE(sqe->splice_flags);
|
||||
if (unlikely(sp->flags & ~valid_flags))
|
||||
return -EINVAL;
|
||||
sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
|
||||
return -EINVAL;
|
||||
return __io_splice_prep(req, sqe);
|
||||
}
|
||||
|
||||
int io_tee(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_splice *sp = io_kiocb_to_cmd(req);
|
||||
struct file *out = sp->file_out;
|
||||
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
|
||||
struct file *in;
|
||||
long ret = 0;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
if (sp->flags & SPLICE_F_FD_IN_FIXED)
|
||||
in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
|
||||
else
|
||||
in = io_file_get_normal(req, sp->splice_fd_in);
|
||||
if (!in) {
|
||||
ret = -EBADF;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (sp->len)
|
||||
ret = do_tee(in, out, sp->len, flags);
|
||||
|
||||
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
|
||||
io_put_file(in);
|
||||
done:
|
||||
if (ret != sp->len)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_splice *sp = io_kiocb_to_cmd(req);
|
||||
|
||||
sp->off_in = READ_ONCE(sqe->splice_off_in);
|
||||
sp->off_out = READ_ONCE(sqe->off);
|
||||
return __io_splice_prep(req, sqe);
|
||||
}
|
||||
|
||||
int io_splice(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_splice *sp = io_kiocb_to_cmd(req);
|
||||
struct file *out = sp->file_out;
|
||||
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
|
||||
loff_t *poff_in, *poff_out;
|
||||
struct file *in;
|
||||
long ret = 0;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
if (sp->flags & SPLICE_F_FD_IN_FIXED)
|
||||
in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
|
||||
else
|
||||
in = io_file_get_normal(req, sp->splice_fd_in);
|
||||
if (!in) {
|
||||
ret = -EBADF;
|
||||
goto done;
|
||||
}
|
||||
|
||||
poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
|
||||
poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
|
||||
|
||||
if (sp->len)
|
||||
ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
|
||||
|
||||
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
|
||||
io_put_file(in);
|
||||
done:
|
||||
if (ret != sp->len)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
io_uring/splice.h (new file, 7 lines)
@@ -0,0 +1,7 @@
// SPDX-License-Identifier: GPL-2.0

int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_tee(struct io_kiocb *req, unsigned int issue_flags);

int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_splice(struct io_kiocb *req, unsigned int issue_flags);
io_uring/sqpoll.c (new file, 425 lines)
@@ -0,0 +1,425 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Contains the core associated with submission side polling of the SQ
|
||||
* ring, offloading submissions from the application to a kernel thread.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/audit.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "sqpoll.h"
|
||||
|
||||
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
|
||||
|
||||
enum {
|
||||
IO_SQ_THREAD_SHOULD_STOP = 0,
|
||||
IO_SQ_THREAD_SHOULD_PARK,
|
||||
};
|
||||
|
||||
void io_sq_thread_unpark(struct io_sq_data *sqd)
|
||||
__releases(&sqd->lock)
|
||||
{
|
||||
WARN_ON_ONCE(sqd->thread == current);
|
||||
|
||||
/*
|
||||
* Do the dance but not conditional clear_bit() because it'd race with
|
||||
* other threads incrementing park_pending and setting the bit.
|
||||
*/
|
||||
clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
|
||||
if (atomic_dec_return(&sqd->park_pending))
|
||||
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
|
||||
mutex_unlock(&sqd->lock);
|
||||
}
|
||||
|
||||
void io_sq_thread_park(struct io_sq_data *sqd)
|
||||
__acquires(&sqd->lock)
|
||||
{
|
||||
WARN_ON_ONCE(sqd->thread == current);
|
||||
|
||||
atomic_inc(&sqd->park_pending);
|
||||
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
|
||||
mutex_lock(&sqd->lock);
|
||||
if (sqd->thread)
|
||||
wake_up_process(sqd->thread);
|
||||
}
|
||||
|
||||
void io_sq_thread_stop(struct io_sq_data *sqd)
|
||||
{
|
||||
WARN_ON_ONCE(sqd->thread == current);
|
||||
WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
|
||||
|
||||
set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
|
||||
mutex_lock(&sqd->lock);
|
||||
if (sqd->thread)
|
||||
wake_up_process(sqd->thread);
|
||||
mutex_unlock(&sqd->lock);
|
||||
wait_for_completion(&sqd->exited);
|
||||
}
|
||||
|
||||
void io_put_sq_data(struct io_sq_data *sqd)
|
||||
{
|
||||
if (refcount_dec_and_test(&sqd->refs)) {
|
||||
WARN_ON_ONCE(atomic_read(&sqd->park_pending));
|
||||
|
||||
io_sq_thread_stop(sqd);
|
||||
kfree(sqd);
|
||||
}
|
||||
}
|
||||
|
||||
static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
|
||||
{
|
||||
struct io_ring_ctx *ctx;
|
||||
unsigned sq_thread_idle = 0;
|
||||
|
||||
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
|
||||
sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
|
||||
sqd->sq_thread_idle = sq_thread_idle;
|
||||
}
|
||||
|
||||
void io_sq_thread_finish(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_sq_data *sqd = ctx->sq_data;
|
||||
|
||||
if (sqd) {
|
||||
io_sq_thread_park(sqd);
|
||||
list_del_init(&ctx->sqd_list);
|
||||
io_sqd_update_thread_idle(sqd);
|
||||
io_sq_thread_unpark(sqd);
|
||||
|
||||
io_put_sq_data(sqd);
|
||||
ctx->sq_data = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
|
||||
{
|
||||
struct io_ring_ctx *ctx_attach;
|
||||
struct io_sq_data *sqd;
|
||||
struct fd f;
|
||||
|
||||
f = fdget(p->wq_fd);
|
||||
if (!f.file)
|
||||
return ERR_PTR(-ENXIO);
|
||||
if (!io_is_uring_fops(f.file)) {
|
||||
fdput(f);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
ctx_attach = f.file->private_data;
|
||||
sqd = ctx_attach->sq_data;
|
||||
if (!sqd) {
|
||||
fdput(f);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
if (sqd->task_tgid != current->tgid) {
|
||||
fdput(f);
|
||||
return ERR_PTR(-EPERM);
|
||||
}
|
||||
|
||||
refcount_inc(&sqd->refs);
|
||||
fdput(f);
|
||||
return sqd;
|
||||
}
|
||||
|
||||
static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
|
||||
bool *attached)
|
||||
{
|
||||
struct io_sq_data *sqd;
|
||||
|
||||
*attached = false;
|
||||
if (p->flags & IORING_SETUP_ATTACH_WQ) {
|
||||
sqd = io_attach_sq_data(p);
|
||||
if (!IS_ERR(sqd)) {
|
||||
*attached = true;
|
||||
return sqd;
|
||||
}
|
||||
/* fall through for EPERM case, setup new sqd/task */
|
||||
if (PTR_ERR(sqd) != -EPERM)
|
||||
return sqd;
|
||||
}
|
||||
|
||||
sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
|
||||
if (!sqd)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
atomic_set(&sqd->park_pending, 0);
|
||||
refcount_set(&sqd->refs, 1);
|
||||
INIT_LIST_HEAD(&sqd->ctx_list);
|
||||
mutex_init(&sqd->lock);
|
||||
init_waitqueue_head(&sqd->wait);
|
||||
init_completion(&sqd->exited);
|
||||
return sqd;
|
||||
}
|
||||
|
||||
static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
|
||||
{
|
||||
return READ_ONCE(sqd->state);
|
||||
}
|
||||
|
||||
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
|
||||
{
|
||||
unsigned int to_submit;
|
||||
int ret = 0;
|
||||
|
||||
to_submit = io_sqring_entries(ctx);
|
||||
/* if we're handling multiple rings, cap submit size for fairness */
|
||||
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
|
||||
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
|
||||
|
||||
if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
|
||||
const struct cred *creds = NULL;
|
||||
|
||||
if (ctx->sq_creds != current_cred())
|
||||
creds = override_creds(ctx->sq_creds);
|
||||
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
if (!wq_list_empty(&ctx->iopoll_list))
|
||||
io_do_iopoll(ctx, true);
|
||||
|
||||
/*
|
||||
* Don't submit if refs are dying, good for io_uring_register(),
|
||||
* but also it is relied upon by io_ring_exit_work()
|
||||
*/
|
||||
if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
|
||||
!(ctx->flags & IORING_SETUP_R_DISABLED))
|
||||
ret = io_submit_sqes(ctx, to_submit);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
|
||||
if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
|
||||
wake_up(&ctx->sqo_sq_wait);
|
||||
if (creds)
|
||||
revert_creds(creds);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool io_sqd_handle_event(struct io_sq_data *sqd)
|
||||
{
|
||||
bool did_sig = false;
|
||||
struct ksignal ksig;
|
||||
|
||||
if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
|
||||
signal_pending(current)) {
|
||||
mutex_unlock(&sqd->lock);
|
||||
if (signal_pending(current))
|
||||
did_sig = get_signal(&ksig);
|
||||
cond_resched();
|
||||
mutex_lock(&sqd->lock);
|
||||
}
|
||||
return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
|
||||
}
|
||||
|
||||
static int io_sq_thread(void *data)
|
||||
{
|
||||
struct io_sq_data *sqd = data;
|
||||
struct io_ring_ctx *ctx;
|
||||
unsigned long timeout = 0;
|
||||
char buf[TASK_COMM_LEN];
|
||||
DEFINE_WAIT(wait);
|
||||
|
||||
snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
|
||||
set_task_comm(current, buf);
|
||||
|
||||
if (sqd->sq_cpu != -1)
|
||||
set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
|
||||
else
|
||||
set_cpus_allowed_ptr(current, cpu_online_mask);
|
||||
current->flags |= PF_NO_SETAFFINITY;
|
||||
|
||||
audit_alloc_kernel(current);
|
||||
|
||||
mutex_lock(&sqd->lock);
|
||||
while (1) {
|
||||
bool cap_entries, sqt_spin = false;
|
||||
|
||||
if (io_sqd_events_pending(sqd) || signal_pending(current)) {
|
||||
if (io_sqd_handle_event(sqd))
|
||||
break;
|
||||
timeout = jiffies + sqd->sq_thread_idle;
|
||||
}
|
||||
|
||||
cap_entries = !list_is_singular(&sqd->ctx_list);
|
||||
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
|
||||
int ret = __io_sq_thread(ctx, cap_entries);
|
||||
|
||||
if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
|
||||
sqt_spin = true;
|
||||
}
|
||||
if (io_run_task_work())
|
||||
sqt_spin = true;
|
||||
|
||||
if (sqt_spin || !time_after(jiffies, timeout)) {
|
||||
cond_resched();
|
||||
if (sqt_spin)
|
||||
timeout = jiffies + sqd->sq_thread_idle;
|
||||
continue;
|
||||
}
|
||||
|
||||
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
|
||||
if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
|
||||
bool needs_sched = true;
|
||||
|
||||
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
|
||||
atomic_or(IORING_SQ_NEED_WAKEUP,
|
||||
&ctx->rings->sq_flags);
|
||||
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
|
||||
!wq_list_empty(&ctx->iopoll_list)) {
|
||||
needs_sched = false;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure the store of the wakeup flag is not
|
||||
* reordered with the load of the SQ tail
|
||||
*/
|
||||
smp_mb__after_atomic();
|
||||
|
||||
if (io_sqring_entries(ctx)) {
|
||||
needs_sched = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (needs_sched) {
|
||||
mutex_unlock(&sqd->lock);
|
||||
schedule();
|
||||
mutex_lock(&sqd->lock);
|
||||
}
|
||||
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
|
||||
atomic_andnot(IORING_SQ_NEED_WAKEUP,
|
||||
&ctx->rings->sq_flags);
|
||||
}
|
||||
|
||||
finish_wait(&sqd->wait, &wait);
|
||||
timeout = jiffies + sqd->sq_thread_idle;
|
||||
}
|
||||
|
||||
io_uring_cancel_generic(true, sqd);
|
||||
sqd->thread = NULL;
|
||||
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
|
||||
atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
|
||||
io_run_task_work();
|
||||
mutex_unlock(&sqd->lock);
|
||||
|
||||
audit_free(current);
|
||||
|
||||
complete(&sqd->exited);
|
||||
do_exit(0);
|
||||
}
|
||||
|
||||
int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
|
||||
{
|
||||
DEFINE_WAIT(wait);
|
||||
|
||||
do {
|
||||
if (!io_sqring_full(ctx))
|
||||
break;
|
||||
prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
|
||||
|
||||
if (!io_sqring_full(ctx))
|
||||
break;
|
||||
schedule();
|
||||
} while (!signal_pending(current));
|
||||
|
||||
finish_wait(&ctx->sqo_sq_wait, &wait);
|
||||
return 0;
|
||||
}
|
||||
|
||||
__cold int io_sq_offload_create(struct io_ring_ctx *ctx,
|
||||
struct io_uring_params *p)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* Retain compatibility with failing for an invalid attach attempt */
|
||||
if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
|
||||
IORING_SETUP_ATTACH_WQ) {
|
||||
struct fd f;
|
||||
|
||||
f = fdget(p->wq_fd);
|
||||
if (!f.file)
|
||||
return -ENXIO;
|
||||
if (!io_is_uring_fops(f.file)) {
|
||||
fdput(f);
|
||||
return -EINVAL;
|
||||
}
|
||||
fdput(f);
|
||||
}
|
||||
if (ctx->flags & IORING_SETUP_SQPOLL) {
|
||||
struct task_struct *tsk;
|
||||
struct io_sq_data *sqd;
|
||||
bool attached;
|
||||
|
||||
ret = security_uring_sqpoll();
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
sqd = io_get_sq_data(p, &attached);
|
||||
if (IS_ERR(sqd)) {
|
||||
ret = PTR_ERR(sqd);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ctx->sq_creds = get_current_cred();
|
||||
ctx->sq_data = sqd;
|
||||
ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
|
||||
if (!ctx->sq_thread_idle)
|
||||
ctx->sq_thread_idle = HZ;
|
||||
|
||||
io_sq_thread_park(sqd);
|
||||
list_add(&ctx->sqd_list, &sqd->ctx_list);
|
||||
io_sqd_update_thread_idle(sqd);
|
||||
/* don't attach to a dying SQPOLL thread, would be racy */
|
||||
ret = (attached && !sqd->thread) ? -ENXIO : 0;
|
||||
io_sq_thread_unpark(sqd);
|
||||
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
if (attached)
|
||||
return 0;
|
||||
|
||||
if (p->flags & IORING_SETUP_SQ_AFF) {
|
||||
int cpu = p->sq_thread_cpu;
|
||||
|
||||
ret = -EINVAL;
|
||||
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
|
||||
goto err_sqpoll;
|
||||
sqd->sq_cpu = cpu;
|
||||
} else {
|
||||
sqd->sq_cpu = -1;
|
||||
}
|
||||
|
||||
sqd->task_pid = current->pid;
|
||||
sqd->task_tgid = current->tgid;
|
||||
tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
|
||||
if (IS_ERR(tsk)) {
|
||||
ret = PTR_ERR(tsk);
|
||||
goto err_sqpoll;
|
||||
}
|
||||
|
||||
sqd->thread = tsk;
|
||||
ret = io_uring_alloc_task_context(tsk, ctx);
|
||||
wake_up_new_task(tsk);
|
||||
if (ret)
|
||||
goto err;
|
||||
} else if (p->flags & IORING_SETUP_SQ_AFF) {
|
||||
/* Can't have SQ_AFF without SQPOLL */
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
return 0;
|
||||
err_sqpoll:
|
||||
complete(&ctx->sq_data->exited);
|
||||
err:
|
||||
io_sq_thread_finish(ctx);
|
||||
return ret;
|
||||
}
|
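
io_sq_thread() above only parks itself once sq_thread_idle expires, setting IORING_SQ_NEED_WAKEUP in the shared SQ ring flags so userspace knows a kick is required. A hedged userspace-side sketch of that contract (raw syscall, constants from the uapi header; liburing's io_uring_submit() performs the same check internally), assuming the SQ flags word has already been mmap'ed via io_sqring_offsets.flags:

/*
 * Illustrative only: submitting against an SQPOLL ring.  'sq_flags' points
 * at the mmap'ed flags word; the kernel side sets IORING_SQ_NEED_WAKEUP in
 * io_sq_thread() above when it goes idle.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <stdatomic.h>
#include <linux/io_uring.h>

static void sqpoll_submit(int ring_fd, const _Atomic unsigned int *sq_flags,
			  unsigned int to_submit)
{
	unsigned int enter_flags = 0;

	if (atomic_load_explicit(sq_flags, memory_order_acquire) &
	    IORING_SQ_NEED_WAKEUP)
		enter_flags |= IORING_ENTER_SQ_WAKEUP;

	/* only a wakeup needs a syscall; otherwise the SQ thread picks the
	 * new tail up on its own */
	if (enter_flags)
		syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
			enter_flags, NULL);
}
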
io_uring/sqpoll.h (new file, 29 lines)
@@ -0,0 +1,29 @@
// SPDX-License-Identifier: GPL-2.0

struct io_sq_data {
	refcount_t refs;
	atomic_t park_pending;
	struct mutex lock;

	/* ctx's that are using this sqd */
	struct list_head ctx_list;

	struct task_struct *thread;
	struct wait_queue_head wait;

	unsigned sq_thread_idle;
	int sq_cpu;
	pid_t task_pid;
	pid_t task_tgid;

	unsigned long state;
	struct completion exited;
};

int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p);
void io_sq_thread_finish(struct io_ring_ctx *ctx);
void io_sq_thread_stop(struct io_sq_data *sqd);
void io_sq_thread_park(struct io_sq_data *sqd);
void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd);
int io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
io_uring/statx.c (new file, 73 lines)
@@ -0,0 +1,73 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "../fs/internal.h"

#include "io_uring.h"
#include "statx.h"

struct io_statx {
	struct file *file;
	int dfd;
	unsigned int mask;
	unsigned int flags;
	struct filename *filename;
	struct statx __user *buffer;
};

int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_statx *sx = io_kiocb_to_cmd(req);
	const char __user *path;

	if (sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;
	if (req->flags & REQ_F_FIXED_FILE)
		return -EBADF;

	sx->dfd = READ_ONCE(sqe->fd);
	sx->mask = READ_ONCE(sqe->len);
	path = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	sx->flags = READ_ONCE(sqe->statx_flags);

	sx->filename = getname_flags(path,
				     getname_statx_lookup_flags(sx->flags),
				     NULL);

	if (IS_ERR(sx->filename)) {
		int ret = PTR_ERR(sx->filename);

		sx->filename = NULL;
		return ret;
	}

	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

int io_statx(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_statx *sx = io_kiocb_to_cmd(req);
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

void io_statx_cleanup(struct io_kiocb *req)
{
	struct io_statx *sx = io_kiocb_to_cmd(req);

	if (sx->filename)
		putname(sx->filename);
}
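
For reference, io_statx_prep() above defines the SQE layout for this opcode: fd carries the directory fd, addr the path pointer, len the statx mask, addr2 the result buffer and statx_flags the AT_* flags. A hedged sketch of filling such an SQE by hand (liburing's io_uring_prep_statx() wraps the same assignments); a zeroed SQE is assumed:

/*
 * Illustrative only: raw SQE setup mirroring the fields read by
 * io_statx_prep() above.  Field names are from the uapi io_uring header.
 */
#include <linux/io_uring.h>
#include <linux/stat.h>

static void prep_statx_sqe(struct io_uring_sqe *sqe, int dfd, const char *path,
			   int flags, unsigned int mask, struct statx *stx)
{
	sqe->opcode = IORING_OP_STATX;
	sqe->fd = dfd;				/* -> sx->dfd */
	sqe->addr = (unsigned long)path;	/* -> getname_flags() path */
	sqe->len = mask;			/* -> sx->mask */
	sqe->addr2 = (unsigned long)stx;	/* -> sx->buffer */
	sqe->statx_flags = flags;		/* -> sx->flags */
}
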
io_uring/statx.h (new file, 5 lines)
@@ -0,0 +1,5 @@
// SPDX-License-Identifier: GPL-2.0

int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_statx(struct io_kiocb *req, unsigned int issue_flags);
void io_statx_cleanup(struct io_kiocb *req);
io_uring/sync.c (new file, 110 lines)
@@ -0,0 +1,110 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/io_uring.h>
#include <linux/fsnotify.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "sync.h"

struct io_sync {
	struct file *file;
	loff_t len;
	loff_t off;
	int flags;
	int mode;
};

int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sync *sync = io_kiocb_to_cmd(req);

	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	sync->off = READ_ONCE(sqe->off);
	sync->len = READ_ONCE(sqe->len);
	sync->flags = READ_ONCE(sqe->sync_range_flags);
	return 0;
}

int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sync *sync = io_kiocb_to_cmd(req);
	int ret;

	/* sync_file_range always requires a blocking context */
	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = sync_file_range(req->file, sync->off, sync->len, sync->flags);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sync *sync = io_kiocb_to_cmd(req);

	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	sync->flags = READ_ONCE(sqe->fsync_flags);
	if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC))
		return -EINVAL;

	sync->off = READ_ONCE(sqe->off);
	sync->len = READ_ONCE(sqe->len);
	return 0;
}

int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sync *sync = io_kiocb_to_cmd(req);
	loff_t end = sync->off + sync->len;
	int ret;

	/* fsync always requires a blocking context */
	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX,
			      sync->flags & IORING_FSYNC_DATASYNC);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sync *sync = io_kiocb_to_cmd(req);

	if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	sync->off = READ_ONCE(sqe->off);
	sync->len = READ_ONCE(sqe->addr);
	sync->mode = READ_ONCE(sqe->len);
	return 0;
}

int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sync *sync = io_kiocb_to_cmd(req);
	int ret;

	/* fallocate always requires a blocking context */
	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;
	ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len);
	if (ret >= 0)
		fsnotify_modify(req->file);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
io_uring/sync.h (new file, 10 lines)
@@ -0,0 +1,10 @@
// SPDX-License-Identifier: GPL-2.0

int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags);

int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_fsync(struct io_kiocb *req, unsigned int issue_flags);

int io_fallocate(struct io_kiocb *req, unsigned int issue_flags);
int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
io_uring/tctx.c (new file, 340 lines)
@@ -0,0 +1,340 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/nospec.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "tctx.h"
|
||||
|
||||
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
|
||||
struct task_struct *task)
|
||||
{
|
||||
struct io_wq_hash *hash;
|
||||
struct io_wq_data data;
|
||||
unsigned int concurrency;
|
||||
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
hash = ctx->hash_map;
|
||||
if (!hash) {
|
||||
hash = kzalloc(sizeof(*hash), GFP_KERNEL);
|
||||
if (!hash) {
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
refcount_set(&hash->refs, 1);
|
||||
init_waitqueue_head(&hash->wait);
|
||||
ctx->hash_map = hash;
|
||||
}
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
|
||||
data.hash = hash;
|
||||
data.task = task;
|
||||
data.free_work = io_wq_free_work;
|
||||
data.do_work = io_wq_submit_work;
|
||||
|
||||
	/* Do QD, or 4 * CPUS, whatever is smallest */
	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());

	return io_wq_create(concurrency, &data);
}

void __io_uring_free(struct task_struct *tsk)
{
	struct io_uring_task *tctx = tsk->io_uring;

	WARN_ON_ONCE(!xa_empty(&tctx->xa));
	WARN_ON_ONCE(tctx->io_wq);
	WARN_ON_ONCE(tctx->cached_refs);

	percpu_counter_destroy(&tctx->inflight);
	kfree(tctx);
	tsk->io_uring = NULL;
}

__cold int io_uring_alloc_task_context(struct task_struct *task,
				       struct io_ring_ctx *ctx)
{
	struct io_uring_task *tctx;
	int ret;

	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
	if (unlikely(!tctx))
		return -ENOMEM;

	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
	if (unlikely(ret)) {
		kfree(tctx);
		return ret;
	}

	tctx->io_wq = io_init_wq_offload(ctx, task);
	if (IS_ERR(tctx->io_wq)) {
		ret = PTR_ERR(tctx->io_wq);
		percpu_counter_destroy(&tctx->inflight);
		kfree(tctx);
		return ret;
	}

	xa_init(&tctx->xa);
	init_waitqueue_head(&tctx->wait);
	atomic_set(&tctx->in_idle, 0);
	atomic_set(&tctx->inflight_tracked, 0);
	task->io_uring = tctx;
	init_llist_head(&tctx->task_list);
	init_task_work(&tctx->task_work, tctx_task_work);
	return 0;
}

static int io_register_submitter(struct io_ring_ctx *ctx)
{
	int ret = 0;

	mutex_lock(&ctx->uring_lock);
	if (!ctx->submitter_task)
		ctx->submitter_task = get_task_struct(current);
	else if (ctx->submitter_task != current)
		ret = -EEXIST;
	mutex_unlock(&ctx->uring_lock);

	return ret;
}

int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_node *node;
	int ret;

	if ((ctx->flags & IORING_SETUP_SINGLE_ISSUER) && submitter) {
		ret = io_register_submitter(ctx);
		if (ret)
			return ret;
	}

	if (unlikely(!tctx)) {
		ret = io_uring_alloc_task_context(current, ctx);
		if (unlikely(ret))
			return ret;

		tctx = current->io_uring;
		if (ctx->iowq_limits_set) {
			unsigned int limits[2] = { ctx->iowq_limits[0],
						   ctx->iowq_limits[1], };

			ret = io_wq_max_workers(tctx->io_wq, limits);
			if (ret)
				return ret;
		}
	}
	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
		node = kmalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;
		node->ctx = ctx;
		node->task = current;

		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
					node, GFP_KERNEL));
		if (ret) {
			kfree(node);
			return ret;
		}

		mutex_lock(&ctx->uring_lock);
		list_add(&node->ctx_node, &ctx->tctx_list);
		mutex_unlock(&ctx->uring_lock);
	}
	if (submitter)
		tctx->last = ctx;
	return 0;
}

/*
 * Remove this io_uring_file -> task mapping.
 */
__cold void io_uring_del_tctx_node(unsigned long index)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_node *node;

	if (!tctx)
		return;
	node = xa_erase(&tctx->xa, index);
	if (!node)
		return;

	WARN_ON_ONCE(current != node->task);
	WARN_ON_ONCE(list_empty(&node->ctx_node));

	mutex_lock(&node->ctx->uring_lock);
	list_del(&node->ctx_node);
	mutex_unlock(&node->ctx->uring_lock);

	if (tctx->last == node->ctx)
		tctx->last = NULL;
	kfree(node);
}

__cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
	struct io_wq *wq = tctx->io_wq;
	struct io_tctx_node *node;
	unsigned long index;

	xa_for_each(&tctx->xa, index, node) {
		io_uring_del_tctx_node(index);
		cond_resched();
	}
	if (wq) {
		/*
		 * Must be after io_uring_del_tctx_node() (removes nodes under
		 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
		 */
		io_wq_put_and_exit(wq);
		tctx->io_wq = NULL;
	}
}

void io_uring_unreg_ringfd(void)
{
	struct io_uring_task *tctx = current->io_uring;
	int i;

	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
		if (tctx->registered_rings[i]) {
			fput(tctx->registered_rings[i]);
			tctx->registered_rings[i] = NULL;
		}
	}
}

static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
				     int start, int end)
{
	struct file *file;
	int offset;

	for (offset = start; offset < end; offset++) {
		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[offset])
			continue;

		file = fget(fd);
		if (!file) {
			return -EBADF;
		} else if (!io_is_uring_fops(file)) {
			fput(file);
			return -EOPNOTSUPP;
		}
		tctx->registered_rings[offset] = file;
		return offset;
	}

	return -EBUSY;
}

/*
 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
 * invocation. User passes in an array of struct io_uring_rsrc_update
 * with ->data set to the ring_fd, and ->offset given for the desired
 * index. If no index is desired, application may set ->offset == -1U
 * and we'll find an available index. Returns number of entries
 * successfully processed, or < 0 on error if none were processed.
 */
int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
		       unsigned nr_args)
{
	struct io_uring_rsrc_update __user *arg = __arg;
	struct io_uring_rsrc_update reg;
	struct io_uring_task *tctx;
	int ret, i;

	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
		return -EINVAL;

	mutex_unlock(&ctx->uring_lock);
	ret = __io_uring_add_tctx_node(ctx, false);
	mutex_lock(&ctx->uring_lock);
	if (ret)
		return ret;

	tctx = current->io_uring;
	for (i = 0; i < nr_args; i++) {
		int start, end;

		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
			ret = -EFAULT;
			break;
		}

		if (reg.resv) {
			ret = -EINVAL;
			break;
		}

		if (reg.offset == -1U) {
			start = 0;
			end = IO_RINGFD_REG_MAX;
		} else {
			if (reg.offset >= IO_RINGFD_REG_MAX) {
				ret = -EINVAL;
				break;
			}
			start = reg.offset;
			end = start + 1;
		}

		ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
		if (ret < 0)
			break;

		reg.offset = ret;
		if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;
			ret = -EFAULT;
			break;
		}
	}

	return i ? i : ret;
}

int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
			 unsigned nr_args)
{
	struct io_uring_rsrc_update __user *arg = __arg;
	struct io_uring_task *tctx = current->io_uring;
	struct io_uring_rsrc_update reg;
	int ret = 0, i;

	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
		return -EINVAL;
	if (!tctx)
		return 0;

	for (i = 0; i < nr_args; i++) {
		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
			ret = -EFAULT;
			break;
		}
		if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
			ret = -EINVAL;
			break;
		}

		reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[reg.offset]) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;
		}
	}

	return i ? i : ret;
}
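io_ringfd_register() above is driven from userspace through io_uring_register(2) with the IORING_REGISTER_RING_FDS opcode and an array of struct io_uring_rsrc_update. The snippet below is an illustrative sketch only (raw syscall instead of the liburing helper, error handling trimmed); it assumes uapi headers new enough to define that opcode.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Illustrative: register ring_fd so later io_uring_enter() calls can skip fdget/fdput. */
static int register_ring_fd(int ring_fd)
{
	struct io_uring_rsrc_update reg;

	memset(&reg, 0, sizeof(reg));
	reg.offset = -1U;	/* -1U: let the kernel pick any free slot */
	reg.data = ring_fd;	/* ->data carries the ring fd to register */

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RING_FDS, &reg, 1) < 0)
		return -1;

	/* on success the kernel wrote the chosen index back into ->offset */
	return (int)reg.offset;
}

The returned index is then passed to io_uring_enter() in place of the real fd, together with the IORING_ENTER_REGISTERED_RING flag.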
io_uring/tctx.h (new file, 57 lines)
@@ -0,0 +1,57 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/llist.h>

/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
	/* submission side */
	int				cached_refs;
	const struct io_ring_ctx	*last;
	struct io_wq			*io_wq;
	struct file			*registered_rings[IO_RINGFD_REG_MAX];

	struct xarray			xa;
	struct wait_queue_head		wait;
	atomic_t			in_idle;
	atomic_t			inflight_tracked;
	struct percpu_counter		inflight;

	struct { /* task_work */
		struct llist_head	task_list;
		struct callback_head	task_work;
	} ____cacheline_aligned_in_smp;
};

struct io_tctx_node {
	struct list_head	ctx_node;
	struct task_struct	*task;
	struct io_ring_ctx	*ctx;
};

int io_uring_alloc_task_context(struct task_struct *task,
				struct io_ring_ctx *ctx);
void io_uring_del_tctx_node(unsigned long index);
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter);
void io_uring_clean_tctx(struct io_uring_task *tctx);

void io_uring_unreg_ringfd(void);
int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
		       unsigned nr_args);
int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
			 unsigned nr_args);

/*
 * Note that this task has used io_uring. We use it for cancelation purposes.
 */
static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
	struct io_uring_task *tctx = current->io_uring;

	if (likely(tctx && tctx->last == ctx))
		return 0;
	return __io_uring_add_tctx_node(ctx, true);
}
io_uring/timeout.c (new file, 644 lines)
@@ -0,0 +1,644 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <trace/events/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "refs.h"
|
||||
#include "cancel.h"
|
||||
#include "timeout.h"
|
||||
|
||||
struct io_timeout {
|
||||
struct file *file;
|
||||
u32 off;
|
||||
u32 target_seq;
|
||||
struct list_head list;
|
||||
/* head of the link, used by linked timeouts only */
|
||||
struct io_kiocb *head;
|
||||
/* for linked completions */
|
||||
struct io_kiocb *prev;
|
||||
};
|
||||
|
||||
struct io_timeout_rem {
|
||||
struct file *file;
|
||||
u64 addr;
|
||||
|
||||
/* timeout update */
|
||||
struct timespec64 ts;
|
||||
u32 flags;
|
||||
bool ltimeout;
|
||||
};
|
||||
|
||||
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
|
||||
{
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(req);
|
||||
|
||||
return !timeout->off;
|
||||
}
|
||||
|
||||
static inline void io_put_req(struct io_kiocb *req)
|
||||
{
|
||||
if (req_ref_put_and_test(req)) {
|
||||
io_queue_next(req);
|
||||
io_free_req(req);
|
||||
}
|
||||
}
|
||||
|
||||
static bool io_kill_timeout(struct io_kiocb *req, int status)
|
||||
__must_hold(&req->ctx->completion_lock)
|
||||
__must_hold(&req->ctx->timeout_lock)
|
||||
{
|
||||
struct io_timeout_data *io = req->async_data;
|
||||
|
||||
if (hrtimer_try_to_cancel(&io->timer) != -1) {
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(req);
|
||||
|
||||
if (status)
|
||||
req_set_fail(req);
|
||||
atomic_set(&req->ctx->cq_timeouts,
|
||||
atomic_read(&req->ctx->cq_timeouts) + 1);
|
||||
list_del_init(&timeout->list);
|
||||
io_req_tw_post_queue(req, status, 0);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
__cold void io_flush_timeouts(struct io_ring_ctx *ctx)
|
||||
__must_hold(&ctx->completion_lock)
|
||||
{
|
||||
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
|
||||
struct io_timeout *timeout, *tmp;
|
||||
|
||||
spin_lock_irq(&ctx->timeout_lock);
|
||||
list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) {
|
||||
struct io_kiocb *req = cmd_to_io_kiocb(timeout);
|
||||
u32 events_needed, events_got;
|
||||
|
||||
if (io_is_timeout_noseq(req))
|
||||
break;
|
||||
|
||||
/*
|
||||
* Since seq can easily wrap around over time, subtract
|
||||
* the last seq at which timeouts were flushed before comparing.
|
||||
* Assuming not more than 2^31-1 events have happened since,
|
||||
* these subtractions won't have wrapped, so we can check if
|
||||
* target is in [last_seq, current_seq] by comparing the two.
|
||||
*/
|
||||
events_needed = timeout->target_seq - ctx->cq_last_tm_flush;
|
||||
events_got = seq - ctx->cq_last_tm_flush;
|
||||
if (events_got < events_needed)
|
||||
break;
|
||||
|
||||
io_kill_timeout(req, 0);
|
||||
}
|
||||
ctx->cq_last_tm_flush = seq;
|
||||
spin_unlock_irq(&ctx->timeout_lock);
|
||||
}
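As a footnote to the wrap-around comment in io_flush_timeouts() above, the check is plain unsigned distance arithmetic measured from the last flush point. The helper below is illustration only, not code from this series:

#include <linux/types.h>

/* Illustrative: mirrors the events_needed/events_got comparison above. */
static inline bool timeout_seq_expired(u32 target_seq, u32 last_flush, u32 cur_seq)
{
	u32 events_needed = target_seq - last_flush;	/* CQEs still required */
	u32 events_got = cur_seq - last_flush;		/* CQEs posted since last flush */

	/* holds even if the raw counters wrapped, as long as fewer than 2^31-1 events passed */
	return events_got >= events_needed;
}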
|
||||
|
||||
static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked)
|
||||
{
|
||||
io_tw_lock(link->ctx, locked);
|
||||
while (link) {
|
||||
struct io_kiocb *nxt = link->link;
|
||||
long res = -ECANCELED;
|
||||
|
||||
if (link->flags & REQ_F_FAIL)
|
||||
res = link->cqe.res;
|
||||
link->link = NULL;
|
||||
io_req_set_res(link, res, 0);
|
||||
io_req_task_complete(link, locked);
|
||||
link = nxt;
|
||||
}
|
||||
}
|
||||
|
||||
static void io_fail_links(struct io_kiocb *req)
|
||||
__must_hold(&req->ctx->completion_lock)
|
||||
{
|
||||
struct io_kiocb *link = req->link;
|
||||
bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
|
||||
|
||||
if (!link)
|
||||
return;
|
||||
|
||||
while (link) {
|
||||
if (ignore_cqes)
|
||||
link->flags |= REQ_F_CQE_SKIP;
|
||||
else
|
||||
link->flags &= ~REQ_F_CQE_SKIP;
|
||||
trace_io_uring_fail_link(req, link);
|
||||
link = link->link;
|
||||
}
|
||||
|
||||
link = req->link;
|
||||
link->io_task_work.func = io_req_tw_fail_links;
|
||||
io_req_task_work_add(link);
|
||||
req->link = NULL;
|
||||
}
|
||||
|
||||
static inline void io_remove_next_linked(struct io_kiocb *req)
|
||||
{
|
||||
struct io_kiocb *nxt = req->link;
|
||||
|
||||
req->link = nxt->link;
|
||||
nxt->link = NULL;
|
||||
}
|
||||
|
||||
bool io_disarm_next(struct io_kiocb *req)
|
||||
__must_hold(&req->ctx->completion_lock)
|
||||
{
|
||||
struct io_kiocb *link = NULL;
|
||||
bool posted = false;
|
||||
|
||||
if (req->flags & REQ_F_ARM_LTIMEOUT) {
|
||||
link = req->link;
|
||||
req->flags &= ~REQ_F_ARM_LTIMEOUT;
|
||||
if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
|
||||
io_remove_next_linked(req);
|
||||
io_req_tw_post_queue(link, -ECANCELED, 0);
|
||||
posted = true;
|
||||
}
|
||||
} else if (req->flags & REQ_F_LINK_TIMEOUT) {
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
spin_lock_irq(&ctx->timeout_lock);
|
||||
link = io_disarm_linked_timeout(req);
|
||||
spin_unlock_irq(&ctx->timeout_lock);
|
||||
if (link) {
|
||||
posted = true;
|
||||
io_req_tw_post_queue(link, -ECANCELED, 0);
|
||||
}
|
||||
}
|
||||
if (unlikely((req->flags & REQ_F_FAIL) &&
|
||||
!(req->flags & REQ_F_HARDLINK))) {
|
||||
posted |= (req->link != NULL);
|
||||
io_fail_links(req);
|
||||
}
|
||||
return posted;
|
||||
}
|
||||
|
||||
struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
|
||||
struct io_kiocb *link)
|
||||
__must_hold(&req->ctx->completion_lock)
|
||||
__must_hold(&req->ctx->timeout_lock)
|
||||
{
|
||||
struct io_timeout_data *io = link->async_data;
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(link);
|
||||
|
||||
io_remove_next_linked(req);
|
||||
timeout->head = NULL;
|
||||
if (hrtimer_try_to_cancel(&io->timer) != -1) {
|
||||
list_del(&timeout->list);
|
||||
return link;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
|
||||
{
|
||||
struct io_timeout_data *data = container_of(timer,
|
||||
struct io_timeout_data, timer);
|
||||
struct io_kiocb *req = data->req;
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&ctx->timeout_lock, flags);
|
||||
list_del_init(&timeout->list);
|
||||
atomic_set(&req->ctx->cq_timeouts,
|
||||
atomic_read(&req->ctx->cq_timeouts) + 1);
|
||||
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
|
||||
|
||||
if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
|
||||
req_set_fail(req);
|
||||
|
||||
io_req_set_res(req, -ETIME, 0);
|
||||
req->io_task_work.func = io_req_task_complete;
|
||||
io_req_task_work_add(req);
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
|
||||
struct io_cancel_data *cd)
|
||||
__must_hold(&ctx->timeout_lock)
|
||||
{
|
||||
struct io_timeout *timeout;
|
||||
struct io_timeout_data *io;
|
||||
struct io_kiocb *req = NULL;
|
||||
|
||||
list_for_each_entry(timeout, &ctx->timeout_list, list) {
|
||||
struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
|
||||
|
||||
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
|
||||
cd->data != tmp->cqe.user_data)
|
||||
continue;
|
||||
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
|
||||
if (cd->seq == tmp->work.cancel_seq)
|
||||
continue;
|
||||
tmp->work.cancel_seq = cd->seq;
|
||||
}
|
||||
req = tmp;
|
||||
break;
|
||||
}
|
||||
if (!req)
|
||||
return ERR_PTR(-ENOENT);
|
||||
|
||||
io = req->async_data;
|
||||
if (hrtimer_try_to_cancel(&io->timer) == -1)
|
||||
return ERR_PTR(-EALREADY);
|
||||
timeout = io_kiocb_to_cmd(req);
|
||||
list_del_init(&timeout->list);
|
||||
return req;
|
||||
}
|
||||
|
||||
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
|
||||
__must_hold(&ctx->completion_lock)
|
||||
{
|
||||
struct io_kiocb *req;
|
||||
|
||||
spin_lock_irq(&ctx->timeout_lock);
|
||||
req = io_timeout_extract(ctx, cd);
|
||||
spin_unlock_irq(&ctx->timeout_lock);
|
||||
|
||||
if (IS_ERR(req))
|
||||
return PTR_ERR(req);
|
||||
io_req_task_queue_fail(req, -ECANCELED);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
|
||||
{
|
||||
unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(req);
|
||||
struct io_kiocb *prev = timeout->prev;
|
||||
int ret = -ENOENT;
|
||||
|
||||
if (prev) {
|
||||
if (!(req->task->flags & PF_EXITING)) {
|
||||
struct io_cancel_data cd = {
|
||||
.ctx = req->ctx,
|
||||
.data = prev->cqe.user_data,
|
||||
};
|
||||
|
||||
ret = io_try_cancel(req->task->io_uring, &cd, issue_flags);
|
||||
}
|
||||
io_req_set_res(req, ret ?: -ETIME, 0);
|
||||
io_req_complete_post(req);
|
||||
io_put_req(prev);
|
||||
} else {
|
||||
io_req_set_res(req, -ETIME, 0);
|
||||
io_req_complete_post(req);
|
||||
}
|
||||
}
|
||||
|
||||
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
|
||||
{
|
||||
struct io_timeout_data *data = container_of(timer,
|
||||
struct io_timeout_data, timer);
|
||||
struct io_kiocb *prev, *req = data->req;
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&ctx->timeout_lock, flags);
|
||||
prev = timeout->head;
|
||||
timeout->head = NULL;
|
||||
|
||||
/*
|
||||
* We don't expect the list to be empty, that will only happen if we
|
||||
* race with the completion of the linked work.
|
||||
*/
|
||||
if (prev) {
|
||||
io_remove_next_linked(prev);
|
||||
if (!req_ref_inc_not_zero(prev))
|
||||
prev = NULL;
|
||||
}
|
||||
list_del(&timeout->list);
|
||||
timeout->prev = prev;
|
||||
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
|
||||
|
||||
req->io_task_work.func = io_req_task_link_timeout;
|
||||
io_req_task_work_add(req);
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
|
||||
{
|
||||
switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
|
||||
case IORING_TIMEOUT_BOOTTIME:
|
||||
return CLOCK_BOOTTIME;
|
||||
case IORING_TIMEOUT_REALTIME:
|
||||
return CLOCK_REALTIME;
|
||||
default:
|
||||
/* can't happen, vetted at prep time */
|
||||
WARN_ON_ONCE(1);
|
||||
fallthrough;
|
||||
case 0:
|
||||
return CLOCK_MONOTONIC;
|
||||
}
|
||||
}
|
||||
|
||||
static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
|
||||
struct timespec64 *ts, enum hrtimer_mode mode)
|
||||
__must_hold(&ctx->timeout_lock)
|
||||
{
|
||||
struct io_timeout_data *io;
|
||||
struct io_timeout *timeout;
|
||||
struct io_kiocb *req = NULL;
|
||||
|
||||
list_for_each_entry(timeout, &ctx->ltimeout_list, list) {
|
||||
struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
|
||||
|
||||
if (user_data == tmp->cqe.user_data) {
|
||||
req = tmp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!req)
|
||||
return -ENOENT;
|
||||
|
||||
io = req->async_data;
|
||||
if (hrtimer_try_to_cancel(&io->timer) == -1)
|
||||
return -EALREADY;
|
||||
hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
|
||||
io->timer.function = io_link_timeout_fn;
|
||||
hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
|
||||
struct timespec64 *ts, enum hrtimer_mode mode)
|
||||
__must_hold(&ctx->timeout_lock)
|
||||
{
|
||||
struct io_cancel_data cd = { .data = user_data, };
|
||||
struct io_kiocb *req = io_timeout_extract(ctx, &cd);
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(req);
|
||||
struct io_timeout_data *data;
|
||||
|
||||
if (IS_ERR(req))
|
||||
return PTR_ERR(req);
|
||||
|
||||
timeout->off = 0; /* noseq */
|
||||
data = req->async_data;
|
||||
list_add_tail(&timeout->list, &ctx->timeout_list);
|
||||
hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
|
||||
data->timer.function = io_timeout_fn;
|
||||
hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
|
||||
|
||||
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
|
||||
return -EINVAL;
|
||||
if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
|
||||
tr->ltimeout = false;
|
||||
tr->addr = READ_ONCE(sqe->addr);
|
||||
tr->flags = READ_ONCE(sqe->timeout_flags);
|
||||
if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
|
||||
if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
|
||||
return -EINVAL;
|
||||
if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
|
||||
tr->ltimeout = true;
|
||||
if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
|
||||
return -EINVAL;
|
||||
if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
|
||||
return -EFAULT;
|
||||
if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
|
||||
return -EINVAL;
|
||||
} else if (tr->flags) {
|
||||
/* timeout removal doesn't support flags */
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
|
||||
{
|
||||
return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
|
||||
: HRTIMER_MODE_REL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove or update an existing timeout command
|
||||
*/
|
||||
int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
int ret;
|
||||
|
||||
if (!(tr->flags & IORING_TIMEOUT_UPDATE)) {
|
||||
struct io_cancel_data cd = { .data = tr->addr, };
|
||||
|
||||
spin_lock(&ctx->completion_lock);
|
||||
ret = io_timeout_cancel(ctx, &cd);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
} else {
|
||||
enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
|
||||
|
||||
spin_lock_irq(&ctx->timeout_lock);
|
||||
if (tr->ltimeout)
|
||||
ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
|
||||
else
|
||||
ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
|
||||
spin_unlock_irq(&ctx->timeout_lock);
|
||||
}
|
||||
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
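For reference, the userspace encoding of a plain removal mirrors the fields io_timeout_remove_prep() reads. The sketch below is illustrative only (raw sqe fields, no liburing helper):

#include <linux/io_uring.h>
#include <string.h>

/* Illustrative: cancel a previously armed timeout, identified by its user_data. */
static void prep_timeout_remove(struct io_uring_sqe *sqe, __u64 target_user_data)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_TIMEOUT_REMOVE;
	sqe->addr = target_user_data;	/* user_data of the timeout to cancel */
	sqe->timeout_flags = 0;		/* 0 = remove; IORING_TIMEOUT_UPDATE would rearm instead */
}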
|
||||
|
||||
static int __io_timeout_prep(struct io_kiocb *req,
|
||||
const struct io_uring_sqe *sqe,
|
||||
bool is_timeout_link)
|
||||
{
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(req);
|
||||
struct io_timeout_data *data;
|
||||
unsigned flags;
|
||||
u32 off = READ_ONCE(sqe->off);
|
||||
|
||||
if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
if (off && is_timeout_link)
|
||||
return -EINVAL;
|
||||
flags = READ_ONCE(sqe->timeout_flags);
|
||||
if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
|
||||
IORING_TIMEOUT_ETIME_SUCCESS))
|
||||
return -EINVAL;
|
||||
/* more than one clock specified is invalid, obviously */
|
||||
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
|
||||
return -EINVAL;
|
||||
|
||||
INIT_LIST_HEAD(&timeout->list);
|
||||
timeout->off = off;
|
||||
if (unlikely(off && !req->ctx->off_timeout_used))
|
||||
req->ctx->off_timeout_used = true;
|
||||
|
||||
if (WARN_ON_ONCE(req_has_async_data(req)))
|
||||
return -EFAULT;
|
||||
if (io_alloc_async_data(req))
|
||||
return -ENOMEM;
|
||||
|
||||
data = req->async_data;
|
||||
data->req = req;
|
||||
data->flags = flags;
|
||||
|
||||
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
|
||||
return -EFAULT;
|
||||
|
||||
if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
|
||||
return -EINVAL;
|
||||
|
||||
INIT_LIST_HEAD(&timeout->list);
|
||||
data->mode = io_translate_timeout_mode(flags);
|
||||
hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
|
||||
|
||||
if (is_timeout_link) {
|
||||
struct io_submit_link *link = &req->ctx->submit_state.link;
|
||||
|
||||
if (!link->head)
|
||||
return -EINVAL;
|
||||
if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
|
||||
return -EINVAL;
|
||||
timeout->head = link->last;
|
||||
link->last->flags |= REQ_F_ARM_LTIMEOUT;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
return __io_timeout_prep(req, sqe, false);
|
||||
}
|
||||
|
||||
int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
return __io_timeout_prep(req, sqe, true);
|
||||
}
|
||||
|
||||
int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_timeout_data *data = req->async_data;
|
||||
struct list_head *entry;
|
||||
u32 tail, off = timeout->off;
|
||||
|
||||
spin_lock_irq(&ctx->timeout_lock);
|
||||
|
||||
/*
|
||||
* sqe->off holds how many events that need to occur for this
|
||||
* timeout event to be satisfied. If it isn't set, then this is
|
||||
* a pure timeout request, sequence isn't used.
|
||||
*/
|
||||
if (io_is_timeout_noseq(req)) {
|
||||
entry = ctx->timeout_list.prev;
|
||||
goto add;
|
||||
}
|
||||
|
||||
tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
|
||||
timeout->target_seq = tail + off;
|
||||
|
||||
/* Update the last seq here in case io_flush_timeouts() hasn't.
|
||||
* This is safe because ->completion_lock is held, and submissions
|
||||
* and completions are never mixed in the same ->completion_lock section.
|
||||
*/
|
||||
ctx->cq_last_tm_flush = tail;
|
||||
|
||||
/*
|
||||
* Insertion sort, ensuring the first entry in the list is always
|
||||
* the one we need first.
|
||||
*/
|
||||
list_for_each_prev(entry, &ctx->timeout_list) {
|
||||
struct io_timeout *nextt = list_entry(entry, struct io_timeout, list);
|
||||
struct io_kiocb *nxt = cmd_to_io_kiocb(nextt);
|
||||
|
||||
if (io_is_timeout_noseq(nxt))
|
||||
continue;
|
||||
/* nxt.seq is behind @tail, otherwise would've been completed */
|
||||
if (off >= nextt->target_seq - tail)
|
||||
break;
|
||||
}
|
||||
add:
|
||||
list_add(&timeout->list, entry);
|
||||
data->timer.function = io_timeout_fn;
|
||||
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
|
||||
spin_unlock_irq(&ctx->timeout_lock);
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
}
|
||||
|
||||
void io_queue_linked_timeout(struct io_kiocb *req)
|
||||
{
|
||||
struct io_timeout *timeout = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
spin_lock_irq(&ctx->timeout_lock);
|
||||
/*
|
||||
* If the back reference is NULL, then our linked request finished
|
||||
* before we got a chance to setup the timer
|
||||
*/
|
||||
if (timeout->head) {
|
||||
struct io_timeout_data *data = req->async_data;
|
||||
|
||||
data->timer.function = io_link_timeout_fn;
|
||||
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
|
||||
data->mode);
|
||||
list_add_tail(&timeout->list, &ctx->ltimeout_list);
|
||||
}
|
||||
spin_unlock_irq(&ctx->timeout_lock);
|
||||
/* drop submission reference */
|
||||
io_put_req(req);
|
||||
}
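A linked timeout only arms when it directly follows the request it guards, which __io_timeout_prep() enforces by requiring an active submission link. From userspace that looks roughly like the sketch below (illustrative only; the two sqes are assumed to be consecutive SQ ring entries and ts must stay valid until completion):

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <string.h>

/* Illustrative: guard prev_sqe with a relative timeout in the next sqe. */
static void prep_linked_timeout(struct io_uring_sqe *prev_sqe,
				struct io_uring_sqe *lt_sqe,
				struct __kernel_timespec *ts)
{
	prev_sqe->flags |= IOSQE_IO_LINK;	/* chain the timeout to the previous request */

	memset(lt_sqe, 0, sizeof(*lt_sqe));
	lt_sqe->opcode = IORING_OP_LINK_TIMEOUT;
	lt_sqe->addr = (unsigned long)ts;	/* read via get_timespec64() at prep time */
	lt_sqe->len = 1;			/* prep requires len == 1 */
	lt_sqe->timeout_flags = 0;		/* relative, CLOCK_MONOTONIC */
}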
|
||||
|
||||
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
|
||||
bool cancel_all)
|
||||
__must_hold(&req->ctx->timeout_lock)
|
||||
{
|
||||
struct io_kiocb *req;
|
||||
|
||||
if (task && head->task != task)
|
||||
return false;
|
||||
if (cancel_all)
|
||||
return true;
|
||||
|
||||
io_for_each_link(req, head) {
|
||||
if (req->flags & REQ_F_INFLIGHT)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Returns true if we found and killed one or more timeouts */
|
||||
__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
|
||||
bool cancel_all)
|
||||
{
|
||||
struct io_timeout *timeout, *tmp;
|
||||
int canceled = 0;
|
||||
|
||||
io_cq_lock(ctx);
|
||||
spin_lock_irq(&ctx->timeout_lock);
|
||||
list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) {
|
||||
struct io_kiocb *req = cmd_to_io_kiocb(timeout);
|
||||
|
||||
if (io_match_task(req, tsk, cancel_all) &&
|
||||
io_kill_timeout(req, -ECANCELED))
|
||||
canceled++;
|
||||
}
|
||||
spin_unlock_irq(&ctx->timeout_lock);
|
||||
io_cq_unlock_post(ctx);
|
||||
return canceled != 0;
|
||||
}
|
io_uring/timeout.h (new file, 36 lines)
@@ -0,0 +1,36 @@
// SPDX-License-Identifier: GPL-2.0

struct io_timeout_data {
	struct io_kiocb			*req;
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
	u32				flags;
};

struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
					    struct io_kiocb *link);

static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
{
	struct io_kiocb *link = req->link;

	if (link && link->opcode == IORING_OP_LINK_TIMEOUT)
		return __io_disarm_linked_timeout(req, link);

	return NULL;
}

__cold void io_flush_timeouts(struct io_ring_ctx *ctx);
struct io_cancel_data;
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd);
__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
			     bool cancel_all);
void io_queue_linked_timeout(struct io_kiocb *req);
bool io_disarm_next(struct io_kiocb *req);

int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_timeout(struct io_kiocb *req, unsigned int issue_flags);
int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags);
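These prototypes back IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT. A minimal userspace sketch for arming a pure (no completion-count) timeout, matching the sqe fields __io_timeout_prep() reads, could look like the following; it is illustrative only and ts must remain valid until the CQE arrives:

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <string.h>

/* Illustrative: arm a relative, monotonic-clock timeout. */
static void prep_timeout(struct io_uring_sqe *sqe, struct __kernel_timespec *ts)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_TIMEOUT;
	sqe->addr = (unsigned long)ts;	/* copied with get_timespec64() at prep time */
	sqe->len = 1;			/* prep requires exactly one timespec */
	sqe->off = 0;			/* 0 = pure timeout, no CQE count to wait for */
	sqe->timeout_flags = 0;		/* relative, CLOCK_MONOTONIC, completes with -ETIME */
}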
io_uring/uring_cmd.c (new file, 114 lines)
@@ -0,0 +1,114 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "uring_cmd.h"

static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
{
	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);

	ioucmd->task_work_cb(ioucmd);
}

void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
			void (*task_work_cb)(struct io_uring_cmd *))
{
	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);

	ioucmd->task_work_cb = task_work_cb;
	req->io_task_work.func = io_uring_cmd_work;
	io_req_task_work_add(req);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);

static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
					  u64 extra1, u64 extra2)
{
	req->extra1 = extra1;
	req->extra2 = extra2;
	req->flags |= REQ_F_CQE32_INIT;
}

/*
 * Called by consumers of io_uring_cmd, if they originally returned
 * -EIOCBQUEUED upon receiving the command.
 */
void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
{
	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);

	if (ret < 0)
		req_set_fail(req);

	io_req_set_res(req, 0, ret);
	if (req->ctx->flags & IORING_SETUP_CQE32)
		io_req_set_cqe32_extra(req, res2, 0);
	__io_req_complete(req, 0);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);

int io_uring_cmd_prep_async(struct io_kiocb *req)
{
	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
	size_t cmd_size;

	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);

	memcpy(req->async_data, ioucmd->cmd, cmd_size);
	return 0;
}

int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);

	if (sqe->rw_flags || sqe->__pad1)
		return -EINVAL;
	ioucmd->cmd = sqe->cmd;
	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
	return 0;
}

int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = req->file;
	int ret;

	if (!req->file->f_op->uring_cmd)
		return -EOPNOTSUPP;

	if (ctx->flags & IORING_SETUP_SQE128)
		issue_flags |= IO_URING_F_SQE128;
	if (ctx->flags & IORING_SETUP_CQE32)
		issue_flags |= IO_URING_F_CQE32;
	if (ctx->flags & IORING_SETUP_IOPOLL)
		issue_flags |= IO_URING_F_IOPOLL;

	if (req_has_async_data(req))
		ioucmd->cmd = req->async_data;

	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
	if (ret == -EAGAIN) {
		if (!req_has_async_data(req)) {
			if (io_alloc_async_data(req))
				return -ENOMEM;
			io_uring_cmd_prep_async(req);
		}
		return -EAGAIN;
	}

	if (ret != -EIOCBQUEUED) {
		io_uring_cmd_done(ioucmd, ret, 0);
		return IOU_OK;
	}

	return IOU_ISSUE_SKIP_COMPLETE;
}
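On the provider side, a file_operations ->uring_cmd() implementation pairs with the dispatch above: returning a value completes the request inline with that result, while returning -EIOCBQUEUED defers completion to a later io_uring_cmd_done() call (possibly via io_uring_cmd_complete_in_task()). The handler below is a hypothetical driver sketch; the sub-command values and the deferred-work plumbing are made up for illustration:

/* Hypothetical driver-side handler, not part of this series. */
static int demo_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	switch (ioucmd->cmd_op) {
	case 0x01:		/* made-up "ping" sub-command, completes inline */
		return 0;
	case 0x02:		/* made-up async sub-command */
		/*
		 * Queue the real work somewhere; whatever finishes it later
		 * calls io_uring_cmd_done(ioucmd, res, 0).
		 */
		return -EIOCBQUEUED;
	default:
		return -ENOTTY;
	}
}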
io_uring/uring_cmd.h (new file, 13 lines)
@@ -0,0 +1,13 @@
// SPDX-License-Identifier: GPL-2.0

int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_uring_cmd_prep_async(struct io_kiocb *req);

/*
 * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
 * the following sqe if SQE128 is used.
 */
#define uring_cmd_pdu_size(is_sqe128)				\
	((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) -	\
		offsetof(struct io_uring_sqe, cmd))
io_uring/xattr.c (new file, 258 lines)
@@ -0,0 +1,258 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/io_uring.h>
|
||||
#include <linux/xattr.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "../fs/internal.h"
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "xattr.h"
|
||||
|
||||
struct io_xattr {
|
||||
struct file *file;
|
||||
struct xattr_ctx ctx;
|
||||
struct filename *filename;
|
||||
};
|
||||
|
||||
void io_xattr_cleanup(struct io_kiocb *req)
|
||||
{
|
||||
struct io_xattr *ix = io_kiocb_to_cmd(req);
|
||||
|
||||
if (ix->filename)
|
||||
putname(ix->filename);
|
||||
|
||||
kfree(ix->ctx.kname);
|
||||
kvfree(ix->ctx.kvalue);
|
||||
}
|
||||
|
||||
static void io_xattr_finish(struct io_kiocb *req, int ret)
|
||||
{
|
||||
req->flags &= ~REQ_F_NEED_CLEANUP;
|
||||
|
||||
io_xattr_cleanup(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
}
|
||||
|
||||
static int __io_getxattr_prep(struct io_kiocb *req,
|
||||
const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_xattr *ix = io_kiocb_to_cmd(req);
|
||||
const char __user *name;
|
||||
int ret;
|
||||
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
ix->filename = NULL;
|
||||
ix->ctx.kvalue = NULL;
|
||||
name = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
|
||||
ix->ctx.size = READ_ONCE(sqe->len);
|
||||
ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
|
||||
|
||||
if (ix->ctx.flags)
|
||||
return -EINVAL;
|
||||
|
||||
ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
|
||||
if (!ix->ctx.kname)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = strncpy_from_user(ix->ctx.kname->name, name,
|
||||
sizeof(ix->ctx.kname->name));
|
||||
if (!ret || ret == sizeof(ix->ctx.kname->name))
|
||||
ret = -ERANGE;
|
||||
if (ret < 0) {
|
||||
kfree(ix->ctx.kname);
|
||||
return ret;
|
||||
}
|
||||
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
return __io_getxattr_prep(req, sqe);
|
||||
}
|
||||
|
||||
int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_xattr *ix = io_kiocb_to_cmd(req);
|
||||
const char __user *path;
|
||||
int ret;
|
||||
|
||||
ret = __io_getxattr_prep(req, sqe);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
|
||||
|
||||
ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
|
||||
if (IS_ERR(ix->filename)) {
|
||||
ret = PTR_ERR(ix->filename);
|
||||
ix->filename = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_xattr *ix = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
|
||||
req->file->f_path.dentry,
|
||||
&ix->ctx);
|
||||
|
||||
io_xattr_finish(req, ret);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_xattr *ix = io_kiocb_to_cmd(req);
|
||||
unsigned int lookup_flags = LOOKUP_FOLLOW;
|
||||
struct path path;
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
retry:
|
||||
ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
|
||||
if (!ret) {
|
||||
ret = do_getxattr(mnt_user_ns(path.mnt),
|
||||
path.dentry,
|
||||
&ix->ctx);
|
||||
|
||||
path_put(&path);
|
||||
if (retry_estale(ret, lookup_flags)) {
|
||||
lookup_flags |= LOOKUP_REVAL;
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
io_xattr_finish(req, ret);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
static int __io_setxattr_prep(struct io_kiocb *req,
|
||||
const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_xattr *ix = io_kiocb_to_cmd(req);
|
||||
const char __user *name;
|
||||
int ret;
|
||||
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
ix->filename = NULL;
|
||||
name = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
|
||||
ix->ctx.kvalue = NULL;
|
||||
ix->ctx.size = READ_ONCE(sqe->len);
|
||||
ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
|
||||
|
||||
ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
|
||||
if (!ix->ctx.kname)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = setxattr_copy(name, &ix->ctx);
|
||||
if (ret) {
|
||||
kfree(ix->ctx.kname);
|
||||
return ret;
|
||||
}
|
||||
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_xattr *ix = io_kiocb_to_cmd(req);
|
||||
const char __user *path;
|
||||
int ret;
|
||||
|
||||
ret = __io_setxattr_prep(req, sqe);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
|
||||
|
||||
ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
|
||||
if (IS_ERR(ix->filename)) {
|
||||
ret = PTR_ERR(ix->filename);
|
||||
ix->filename = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
return __io_setxattr_prep(req, sqe);
|
||||
}
|
||||
|
||||
static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
|
||||
struct path *path)
|
||||
{
|
||||
struct io_xattr *ix = io_kiocb_to_cmd(req);
|
||||
int ret;
|
||||
|
||||
ret = mnt_want_write(path->mnt);
|
||||
if (!ret) {
|
||||
ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
|
||||
mnt_drop_write(path->mnt);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
ret = __io_setxattr(req, issue_flags, &req->file->f_path);
|
||||
io_xattr_finish(req, ret);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_xattr *ix = io_kiocb_to_cmd(req);
|
||||
unsigned int lookup_flags = LOOKUP_FOLLOW;
|
||||
struct path path;
|
||||
int ret;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
return -EAGAIN;
|
||||
|
||||
retry:
|
||||
ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
|
||||
if (!ret) {
|
||||
ret = __io_setxattr(req, issue_flags, &path);
|
||||
path_put(&path);
|
||||
if (retry_estale(ret, lookup_flags)) {
|
||||
lookup_flags |= LOOKUP_REVAL;
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
io_xattr_finish(req, ret);
|
||||
return IOU_OK;
|
||||
}
|
io_uring/xattr.h (new file, 15 lines)
@@ -0,0 +1,15 @@
// SPDX-License-Identifier: GPL-2.0

void io_xattr_cleanup(struct io_kiocb *req);

int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags);

int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_setxattr(struct io_kiocb *req, unsigned int issue_flags);

int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags);

int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_getxattr(struct io_kiocb *req, unsigned int issue_flags);
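For completeness, the userspace encoding expected by io_getxattr_prep() in xattr.c above can be sketched as follows; illustrative only, using raw sqe fields rather than a liburing helper, with all pointers caller-owned:

#include <linux/io_uring.h>
#include <string.h>

/* Illustrative: read an extended attribute of 'path' into 'value'. */
static void prep_getxattr(struct io_uring_sqe *sqe, const char *path,
			  const char *name, char *value, unsigned int size)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_GETXATTR;
	sqe->addr = (unsigned long)name;	/* attribute name */
	sqe->addr2 = (unsigned long)value;	/* destination buffer */
	sqe->addr3 = (unsigned long)path;	/* path, resolved with LOOKUP_FOLLOW */
	sqe->len = size;			/* destination buffer size */
	sqe->xattr_flags = 0;			/* must be 0 for getxattr */
}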
kernel/sched/core.c
@@ -91,7 +91,7 @@
 #include "stats.h"

 #include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
+#include "../../io_uring/io-wq.h"
 #include "../smpboot.h"

 /*
net/compat.c (39 changed lines)
@@ -34,20 +34,15 @@
|
||||
#include <net/compat.h>
|
||||
|
||||
int __get_compat_msghdr(struct msghdr *kmsg,
|
||||
struct compat_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
compat_uptr_t *ptr, compat_size_t *len)
|
||||
struct compat_msghdr *msg,
|
||||
struct sockaddr __user **save_addr)
|
||||
{
|
||||
struct compat_msghdr msg;
|
||||
ssize_t err;
|
||||
|
||||
if (copy_from_user(&msg, umsg, sizeof(*umsg)))
|
||||
return -EFAULT;
|
||||
kmsg->msg_flags = msg->msg_flags;
|
||||
kmsg->msg_namelen = msg->msg_namelen;
|
||||
|
||||
kmsg->msg_flags = msg.msg_flags;
|
||||
kmsg->msg_namelen = msg.msg_namelen;
|
||||
|
||||
if (!msg.msg_name)
|
||||
if (!msg->msg_name)
|
||||
kmsg->msg_namelen = 0;
|
||||
|
||||
if (kmsg->msg_namelen < 0)
|
||||
@@ -57,15 +52,15 @@ int __get_compat_msghdr(struct msghdr *kmsg,
|
||||
kmsg->msg_namelen = sizeof(struct sockaddr_storage);
|
||||
|
||||
kmsg->msg_control_is_user = true;
|
||||
kmsg->msg_control_user = compat_ptr(msg.msg_control);
|
||||
kmsg->msg_controllen = msg.msg_controllen;
|
||||
kmsg->msg_control_user = compat_ptr(msg->msg_control);
|
||||
kmsg->msg_controllen = msg->msg_controllen;
|
||||
|
||||
if (save_addr)
|
||||
*save_addr = compat_ptr(msg.msg_name);
|
||||
*save_addr = compat_ptr(msg->msg_name);
|
||||
|
||||
if (msg.msg_name && kmsg->msg_namelen) {
|
||||
if (msg->msg_name && kmsg->msg_namelen) {
|
||||
if (!save_addr) {
|
||||
err = move_addr_to_kernel(compat_ptr(msg.msg_name),
|
||||
err = move_addr_to_kernel(compat_ptr(msg->msg_name),
|
||||
kmsg->msg_namelen,
|
||||
kmsg->msg_name);
|
||||
if (err < 0)
|
||||
@@ -76,12 +71,10 @@ int __get_compat_msghdr(struct msghdr *kmsg,
|
||||
kmsg->msg_namelen = 0;
|
||||
}
|
||||
|
||||
if (msg.msg_iovlen > UIO_MAXIOV)
|
||||
if (msg->msg_iovlen > UIO_MAXIOV)
|
||||
return -EMSGSIZE;
|
||||
|
||||
kmsg->msg_iocb = NULL;
|
||||
*ptr = msg.msg_iov;
|
||||
*len = msg.msg_iovlen;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,15 +83,17 @@ int get_compat_msghdr(struct msghdr *kmsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
struct iovec **iov)
|
||||
{
|
||||
compat_uptr_t ptr;
|
||||
compat_size_t len;
|
||||
struct compat_msghdr msg;
|
||||
ssize_t err;
|
||||
|
||||
err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
|
||||
if (copy_from_user(&msg, umsg, sizeof(*umsg)))
|
||||
return -EFAULT;
|
||||
|
||||
err = __get_compat_msghdr(kmsg, &msg, save_addr);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr), len,
|
||||
err = import_iovec(save_addr ? READ : WRITE, compat_ptr(msg.msg_iov), msg.msg_iovlen,
|
||||
UIO_FASTIOV, iov, &kmsg->msg_iter);
|
||||
return err < 0 ? err : 0;
|
||||
}
|
||||
|
net/socket.c (37 changed lines)
@@ -2358,25 +2358,20 @@ struct used_address {
|
||||
unsigned int name_len;
|
||||
};
|
||||
|
||||
int __copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
struct user_msghdr __user *umsg,
|
||||
struct sockaddr __user **save_addr,
|
||||
struct iovec __user **uiov, size_t *nsegs)
|
||||
int __copy_msghdr(struct msghdr *kmsg,
|
||||
struct user_msghdr *msg,
|
||||
struct sockaddr __user **save_addr)
|
||||
{
|
||||
struct user_msghdr msg;
|
||||
ssize_t err;
|
||||
|
||||
if (copy_from_user(&msg, umsg, sizeof(*umsg)))
|
||||
return -EFAULT;
|
||||
|
||||
kmsg->msg_control_is_user = true;
|
||||
kmsg->msg_get_inq = 0;
|
||||
kmsg->msg_control_user = msg.msg_control;
|
||||
kmsg->msg_controllen = msg.msg_controllen;
|
||||
kmsg->msg_flags = msg.msg_flags;
|
||||
kmsg->msg_control_user = msg->msg_control;
|
||||
kmsg->msg_controllen = msg->msg_controllen;
|
||||
kmsg->msg_flags = msg->msg_flags;
|
||||
|
||||
kmsg->msg_namelen = msg.msg_namelen;
|
||||
if (!msg.msg_name)
|
||||
kmsg->msg_namelen = msg->msg_namelen;
|
||||
if (!msg->msg_name)
|
||||
kmsg->msg_namelen = 0;
|
||||
|
||||
if (kmsg->msg_namelen < 0)
|
||||
@@ -2386,11 +2381,11 @@ int __copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
kmsg->msg_namelen = sizeof(struct sockaddr_storage);
|
||||
|
||||
if (save_addr)
|
||||
*save_addr = msg.msg_name;
|
||||
*save_addr = msg->msg_name;
|
||||
|
||||
if (msg.msg_name && kmsg->msg_namelen) {
|
||||
if (msg->msg_name && kmsg->msg_namelen) {
|
||||
if (!save_addr) {
|
||||
err = move_addr_to_kernel(msg.msg_name,
|
||||
err = move_addr_to_kernel(msg->msg_name,
|
||||
kmsg->msg_namelen,
|
||||
kmsg->msg_name);
|
||||
if (err < 0)
|
||||
@@ -2401,12 +2396,10 @@ int __copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
kmsg->msg_namelen = 0;
|
||||
}
|
||||
|
||||
if (msg.msg_iovlen > UIO_MAXIOV)
|
||||
if (msg->msg_iovlen > UIO_MAXIOV)
|
||||
return -EMSGSIZE;
|
||||
|
||||
kmsg->msg_iocb = NULL;
|
||||
*uiov = msg.msg_iov;
|
||||
*nsegs = msg.msg_iovlen;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -2418,8 +2411,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
|
||||
struct user_msghdr msg;
|
||||
ssize_t err;
|
||||
|
||||
err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
|
||||
&msg.msg_iovlen);
|
||||
if (copy_from_user(&msg, umsg, sizeof(*umsg)))
|
||||
return -EFAULT;
|
||||
|
||||
err = __copy_msghdr(kmsg, &msg, save_addr);
|
||||
if (err)
|
||||
return err;