for-5.18/io_uring-2022-03-18

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmI09cgQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpjI1D/4uo1FnG+mF9ZEPAy/FKBh6I74gHj7eBJyw
 obKyH2oszN19VI+HLYYQgBrPwlc4lOzxmYM0kbAQWfB+PCgxAExX+OkJrGrxWW3R
 Hlc1RSQkravh+CBzEc1VIYIWK3XF+guZIwqLPa3gS0hwea8LYBae9h3gKjhvb2kH
 4BbSh7Ax5k9klVUcNJuUXnG7nytyaiaQAbMl88iGV3S2bpseWNi0WTHvJ3Eizbyz
 sLponJcgUb0/H2G00MXf3M3+V40s7AiLXeEMwFcv29PZUQIEyp5P8Rx43//48nbE
 0Rms88Uz4tFHRyFe/48IKa4tjJgk6fy1Cjd3+Thyn8TeJaYhD6vZu4LA/uGo8fkm
 EZEmLFHw5mqn7Xz3OA31ErnnpSmwT/T164vMPcvrlLao/1ZSHD1fK2PjyDMTX+aO
 1wec5+mMsTbOFgzYJRH3tyEH/GfWxpsNJIPfdCjV5BRPd5HwFBBTvRwFEVTX6+Go
 CS4JAGWkTgmr2u1OeyzaQ1lYPZWD1GTt/HJj7rEvuqPzlreX9nRlvTtIcVubw39a
 dgx7nWLLCOCN3o4tFHEvmf05/92TiQd1Vw8wEQT0NdmkLuqUuJJOnVo53YWzHA5Y
 qqMwPcciiYMi92btrJdsZOgj3GCKxP6rLxyHJm1mY5kFSjEB5e0swV7xryme40r4
 ZJ6sUvEyHg==
 =OBma
 -----END PGP SIGNATURE-----

Merge tag 'for-5.18/io_uring-2022-03-18' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - Fixes for current file position. Still doesn't have the f_pos_lock
   sorted, but it's a step in the right direction (Dylan)

 - Tracing updates (Dylan, Stefan)

 - Improvements to io-wq locking (Hao)

 - Improvements for provided buffers (me, Pavel)

 - Support for registered file descriptors (me, Xiaoguang); a usage
   sketch follows this list

 - Support for ring messages (me)

 - Poll improvements (me)

 - Fix for fixed buffers and non-iterator reads/writes (me)

 - Support for NAPI on sockets (Olivier)

 - Ring quiesce improvements (Usama)

 - Misc fixes (Olivier, Pavel)
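
A minimal userspace sketch of the registered ring fd support summarized above, assuming a liburing new enough to provide io_uring_register_ring_fd(); apart from the IORING_REGISTER_RING_FDS / IORING_ENTER_REGISTERED_RING uapi additions visible in the diff further down, every name here is illustrative:

/*
 * Sketch only: register the ring fd so that later io_uring_enter()
 * calls can pass a registered index (IORING_ENTER_REGISTERED_RING)
 * instead of resolving the file descriptor on every syscall.
 */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0)
		return 1;

	/* liburing remembers the registered index and uses it for
	 * subsequent enter calls made on behalf of this ring */
	ret = io_uring_register_ring_fd(&ring);
	if (ret < 0)
		fprintf(stderr, "ring fd registration unavailable: %d\n", ret);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe) == 0)
		io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	return 0;
}

The point of registering the ring fd is to let the kernel skip the fdget()/fdput() pair that io_uring_enter() otherwise performs on every call.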

* tag 'for-5.18/io_uring-2022-03-18' of git://git.kernel.dk/linux-block: (42 commits)
  io_uring: terminate manual loop iterator loop correctly for non-vecs
  io_uring: don't check unrelated req->open.how in accept request
  io_uring: manage provided buffers strictly ordered
  io_uring: fold evfd signalling under a slower path
  io_uring: thin down io_commit_cqring()
  io_uring: shuffle io_eventfd_signal() bits around
  io_uring: remove extra barrier for non-sqpoll iopoll
  io_uring: fix provided buffer return on failure for kiocb_done()
  io_uring: extend provided buf return to fails
  io_uring: refactor timeout cancellation cqe posting
  io_uring: normilise naming for fill_cqe*
  io_uring: cache poll/double-poll state with a request flag
  io_uring: cache req->apoll->events in req->cflags
  io_uring: move req->poll_refs into previous struct hole
  io_uring: make tracing format consistent
  io_uring: recycle apoll_poll entries
  io_uring: remove duplicated member check for io_msg_ring_prep()
  io_uring: allow submissions to continue on error
  io_uring: recycle provided buffers if request goes async
  io_uring: ensure reads re-import for selected buffers
  ...
Linus Torvalds 2022-03-21 16:24:45 -07:00
commit af472a9efd
5 changed files with 1200 additions and 520 deletions

fs/io-wq.c

@ -76,6 +76,7 @@ struct io_wqe_acct {
unsigned max_workers;
int index;
atomic_t nr_running;
raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long flags;
};
@ -91,7 +92,7 @@ enum {
*/
struct io_wqe {
raw_spinlock_t lock;
struct io_wqe_acct acct[2];
struct io_wqe_acct acct[IO_WQ_ACCT_NR];
int node;
@ -224,12 +225,12 @@ static void io_worker_exit(struct io_worker *worker)
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
preempt_disable();
raw_spin_unlock(&wqe->lock);
io_wqe_dec_running(worker);
worker->flags = 0;
preempt_disable();
current->flags &= ~PF_IO_WORKER;
preempt_enable();
raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu);
io_worker_ref_put(wqe->wq);
@ -238,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker)
static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
bool ret = false;
raw_spin_lock(&acct->lock);
if (!wq_list_empty(&acct->work_list) &&
!test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
return true;
return false;
ret = true;
raw_spin_unlock(&acct->lock);
return ret;
}
/*
@ -385,7 +391,6 @@ fail:
}
static void io_wqe_dec_running(struct io_worker *worker)
__must_hold(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@ -393,13 +398,14 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (!(worker->flags & IO_WORKER_F_UP))
return;
if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
raw_spin_unlock(&wqe->lock);
io_queue_worker_create(worker, acct, create_worker_cb);
raw_spin_lock(&wqe->lock);
}
if (!atomic_dec_and_test(&acct->nr_running))
return;
if (!io_acct_run_queue(acct))
return;
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
io_queue_worker_create(worker, acct, create_worker_cb);
}
/*
@ -407,11 +413,12 @@ static void io_wqe_dec_running(struct io_worker *worker)
* it's currently on the freelist
*/
static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
__must_hold(wqe->lock)
{
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
raw_spin_lock(&wqe->lock);
hlist_nulls_del_init_rcu(&worker->nulls_node);
raw_spin_unlock(&wqe->lock);
}
}
@ -456,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
struct io_worker *worker)
__must_hold(wqe->lock)
__must_hold(acct->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
@ -498,9 +505,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
* work being added and clearing the stalled bit.
*/
set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
raw_spin_unlock(&wqe->lock);
raw_spin_unlock(&acct->lock);
unstalled = io_wait_on_hash(wqe, stall_hash);
raw_spin_lock(&wqe->lock);
raw_spin_lock(&acct->lock);
if (unstalled) {
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
if (wq_has_sleeper(&wqe->wq->hash->wait))
@ -538,7 +545,6 @@ static void io_assign_current_work(struct io_worker *worker,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@ -555,7 +561,9 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
raw_spin_lock(&acct->lock);
work = io_get_next_work(acct, worker);
raw_spin_unlock(&acct->lock);
if (work) {
__io_worker_busy(wqe, worker);
@ -569,10 +577,9 @@ static void io_worker_handle_work(struct io_worker *worker)
raw_spin_lock(&worker->lock);
worker->next_work = work;
raw_spin_unlock(&worker->lock);
}
raw_spin_unlock(&wqe->lock);
if (!work)
} else {
break;
}
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
@ -608,8 +615,6 @@ static void io_worker_handle_work(struct io_worker *worker)
wake_up(&wq->hash->wait);
}
} while (work);
raw_spin_lock(&wqe->lock);
} while (1);
}
@ -633,12 +638,10 @@ static int io_wqe_worker(void *data)
long ret;
set_current_state(TASK_INTERRUPTIBLE);
loop:
raw_spin_lock(&wqe->lock);
if (io_acct_run_queue(acct)) {
while (io_acct_run_queue(acct))
io_worker_handle_work(worker);
goto loop;
}
raw_spin_lock(&wqe->lock);
/* timed out, exit unless we're the last worker */
if (last_timeout && acct->nr_workers > 1) {
acct->nr_workers--;
@ -662,10 +665,8 @@ loop:
last_timeout = !ret;
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
raw_spin_lock(&wqe->lock);
if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
io_worker_handle_work(worker);
}
audit_free(current);
io_worker_exit(worker);
@ -705,10 +706,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
return;
worker->flags &= ~IO_WORKER_F_RUNNING;
raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker);
raw_spin_unlock(&worker->wqe->lock);
}
static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
@ -778,10 +776,12 @@ static void create_worker_cont(struct callback_head *cb)
.cancel_all = true,
};
raw_spin_unlock(&wqe->lock);
while (io_acct_cancel_pending_work(wqe, acct, &match))
raw_spin_lock(&wqe->lock);
;
} else {
raw_spin_unlock(&wqe->lock);
}
raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wqe->wq);
kfree(worker);
return;
@ -914,6 +914,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
struct io_cb_cancel_data match;
unsigned work_flags = work->flags;
bool do_create;
@ -927,10 +928,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
raw_spin_lock(&wqe->lock);
raw_spin_lock(&acct->lock);
io_wqe_insert_work(wqe, work);
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
raw_spin_unlock(&acct->lock);
raw_spin_lock(&wqe->lock);
rcu_read_lock();
do_create = !io_wqe_activate_free_worker(wqe, acct);
rcu_read_unlock();
@ -946,18 +949,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
raw_spin_lock(&wqe->lock);
/* fatal condition, failed to create the first worker */
if (!acct->nr_workers) {
struct io_cb_cancel_data match = {
.fn = io_wq_work_match_item,
.data = work,
.cancel_all = false,
};
if (io_acct_cancel_pending_work(wqe, acct, &match))
raw_spin_lock(&wqe->lock);
if (acct->nr_workers) {
raw_spin_unlock(&wqe->lock);
return;
}
raw_spin_unlock(&wqe->lock);
/* fatal condition, failed to create the first worker */
match.fn = io_wq_work_match_item,
match.data = work,
match.cancel_all = false,
io_acct_cancel_pending_work(wqe, acct, &match);
}
}
@ -1032,22 +1035,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match)
__releases(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
raw_spin_lock(&acct->lock);
wq_list_for_each(node, prev, &acct->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
raw_spin_unlock(&wqe->lock);
raw_spin_unlock(&acct->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
/* not safe to continue after unlock */
return true;
}
raw_spin_unlock(&acct->lock);
return false;
}
@ -1061,7 +1065,6 @@ retry:
struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
if (io_acct_cancel_pending_work(wqe, acct, match)) {
raw_spin_lock(&wqe->lock);
if (match->cancel_all)
goto retry;
break;
@ -1103,13 +1106,11 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
if (match.nr_pending && !match.cancel_all) {
raw_spin_unlock(&wqe->lock);
if (match.nr_pending && !match.cancel_all)
return IO_WQ_CANCEL_OK;
}
raw_spin_lock(&wqe->lock);
io_wqe_cancel_running_work(wqe, &match);
raw_spin_unlock(&wqe->lock);
if (match.nr_running && !match.cancel_all)
@ -1190,6 +1191,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
acct->index = i;
atomic_set(&acct->nr_running, 0);
INIT_WQ_LIST(&acct->work_list);
raw_spin_lock_init(&acct->lock);
}
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
@ -1282,9 +1284,7 @@ static void io_wq_destroy(struct io_wq *wq)
.fn = io_wq_work_match_all,
.cancel_all = true,
};
raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
raw_spin_unlock(&wqe->lock);
free_cpumask_var(wqe->cpu_mask);
kfree(wqe);
}
@ -1376,7 +1376,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
for (i = 0; i < 2; i++) {
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}

fs/io_uring.c (diff suppressed because it is too large)

include/linux/io_uring.h

@ -9,11 +9,14 @@
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
static inline void io_uring_files_cancel(void)
{
if (current->io_uring)
if (current->io_uring) {
io_uring_unreg_ringfd();
__io_uring_cancel(false);
}
}
static inline void io_uring_task_cancel(void)
{

include/trace/events/io_uring.h

@ -29,22 +29,22 @@ TRACE_EVENT(io_uring_create,
TP_ARGS(fd, ctx, sq_entries, cq_entries, flags),
TP_STRUCT__entry (
__field( int, fd )
__field( void *, ctx )
__field( int, fd )
__field( void *, ctx )
__field( u32, sq_entries )
__field( u32, cq_entries )
__field( u32, flags )
),
TP_fast_assign(
__entry->fd = fd;
__entry->fd = fd;
__entry->ctx = ctx;
__entry->sq_entries = sq_entries;
__entry->cq_entries = cq_entries;
__entry->flags = flags;
),
TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d",
TP_printk("ring %p, fd %d sq size %d, cq size %d, flags 0x%x",
__entry->ctx, __entry->fd, __entry->sq_entries,
__entry->cq_entries, __entry->flags)
);
@ -57,10 +57,9 @@ TRACE_EVENT(io_uring_create,
* @opcode: describes which operation to perform
* @nr_user_files: number of registered files
* @nr_user_bufs: number of registered buffers
* @cq_ev_fd: whether eventfs registered or not
* @ret: return code
*
* Allows to trace fixed files/buffers/eventfds, that could be registered to
* Allows to trace fixed files/buffers, that could be registered to
* avoid an overhead of getting references to them for every operation. This
* event, together with io_uring_file_get, can provide a full picture of how
* much overhead one can reduce via fixing.
@ -68,17 +67,16 @@ TRACE_EVENT(io_uring_create,
TRACE_EVENT(io_uring_register,
TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files,
unsigned nr_bufs, bool eventfd, long ret),
unsigned nr_bufs, long ret),
TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret),
TP_ARGS(ctx, opcode, nr_files, nr_bufs, ret),
TP_STRUCT__entry (
__field( void *, ctx )
__field( unsigned, opcode )
__field( unsigned, nr_files )
__field( unsigned, nr_bufs )
__field( bool, eventfd )
__field( long, ret )
__field( void *, ctx )
__field( unsigned, opcode )
__field( unsigned, nr_files)
__field( unsigned, nr_bufs )
__field( long, ret )
),
TP_fast_assign(
@ -86,20 +84,21 @@ TRACE_EVENT(io_uring_register,
__entry->opcode = opcode;
__entry->nr_files = nr_files;
__entry->nr_bufs = nr_bufs;
__entry->eventfd = eventfd;
__entry->ret = ret;
),
TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, "
"eventfd %d, ret %ld",
"ret %ld",
__entry->ctx, __entry->opcode, __entry->nr_files,
__entry->nr_bufs, __entry->eventfd, __entry->ret)
__entry->nr_bufs, __entry->ret)
);
/**
* io_uring_file_get - called before getting references to an SQE file
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @user_data: user data associated with the request
* @fd: SQE file descriptor
*
* Allows to trace out how often an SQE file reference is obtained, which can
@ -108,59 +107,71 @@ TRACE_EVENT(io_uring_register,
*/
TRACE_EVENT(io_uring_file_get,
TP_PROTO(void *ctx, int fd),
TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd),
TP_ARGS(ctx, fd),
TP_ARGS(ctx, req, user_data, fd),
TP_STRUCT__entry (
__field( void *, ctx )
__field( int, fd )
__field( void *, ctx )
__field( void *, req )
__field( u64, user_data )
__field( int, fd )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->fd = fd;
),
TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd)
TP_printk("ring %p, req %p, user_data 0x%llx, fd %d",
__entry->ctx, __entry->req, __entry->user_data, __entry->fd)
);
/**
* io_uring_queue_async_work - called before submitting a new async work
*
* @ctx: pointer to a ring context structure
* @hashed: type of workqueue, hashed or normal
* @req: pointer to a submitted request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @flags request flags
* @work: pointer to a submitted io_wq_work
* @rw: type of workqueue, hashed or normal
*
* Allows to trace asynchronous work submission.
*/
TRACE_EVENT(io_uring_queue_async_work,
TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work,
unsigned int flags),
TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode,
unsigned int flags, struct io_wq_work *work, int rw),
TP_ARGS(ctx, rw, req, work, flags),
TP_ARGS(ctx, req, user_data, flags, opcode, work, rw),
TP_STRUCT__entry (
__field( void *, ctx )
__field( int, rw )
__field( void *, req )
__field( struct io_wq_work *, work )
__field( unsigned int, flags )
__field( void *, ctx )
__field( void *, req )
__field( u64, user_data )
__field( u8, opcode )
__field( unsigned int, flags )
__field( struct io_wq_work *, work )
__field( int, rw )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->rw = rw;
__entry->req = req;
__entry->work = work;
__entry->flags = flags;
__entry->ctx = ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->flags = flags;
__entry->opcode = opcode;
__entry->work = work;
__entry->rw = rw;
),
TP_printk("ring %p, request %p, flags %d, %s queue, work %p",
__entry->ctx, __entry->req, __entry->flags,
__entry->rw ? "hashed" : "normal", __entry->work)
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, flags 0x%x, %s queue, work %p",
__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
);
/**
@ -169,30 +180,33 @@ TRACE_EVENT(io_uring_queue_async_work,
* @ctx: pointer to a ring context structure
* @req: pointer to a deferred request
* @user_data: user data associated with the request
* @opcode: opcode of request
*
* Allows to track deferred requests, to get an insight about what requests are
* not started immediately.
*/
TRACE_EVENT(io_uring_defer,
TP_PROTO(void *ctx, void *req, unsigned long long user_data),
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode),
TP_ARGS(ctx, req, user_data),
TP_ARGS(ctx, req, user_data, opcode),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( unsigned long long, data )
__field( void *, ctx )
__field( void *, req )
__field( unsigned long long, data )
__field( u8, opcode )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->data = user_data;
__entry->opcode = opcode;
),
TP_printk("ring %p, request %p user_data %llu", __entry->ctx,
__entry->req, __entry->data)
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d",
__entry->ctx, __entry->req, __entry->data, __entry->opcode)
);
/**
@ -250,7 +264,7 @@ TRACE_EVENT(io_uring_cqring_wait,
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = ctx;
__entry->min_events = min_events;
),
@ -260,7 +274,10 @@ TRACE_EVENT(io_uring_cqring_wait,
/**
* io_uring_fail_link - called before failing a linked request
*
* @ctx: pointer to a ring context structure
* @req: request, which links were cancelled
* @user_data: user data associated with the request
* @opcode: opcode of request
* @link: cancelled link
*
* Allows to track linked requests cancellation, to see not only that some work
@ -268,27 +285,36 @@ TRACE_EVENT(io_uring_cqring_wait,
*/
TRACE_EVENT(io_uring_fail_link,
TP_PROTO(void *req, void *link),
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link),
TP_ARGS(req, link),
TP_ARGS(ctx, req, user_data, opcode, link),
TP_STRUCT__entry (
__field( void *, req )
__field( void *, link )
__field( void *, ctx )
__field( void *, req )
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( void *, link )
),
TP_fast_assign(
__entry->req = req;
__entry->link = link;
__entry->ctx = ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->link = link;
),
TP_printk("request %p, link %p", __entry->req, __entry->link)
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, link %p",
__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
__entry->link)
);
/**
* io_uring_complete - called when completing an SQE
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @user_data: user data associated with the request
* @res: result of the request
* @cflags: completion flags
@ -296,12 +322,13 @@ TRACE_EVENT(io_uring_fail_link,
*/
TRACE_EVENT(io_uring_complete,
TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags),
TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags),
TP_ARGS(ctx, user_data, res, cflags),
TP_ARGS(ctx, req, user_data, res, cflags),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u64, user_data )
__field( int, res )
__field( unsigned, cflags )
@ -309,14 +336,16 @@ TRACE_EVENT(io_uring_complete,
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->res = res;
__entry->cflags = cflags;
),
TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x",
__entry->ctx, (unsigned long long)__entry->user_data,
__entry->res, __entry->cflags)
TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x",
__entry->ctx, __entry->req,
__entry->user_data,
__entry->res, __entry->cflags)
);
/**
@ -324,8 +353,8 @@ TRACE_EVENT(io_uring_complete,
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @opcode: opcode of request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @flags request flags
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
@ -335,34 +364,34 @@ TRACE_EVENT(io_uring_complete,
*/
TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, u32 flags,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags,
bool force_nonblock, bool sq_thread),
TP_ARGS(ctx, req, opcode, user_data, flags, force_nonblock, sq_thread),
TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u8, opcode )
__field( u64, user_data )
__field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread )
__field( void *, ctx )
__field( void *, req )
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->flags = flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
),
TP_printk("ring %p, req %p, op %d, data 0x%llx, flags %u, "
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, flags 0x%x, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
__entry->opcode, (unsigned long long)__entry->user_data,
__entry->user_data, __entry->opcode,
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
);
@ -371,8 +400,8 @@ TRACE_EVENT(io_uring_submit_sqe,
*
* @ctx: pointer to a ring context structure
* @req: pointer to the armed request
* @opcode: opcode of request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @mask: request poll events mask
* @events: registered events of interest
*
@ -381,155 +410,110 @@ TRACE_EVENT(io_uring_submit_sqe,
*/
TRACE_EVENT(io_uring_poll_arm,
TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data,
TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode,
int mask, int events),
TP_ARGS(ctx, req, opcode, user_data, mask, events),
TP_ARGS(ctx, req, user_data, opcode, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
__field( int, events )
__field( void *, ctx )
__field( void *, req )
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( int, mask )
__field( int, events )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->mask = mask;
__entry->events = events;
),
TP_printk("ring %p, req %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
__entry->ctx, __entry->req, __entry->opcode,
(unsigned long long) __entry->user_data,
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask 0x%x, events 0x%x",
__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
__entry->mask, __entry->events)
);
TRACE_EVENT(io_uring_poll_wake,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
TRACE_EVENT(io_uring_task_add,
TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),
TP_ARGS(ctx, opcode, user_data, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->mask = mask;
),
TP_printk("ring %p, op %d, data 0x%llx, mask %x",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->mask)
);
/*
* io_uring_task_run - called when task_work_run() executes the poll events
* notification callbacks
* io_uring_task_add - called after adding a task
*
* @ctx: pointer to a ring context structure
* @req: pointer to the armed request
* @opcode: opcode of request
* @req: pointer to request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @mask: request poll events mask
*
* Allows to track when notified poll events are processed
*/
TRACE_EVENT(io_uring_task_run,
TRACE_EVENT(io_uring_task_add,
TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data),
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask),
TP_ARGS(ctx, req, opcode, user_data),
TP_ARGS(ctx, req, user_data, opcode, mask),
TP_STRUCT__entry (
__field( void *, ctx )
__field( void *, req )
__field( u8, opcode )
__field( u64, user_data )
__field( void *, ctx )
__field( void *, req )
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( int, mask )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->mask = mask;
),
TP_printk("ring %p, req %p, op %d, data 0x%llx",
__entry->ctx, __entry->req, __entry->opcode,
(unsigned long long) __entry->user_data)
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask %x",
__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
__entry->mask)
);
/*
* io_uring_req_failed - called when an sqe is errored dring submission
*
* @sqe: pointer to the io_uring_sqe that failed
* @ctx: pointer to a ring context structure
* @req: pointer to request
* @error: error it failed with
*
* Allows easier diagnosing of malformed requests in production systems.
*/
TRACE_EVENT(io_uring_req_failed,
TP_PROTO(const struct io_uring_sqe *sqe, int error),
TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error),
TP_ARGS(sqe, error),
TP_ARGS(sqe, ctx, req, error),
TP_STRUCT__entry (
__field( u8, opcode )
__field( u8, flags )
__field( u8, ioprio )
__field( u64, off )
__field( u64, addr )
__field( u32, len )
__field( u32, op_flags )
__field( u64, user_data )
__field( u16, buf_index )
__field( u16, personality )
__field( u32, file_index )
__field( u64, pad1 )
__field( u64, pad2 )
__field( int, error )
__field( void *, ctx )
__field( void *, req )
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( u8, flags )
__field( u8, ioprio )
__field( u64, off )
__field( u64, addr )
__field( u32, len )
__field( u32, op_flags )
__field( u16, buf_index )
__field( u16, personality )
__field( u32, file_index )
__field( u64, pad1 )
__field( u64, pad2 )
__field( int, error )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
__entry->user_data = sqe->user_data;
__entry->opcode = sqe->opcode;
__entry->flags = sqe->flags;
__entry->ioprio = sqe->ioprio;
@ -537,7 +521,6 @@ TRACE_EVENT(io_uring_req_failed,
__entry->addr = sqe->addr;
__entry->len = sqe->len;
__entry->op_flags = sqe->rw_flags;
__entry->user_data = sqe->user_data;
__entry->buf_index = sqe->buf_index;
__entry->personality = sqe->personality;
__entry->file_index = sqe->file_index;
@ -546,13 +529,15 @@ TRACE_EVENT(io_uring_req_failed,
__entry->error = error;
),
TP_printk("op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, "
"len=%u, rw_flags=0x%x, user_data=0x%llx, buf_index=%d, "
TP_printk("ring %p, req %p, user_data 0x%llx, "
"op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
"len=%u, rw_flags=0x%x, buf_index=%d, "
"personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
__entry->ctx, __entry->req, __entry->user_data,
__entry->opcode, __entry->flags, __entry->ioprio,
(unsigned long long)__entry->off,
(unsigned long long) __entry->addr, __entry->len,
__entry->op_flags, (unsigned long long) __entry->user_data,
__entry->op_flags,
__entry->buf_index, __entry->personality, __entry->file_index,
(unsigned long long) __entry->pad1,
(unsigned long long) __entry->pad2, __entry->error)
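
For reference, the reworked io_uring tracepoints above are consumed through the normal tracefs event interface; a hedged sketch, assuming tracefs is mounted at /sys/kernel/tracing (older setups use /sys/kernel/debug/tracing) and the caller has the privileges to write there:

/*
 * Sketch: enable every event in the io_uring trace group (including
 * io_uring_submit_sqe and io_uring_complete shown above) and stream
 * the formatted records from trace_pipe.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define TRACEFS "/sys/kernel/tracing"

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open(TRACEFS "/events/io_uring/enable", O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror("enable io_uring events");
		return 1;
	}
	close(fd);

	fd = open(TRACEFS "/trace_pipe", O_RDONLY);
	if (fd < 0) {
		perror("trace_pipe");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}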

include/uapi/linux/io_uring.h

@ -101,6 +101,7 @@ enum {
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */
enum {
IORING_OP_NOP,
@ -143,6 +144,7 @@ enum {
IORING_OP_MKDIRAT,
IORING_OP_SYMLINKAT,
IORING_OP_LINKAT,
IORING_OP_MSG_RING,
/* this goes last, obviously */
IORING_OP_LAST,
@ -199,9 +201,11 @@ struct io_uring_cqe {
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
* IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
* IORING_CQE_F_MSG If set, CQE was generated with IORING_OP_MSG_RING
*/
#define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1)
#define IORING_CQE_F_MSG (1U << 2)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
@ -257,10 +261,11 @@ struct io_cqring_offsets {
/*
* io_uring_enter(2) flags
*/
#define IORING_ENTER_GETEVENTS (1U << 0)
#define IORING_ENTER_SQ_WAKEUP (1U << 1)
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_GETEVENTS (1U << 0)
#define IORING_ENTER_SQ_WAKEUP (1U << 1)
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@ -325,6 +330,10 @@ enum {
/* set/get max number of io-wq workers */
IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
/* register/unregister io_uring fd with the ring */
IORING_REGISTER_RING_FDS = 20,
IORING_UNREGISTER_RING_FDS = 21,
/* this goes last */
IORING_REGISTER_LAST
};
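
A hedged sketch of the IORING_OP_MSG_RING / IORING_CQE_F_MSG additions above, assuming a liburing that already wraps the opcode as io_uring_prep_msg_ring(); the other new constants follow the existing patterns (IORING_SETUP_SUBMIT_ALL is ORed into the io_uring_queue_init() flags, IORING_ENTER_REGISTERED_RING into the enter flags):

/*
 * Sketch: post a message CQE from one ring into another with
 * IORING_OP_MSG_RING and observe IORING_CQE_F_MSG on the target.
 * Error handling is abbreviated and names are illustrative.
 */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring src, dst;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &src, 0) < 0 ||
	    io_uring_queue_init(8, &dst, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&src);
	/* len lands in cqe->res, 0xcafe in cqe->user_data on the target */
	io_uring_prep_msg_ring(sqe, dst.ring_fd, 42, 0xcafe, 0);
	io_uring_submit(&src);

	if (io_uring_wait_cqe(&dst, &cqe) == 0) {
		printf("msg: res=%d data=0x%llx msg=%d\n", cqe->res,
		       (unsigned long long)cqe->user_data,
		       !!(cqe->flags & IORING_CQE_F_MSG));
		io_uring_cqe_seen(&dst, cqe);
	}

	io_uring_queue_exit(&src);
	io_uring_queue_exit(&dst);
	return 0;
}

The sender also gets a normal completion for the MSG_RING request itself, while the target ring receives the message CQE without ever having submitted anything.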