io_uring/kbuf: add helpers for getting/peeking multiple buffers

Our provided buffer interface only allows selection of a single buffer.
Add an API that allows getting/peeking multiple buffers at the same time.

This is only implemented for the ring provided buffers. It could be added
for the legacy provided buffers as well, but since it's strongly
encouraged to use the new interface, let's keep it simpler and just
provide it for the new API. The legacy interface will always just select
a single buffer.

There are two new main functions:

io_buffers_select(), which selects up as many buffers as it can. The
caller supplies the iovec array, and io_buffers_select() may allocate a
bigger array if the 'out_len' being passed in is non-zero and bigger
than what fits in the provided iovec. Buffers grabbed with this helper
are permanently assigned.

io_buffers_peek(), which works like io_buffers_select(), except they can
be recycled, if needed. Callers using either of these functions should
call io_put_kbufs() rather than io_put_kbuf() at completion time. The
peek interface must be called with the ctx locked from peek to
completion.

This add a bit state for the request:

- REQ_F_BUFFERS_COMMIT, which means that the the buffers have been
  peeked and should be committed to the buffer ring head when they are
  put as part of completion. Prior to this, req->buf_list was cleared to
  NULL when committed.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Jens Axboe 2024-03-05 07:31:52 -07:00
parent ac5f71a3d9
commit 35c8711c8f
3 changed files with 201 additions and 12 deletions

View File

@ -472,6 +472,7 @@ enum {
REQ_F_CAN_POLL_BIT,
REQ_F_BL_EMPTY_BIT,
REQ_F_BL_NO_RECYCLE_BIT,
REQ_F_BUFFERS_COMMIT_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@ -550,6 +551,8 @@ enum {
REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
/* don't recycle provided buffers for this request */
REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
/* buffer ring head needs incrementing on put */
REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);

View File

@ -117,6 +117,27 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
return NULL;
}
static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl,
struct iovec *iov)
{
void __user *buf;
buf = io_provided_buffer_select(req, len, bl);
if (unlikely(!buf))
return -ENOBUFS;
iov[0].iov_base = buf;
iov[0].iov_len = *len;
return 0;
}
static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br,
__u16 head, __u16 mask)
{
return &br->bufs[head & mask];
}
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl,
unsigned int issue_flags)
@ -132,11 +153,10 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
if (head + 1 == tail)
req->flags |= REQ_F_BL_EMPTY;
head &= bl->mask;
buf = &br->bufs[head];
buf = io_ring_head_to_buf(br, head, bl->mask);
if (*len == 0 || *len > buf->len)
*len = buf->len;
req->flags |= REQ_F_BUFFER_RING;
req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
req->buf_list = bl;
req->buf_index = buf->bid;
@ -151,6 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
* the transfer completes (or if we get -EAGAIN and must poll of
* retry).
*/
req->flags &= ~REQ_F_BUFFERS_COMMIT;
req->buf_list = NULL;
bl->head++;
}
@ -177,6 +198,136 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
return ret;
}
/* cap it at a reasonable 256, will be one page even for 4K */
#define PEEK_MAX_IMPORT 256
static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
struct io_buffer_list *bl)
{
struct io_uring_buf_ring *br = bl->buf_ring;
struct iovec *iov = arg->iovs;
int nr_iovs = arg->nr_iovs;
__u16 nr_avail, tail, head;
struct io_uring_buf *buf;
tail = smp_load_acquire(&br->tail);
head = bl->head;
nr_avail = min_t(__u16, tail - head, UIO_MAXIOV);
if (unlikely(!nr_avail))
return -ENOBUFS;
buf = io_ring_head_to_buf(br, head, bl->mask);
if (arg->max_len) {
int needed;
needed = (arg->max_len + buf->len - 1) / buf->len;
needed = min(needed, PEEK_MAX_IMPORT);
if (nr_avail > needed)
nr_avail = needed;
}
/*
* only alloc a bigger array if we know we have data to map, eg not
* a speculative peek operation.
*/
if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
if (unlikely(!iov))
return -ENOMEM;
if (arg->mode & KBUF_MODE_FREE)
kfree(arg->iovs);
arg->iovs = iov;
nr_iovs = nr_avail;
} else if (nr_avail < nr_iovs) {
nr_iovs = nr_avail;
}
/* set it to max, if not set, so we can use it unconditionally */
if (!arg->max_len)
arg->max_len = INT_MAX;
req->buf_index = buf->bid;
do {
/* truncate end piece, if needed */
if (buf->len > arg->max_len)
buf->len = arg->max_len;
iov->iov_base = u64_to_user_ptr(buf->addr);
iov->iov_len = buf->len;
iov++;
arg->out_len += buf->len;
arg->max_len -= buf->len;
if (!arg->max_len)
break;
buf = io_ring_head_to_buf(br, ++head, bl->mask);
} while (--nr_iovs);
if (head == tail)
req->flags |= REQ_F_BL_EMPTY;
req->flags |= REQ_F_BUFFER_RING;
req->buf_list = bl;
return iov - arg->iovs;
}
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = -ENOENT;
io_ring_submit_lock(ctx, issue_flags);
bl = io_buffer_get_list(ctx, req->buf_index);
if (unlikely(!bl))
goto out_unlock;
if (bl->is_buf_ring) {
ret = io_ring_buffers_peek(req, arg, bl);
/*
* Don't recycle these buffers if we need to go through poll.
* Nobody else can use them anyway, and holding on to provided
* buffers for a send/write operation would happen on the app
* side anyway with normal buffers. Besides, we already
* committed them, they cannot be put back in the queue.
*/
if (ret > 0) {
req->flags |= REQ_F_BL_NO_RECYCLE;
req->buf_list->head += ret;
}
} else {
ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
}
out_unlock:
io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret;
lockdep_assert_held(&ctx->uring_lock);
bl = io_buffer_get_list(ctx, req->buf_index);
if (unlikely(!bl))
return -ENOENT;
if (bl->is_buf_ring) {
ret = io_ring_buffers_peek(req, arg, bl);
if (ret > 0)
req->flags |= REQ_F_BUFFERS_COMMIT;
return ret;
}
/* don't support multiple buffer selections for legacy */
return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
}
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{

View File

@ -41,8 +41,26 @@ struct io_buffer {
__u16 bgid;
};
enum {
/* can alloc a bigger vec */
KBUF_MODE_EXPAND = 1,
/* if bigger vec allocated, free old one */
KBUF_MODE_FREE = 2,
};
struct buf_sel_arg {
struct iovec *iovs;
size_t out_len;
size_t max_len;
int nr_iovs;
int mode;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags);
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
unsigned int issue_flags);
int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg);
void io_destroy_buffers(struct io_ring_ctx *ctx);
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
@ -75,7 +93,7 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
*/
if (req->buf_list) {
req->buf_index = req->buf_list->bgid;
req->flags &= ~REQ_F_BUFFER_RING;
req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
return true;
}
return false;
@ -99,11 +117,16 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
return false;
}
static inline void __io_put_kbuf_ring(struct io_kiocb *req)
static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr)
{
if (req->buf_list) {
req->buf_index = req->buf_list->bgid;
req->buf_list->head++;
struct io_buffer_list *bl = req->buf_list;
if (bl) {
if (req->flags & REQ_F_BUFFERS_COMMIT) {
bl->head += nr;
req->flags &= ~REQ_F_BUFFERS_COMMIT;
}
req->buf_index = bl->bgid;
}
req->flags &= ~REQ_F_BUFFER_RING;
}
@ -112,7 +135,7 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req,
struct list_head *list)
{
if (req->flags & REQ_F_BUFFER_RING) {
__io_put_kbuf_ring(req);
__io_put_kbuf_ring(req, 1);
} else {
req->buf_index = req->kbuf->bgid;
list_add(&req->kbuf->list, list);
@ -130,8 +153,8 @@ static inline void io_kbuf_drop(struct io_kiocb *req)
__io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
}
static inline unsigned int io_put_kbuf(struct io_kiocb *req,
unsigned issue_flags)
static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
unsigned issue_flags)
{
unsigned int ret;
@ -140,9 +163,21 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
if (req->flags & REQ_F_BUFFER_RING)
__io_put_kbuf_ring(req);
__io_put_kbuf_ring(req, nbufs);
else
__io_put_kbuf(req, issue_flags);
return ret;
}
static inline unsigned int io_put_kbuf(struct io_kiocb *req,
unsigned issue_flags)
{
return __io_put_kbufs(req, 1, issue_flags);
}
static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs,
unsigned issue_flags)
{
return __io_put_kbufs(req, nbufs, issue_flags);
}
#endif