linux/io_uring/rsrc.c
Jens Axboe 6af82f7614 io_uring/rsrc: encode node type and ctx together
Rather than keep the type field separate rom ctx, use the fact that we
can encode up to 4 types of nodes in the LSB of the ctx pointer. Doesn't
reclaim any space right now on 64-bit archs, but it leaves a full int
for future use.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-06 13:54:15 -07:00

1083 lines
25 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"
struct io_rsrc_update {
struct file *file;
u64 arg;
u32 nr_args;
u32 offset;
};
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
struct iovec *iov, struct page **last_hpage);
/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
unsigned long page_limit, cur_pages, new_pages;
if (!nr_pages)
return 0;
/* Don't allow more pages than we can safely lock */
page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
cur_pages = atomic_long_read(&user->locked_vm);
do {
new_pages = cur_pages + nr_pages;
if (new_pages > page_limit)
return -ENOMEM;
} while (!atomic_long_try_cmpxchg(&user->locked_vm,
&cur_pages, new_pages));
return 0;
}
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
if (ctx->user)
__io_unaccount_mem(ctx->user, nr_pages);
if (ctx->mm_account)
atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
int ret;
if (ctx->user) {
ret = __io_account_mem(ctx->user, nr_pages);
if (ret)
return ret;
}
if (ctx->mm_account)
atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
return 0;
}
static int io_buffer_validate(struct iovec *iov)
{
unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
/*
* Don't impose further limits on the size and buffer
* constraints here, we'll -EINVAL later when IO is
* submitted if they are wrong.
*/
if (!iov->iov_base)
return iov->iov_len ? -EFAULT : 0;
if (!iov->iov_len)
return -EFAULT;
/* arbitrary limit, but we need something */
if (iov->iov_len > SZ_1G)
return -EFAULT;
if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
return -EOVERFLOW;
return 0;
}
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
unsigned int i;
if (node->buf) {
struct io_mapped_ubuf *imu = node->buf;
if (!refcount_dec_and_test(&imu->refs))
return;
for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
kvfree(imu);
}
}
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
struct io_rsrc_node *node;
node = kzalloc(sizeof(*node), GFP_KERNEL);
if (node) {
node->ctx_ptr = (unsigned long) ctx | type;
node->refs = 1;
}
return node;
}
__cold void io_rsrc_data_free(struct io_rsrc_data *data)
{
if (!data->nr)
return;
while (data->nr--) {
if (data->nodes[data->nr])
io_put_rsrc_node(data->nodes[data->nr]);
}
kvfree(data->nodes);
data->nodes = NULL;
data->nr = 0;
}
__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (data->nodes) {
data->nr = nr;
return 0;
}
return -ENOMEM;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up,
unsigned nr_args)
{
u64 __user *tags = u64_to_user_ptr(up->tags);
__s32 __user *fds = u64_to_user_ptr(up->data);
int fd, i, err = 0;
unsigned int done;
if (!ctx->file_table.data.nr)
return -ENXIO;
if (up->offset + nr_args > ctx->file_table.data.nr)
return -EINVAL;
for (done = 0; done < nr_args; done++) {
u64 tag = 0;
if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
copy_from_user(&fd, &fds[done], sizeof(fd))) {
err = -EFAULT;
break;
}
if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
err = -EINVAL;
break;
}
if (fd == IORING_REGISTER_FILES_SKIP)
continue;
i = up->offset + done;
if (io_reset_rsrc_node(&ctx->file_table.data, i))
io_file_bitmap_clear(&ctx->file_table, i);
if (fd != -1) {
struct file *file = fget(fd);
struct io_rsrc_node *node;
if (!file) {
err = -EBADF;
break;
}
/*
* Don't allow io_uring instances to be registered.
*/
if (io_is_uring_fops(file)) {
fput(file);
err = -EBADF;
break;
}
node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node) {
err = -ENOMEM;
fput(file);
break;
}
ctx->file_table.data.nodes[i] = node;
if (tag)
node->tag = tag;
io_fixed_file_set(node, file);
io_file_bitmap_set(&ctx->file_table, i);
}
}
return done ? done : err;
}
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up,
unsigned int nr_args)
{
u64 __user *tags = u64_to_user_ptr(up->tags);
struct iovec fast_iov, *iov;
struct page *last_hpage = NULL;
struct iovec __user *uvec;
u64 user_data = up->data;
__u32 done;
int i, err;
if (!ctx->buf_table.nr)
return -ENXIO;
if (up->offset + nr_args > ctx->buf_table.nr)
return -EINVAL;
for (done = 0; done < nr_args; done++) {
struct io_rsrc_node *node;
u64 tag = 0;
uvec = u64_to_user_ptr(user_data);
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
if (IS_ERR(iov)) {
err = PTR_ERR(iov);
break;
}
if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
err = -EFAULT;
break;
}
err = io_buffer_validate(iov);
if (err)
break;
node = io_sqe_buffer_register(ctx, iov, &last_hpage);
if (IS_ERR(node)) {
err = PTR_ERR(node);
break;
}
if (tag) {
if (!node) {
err = -EINVAL;
break;
}
node->tag = tag;
}
i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
io_reset_rsrc_node(&ctx->buf_table, i);
ctx->buf_table.nodes[i] = node;
if (ctx->compat)
user_data += sizeof(struct compat_iovec);
else
user_data += sizeof(struct iovec);
}
return done ? done : err;
}
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
struct io_uring_rsrc_update2 *up,
unsigned nr_args)
{
__u32 tmp;
lockdep_assert_held(&ctx->uring_lock);
if (check_add_overflow(up->offset, nr_args, &tmp))
return -EOVERFLOW;
switch (type) {
case IORING_RSRC_FILE:
return __io_sqe_files_update(ctx, up, nr_args);
case IORING_RSRC_BUFFER:
return __io_sqe_buffers_update(ctx, up, nr_args);
}
return -EINVAL;
}
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
struct io_uring_rsrc_update2 up;
if (!nr_args)
return -EINVAL;
memset(&up, 0, sizeof(up));
if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
return -EFAULT;
if (up.resv || up.resv2)
return -EINVAL;
return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned size, unsigned type)
{
struct io_uring_rsrc_update2 up;
if (size != sizeof(up))
return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
if (!up.nr || up.resv || up.resv2)
return -EINVAL;
return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type)
{
struct io_uring_rsrc_register rr;
/* keep it extendible */
if (size != sizeof(rr))
return -EINVAL;
memset(&rr, 0, sizeof(rr));
if (copy_from_user(&rr, arg, size))
return -EFAULT;
if (!rr.nr || rr.resv2)
return -EINVAL;
if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
return -EINVAL;
switch (type) {
case IORING_RSRC_FILE:
if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
break;
return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
case IORING_RSRC_BUFFER:
if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
break;
return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
}
return -EINVAL;
}
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
up->offset = READ_ONCE(sqe->off);
up->nr_args = READ_ONCE(sqe->len);
if (!up->nr_args)
return -EINVAL;
up->arg = READ_ONCE(sqe->addr);
return 0;
}
static int io_files_update_with_index_alloc(struct io_kiocb *req,
unsigned int issue_flags)
{
struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
__s32 __user *fds = u64_to_user_ptr(up->arg);
unsigned int done;
struct file *file;
int ret, fd;
if (!req->ctx->file_table.data.nr)
return -ENXIO;
for (done = 0; done < up->nr_args; done++) {
if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
ret = -EFAULT;
break;
}
file = fget(fd);
if (!file) {
ret = -EBADF;
break;
}
ret = io_fixed_fd_install(req, issue_flags, file,
IORING_FILE_INDEX_ALLOC);
if (ret < 0)
break;
if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
__io_close_fixed(req->ctx, issue_flags, ret);
ret = -EFAULT;
break;
}
}
if (done)
return done;
return ret;
}
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_rsrc_update2 up2;
int ret;
up2.offset = up->offset;
up2.data = up->arg;
up2.nr = 0;
up2.tags = 0;
up2.resv = 0;
up2.resv2 = 0;
if (up->offset == IORING_FILE_INDEX_ALLOC) {
ret = io_files_update_with_index_alloc(req, issue_flags);
} else {
io_ring_submit_lock(ctx, issue_flags);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
&up2, up->nr_args);
io_ring_submit_unlock(ctx, issue_flags);
}
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_free_rsrc_node(struct io_rsrc_node *node)
{
struct io_ring_ctx *ctx = io_rsrc_node_ctx(node);
lockdep_assert_held(&ctx->uring_lock);
if (node->tag)
io_post_aux_cqe(ctx, node->tag, 0, 0);
switch (io_rsrc_node_type(node)) {
case IORING_RSRC_FILE:
if (io_slot_file(node))
fput(io_slot_file(node));
break;
case IORING_RSRC_BUFFER:
if (node->buf)
io_buffer_unmap(ctx, node);
break;
default:
WARN_ON_ONCE(1);
break;
}
kfree(node);
}
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
if (!ctx->file_table.data.nr)
return -ENXIO;
io_free_file_tables(&ctx->file_table);
io_file_table_set_alloc_range(ctx, 0, 0);
return 0;
}
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags)
{
__s32 __user *fds = (__s32 __user *) arg;
struct file *file;
int fd, ret;
unsigned i;
if (ctx->file_table.data.nr)
return -EBUSY;
if (!nr_args)
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
if (nr_args > rlimit(RLIMIT_NOFILE))
return -EMFILE;
if (!io_alloc_file_tables(&ctx->file_table, nr_args))
return -ENOMEM;
for (i = 0; i < nr_args; i++) {
struct io_rsrc_node *node;
u64 tag = 0;
ret = -EFAULT;
if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
goto fail;
if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
goto fail;
/* allow sparse sets */
if (!fds || fd == -1) {
ret = -EINVAL;
if (tag)
goto fail;
continue;
}
file = fget(fd);
ret = -EBADF;
if (unlikely(!file))
goto fail;
/*
* Don't allow io_uring instances to be registered.
*/
if (io_is_uring_fops(file)) {
fput(file);
goto fail;
}
ret = -ENOMEM;
node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node) {
fput(file);
goto fail;
}
if (tag)
node->tag = tag;
ctx->file_table.data.nodes[i] = node;
io_fixed_file_set(node, file);
io_file_bitmap_set(&ctx->file_table, i);
}
/* default it to the whole table */
io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
return 0;
fail:
io_sqe_files_unregister(ctx);
return ret;
}
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
if (!ctx->buf_table.nr)
return -ENXIO;
io_rsrc_data_free(&ctx->buf_table);
return 0;
}
/*
* Not super efficient, but this is just a registration time. And we do cache
* the last compound head, so generally we'll only do a full search if we don't
* match that one.
*
* We check if the given compound head page has already been accounted, to
* avoid double accounting it. This allows us to account the full size of the
* page, not just the constituent pages of a huge page.
*/
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
int nr_pages, struct page *hpage)
{
int i, j;
/* check current page array */
for (i = 0; i < nr_pages; i++) {
if (!PageCompound(pages[i]))
continue;
if (compound_head(pages[i]) == hpage)
return true;
}
/* check previously registered pages */
for (i = 0; i < ctx->buf_table.nr; i++) {
struct io_rsrc_node *node = ctx->buf_table.nodes[i];
struct io_mapped_ubuf *imu;
if (!node)
continue;
imu = node->buf;
for (j = 0; j < imu->nr_bvecs; j++) {
if (!PageCompound(imu->bvec[j].bv_page))
continue;
if (compound_head(imu->bvec[j].bv_page) == hpage)
return true;
}
}
return false;
}
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
int nr_pages, struct io_mapped_ubuf *imu,
struct page **last_hpage)
{
int i, ret;
imu->acct_pages = 0;
for (i = 0; i < nr_pages; i++) {
if (!PageCompound(pages[i])) {
imu->acct_pages++;
} else {
struct page *hpage;
hpage = compound_head(pages[i]);
if (hpage == *last_hpage)
continue;
*last_hpage = hpage;
if (headpage_already_acct(ctx, pages, i, hpage))
continue;
imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
}
}
if (!imu->acct_pages)
return 0;
ret = io_account_mem(ctx, imu->acct_pages);
if (ret)
imu->acct_pages = 0;
return ret;
}
static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data, int nr_folios)
{
struct page **page_array = *pages, **new_array = NULL;
int nr_pages_left = *nr_pages, i, j;
/* Store head pages only*/
new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
GFP_KERNEL);
if (!new_array)
return false;
new_array[0] = compound_head(page_array[0]);
/*
* The pages are bound to the folio, it doesn't
* actually unpin them but drops all but one reference,
* which is usually put down by io_buffer_unmap().
* Note, needs a better helper.
*/
if (data->nr_pages_head > 1)
unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
j = data->nr_pages_head;
nr_pages_left -= data->nr_pages_head;
for (i = 1; i < nr_folios; i++) {
unsigned int nr_unpin;
new_array[i] = page_array[j];
nr_unpin = min_t(unsigned int, nr_pages_left - 1,
data->nr_pages_mid - 1);
if (nr_unpin)
unpin_user_pages(&page_array[j+1], nr_unpin);
j += data->nr_pages_mid;
nr_pages_left -= data->nr_pages_mid;
}
kvfree(page_array);
*pages = new_array;
*nr_pages = nr_folios;
return true;
}
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data)
{
struct page **page_array = *pages;
struct folio *folio = page_folio(page_array[0]);
unsigned int count = 1, nr_folios = 1;
int i;
if (*nr_pages <= 1)
return false;
data->nr_pages_mid = folio_nr_pages(folio);
if (data->nr_pages_mid == 1)
return false;
data->folio_shift = folio_shift(folio);
/*
* Check if pages are contiguous inside a folio, and all folios have
* the same page count except for the head and tail.
*/
for (i = 1; i < *nr_pages; i++) {
if (page_folio(page_array[i]) == folio &&
page_array[i] == page_array[i-1] + 1) {
count++;
continue;
}
if (nr_folios == 1) {
if (folio_page_idx(folio, page_array[i-1]) !=
data->nr_pages_mid - 1)
return false;
data->nr_pages_head = count;
} else if (count != data->nr_pages_mid) {
return false;
}
folio = page_folio(page_array[i]);
if (folio_size(folio) != (1UL << data->folio_shift) ||
folio_page_idx(folio, page_array[i]) != 0)
return false;
count = 1;
nr_folios++;
}
if (nr_folios == 1)
data->nr_pages_head = count;
return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
}
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
struct iovec *iov,
struct page **last_hpage)
{
struct io_mapped_ubuf *imu = NULL;
struct page **pages = NULL;
struct io_rsrc_node *node;
unsigned long off;
size_t size;
int ret, nr_pages, i;
struct io_imu_folio_data data;
bool coalesced;
if (!iov->iov_base)
return NULL;
node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!node)
return ERR_PTR(-ENOMEM);
node->buf = NULL;
ret = -ENOMEM;
pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
&nr_pages);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
pages = NULL;
goto done;
}
/* If it's huge page(s), try to coalesce them into fewer bvec entries */
coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
if (!imu)
goto done;
ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
if (ret) {
unpin_user_pages(pages, nr_pages);
goto done;
}
size = iov->iov_len;
/* store original address for later verification */
imu->ubuf = (unsigned long) iov->iov_base;
imu->len = iov->iov_len;
imu->nr_bvecs = nr_pages;
imu->folio_shift = PAGE_SHIFT;
if (coalesced)
imu->folio_shift = data.folio_shift;
refcount_set(&imu->refs, 1);
off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
node->buf = imu;
ret = 0;
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
off = 0;
size -= vec_len;
}
done:
if (ret) {
kvfree(imu);
if (node)
io_put_rsrc_node(node);
node = ERR_PTR(ret);
}
kvfree(pages);
return node;
}
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags)
{
struct page *last_hpage = NULL;
struct io_rsrc_data data;
struct iovec fast_iov, *iov = &fast_iov;
const struct iovec __user *uvec;
int i, ret;
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
if (ctx->buf_table.nr)
return -EBUSY;
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
return -EINVAL;
ret = io_rsrc_data_alloc(&data, nr_args);
if (ret)
return ret;
if (!arg)
memset(iov, 0, sizeof(*iov));
for (i = 0; i < nr_args; i++) {
struct io_rsrc_node *node;
u64 tag = 0;
if (arg) {
uvec = (struct iovec __user *) arg;
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
if (IS_ERR(iov)) {
ret = PTR_ERR(iov);
break;
}
ret = io_buffer_validate(iov);
if (ret)
break;
if (ctx->compat)
arg += sizeof(struct compat_iovec);
else
arg += sizeof(struct iovec);
}
if (tags) {
if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
ret = -EFAULT;
break;
}
}
node = io_sqe_buffer_register(ctx, iov, &last_hpage);
if (IS_ERR(node)) {
ret = PTR_ERR(node);
break;
}
if (tag) {
if (!node) {
ret = -EINVAL;
break;
}
node->tag = tag;
}
data.nodes[i] = node;
}
ctx->buf_table = data;
if (ret)
io_sqe_buffers_unregister(ctx);
return ret;
}
int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu,
u64 buf_addr, size_t len)
{
u64 buf_end;
size_t offset;
if (WARN_ON_ONCE(!imu))
return -EFAULT;
if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
return -EFAULT;
/* not inside the mapped region */
if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
return -EFAULT;
/*
* Might not be a start of buffer, set size appropriately
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
if (offset) {
/*
* Don't use iov_iter_advance() here, as it's really slow for
* using the latter parts of a big fixed buffer - it iterates
* over each segment manually. We can cheat a bit here, because
* we know that:
*
* 1) it's a BVEC iter, we set it up
* 2) all bvecs are the same in size, except potentially the
* first and last bvec
*
* So just find our index, and adjust the iterator afterwards.
* If the offset is within the first bvec (or the whole first
* bvec, just use iov_iter_advance(). This makes it easier
* since we can just skip the first segment, which may not
* be folio_size aligned.
*/
const struct bio_vec *bvec = imu->bvec;
if (offset < bvec->bv_len) {
iter->count -= offset;
iter->iov_offset = offset;
} else {
unsigned long seg_skip;
/* skip first vec */
offset -= bvec->bv_len;
seg_skip = 1 + (offset >> imu->folio_shift);
iter->bvec += seg_skip;
iter->nr_segs -= seg_skip;
iter->count -= bvec->bv_len + offset;
iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
}
}
return 0;
}
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
struct io_uring_clone_buffers *arg)
{
struct io_rsrc_data data;
int i, ret, off, nr;
unsigned int nbufs;
/* if offsets are given, must have nr specified too */
if (!arg->nr && (arg->dst_off || arg->src_off))
return -EINVAL;
/* not allowed unless REPLACE is set */
if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
return -EBUSY;
nbufs = READ_ONCE(src_ctx->buf_table.nr);
if (!arg->nr)
arg->nr = nbufs;
else if (arg->nr > nbufs)
return -EINVAL;
else if (arg->nr > IORING_MAX_REG_BUFFERS)
return -EINVAL;
if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
return -EOVERFLOW;
ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
if (ret)
return ret;
/* Fill entries in data from dst that won't overlap with src */
for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
if (src_node) {
data.nodes[i] = src_node;
src_node->refs++;
}
}
/*
* Drop our own lock here. We'll setup the data we need and reference
* the source buffers, then re-grab, check, and assign at the end.
*/
mutex_unlock(&ctx->uring_lock);
mutex_lock(&src_ctx->uring_lock);
ret = -ENXIO;
nbufs = src_ctx->buf_table.nr;
if (!nbufs)
goto out_unlock;
ret = -EINVAL;
if (!arg->nr)
arg->nr = nbufs;
else if (arg->nr > nbufs)
goto out_unlock;
ret = -EOVERFLOW;
if (check_add_overflow(arg->nr, arg->src_off, &off))
goto out_unlock;
if (off > nbufs)
goto out_unlock;
off = arg->dst_off;
i = arg->src_off;
nr = arg->nr;
while (nr--) {
struct io_rsrc_node *dst_node, *src_node;
src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
if (!src_node) {
dst_node = NULL;
} else {
dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!dst_node) {
ret = -ENOMEM;
goto out_put_free;
}
refcount_inc(&src_node->buf->refs);
dst_node->buf = src_node->buf;
}
data.nodes[off++] = dst_node;
i++;
}
/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
mutex_unlock(&src_ctx->uring_lock);
mutex_lock(&ctx->uring_lock);
/*
* If asked for replace, put the old table. data->nodes[] holds both
* old and new nodes at this point.
*/
if (arg->flags & IORING_REGISTER_DST_REPLACE)
io_rsrc_data_free(&ctx->buf_table);
/*
* ctx->buf_table should be empty now - either the contents are being
* replaced and we just freed the table, or someone raced setting up
* a buffer table while the clone was happening. If not empty, fall
* through to failure handling.
*/
if (!ctx->buf_table.nr) {
ctx->buf_table = data;
return 0;
}
mutex_unlock(&ctx->uring_lock);
mutex_lock(&src_ctx->uring_lock);
/* someone raced setting up buffers, dump ours */
ret = -EBUSY;
out_put_free:
i = data.nr;
while (i--) {
io_buffer_unmap(src_ctx, data.nodes[i]);
kfree(data.nodes[i]);
}
out_unlock:
io_rsrc_data_free(&data);
mutex_unlock(&src_ctx->uring_lock);
mutex_lock(&ctx->uring_lock);
return ret;
}
/*
* Copy the registered buffers from the source ring whose file descriptor
* is given in the src_fd to the current ring. This is identical to registering
* the buffers with ctx, except faster as mappings already exist.
*
* Since the memory is already accounted once, don't account it again.
*/
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_clone_buffers buf;
bool registered_src;
struct file *file;
int ret;
if (copy_from_user(&buf, arg, sizeof(buf)))
return -EFAULT;
if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
return -EINVAL;
if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
return -EBUSY;
if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
return -EINVAL;
registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
file = io_uring_register_get_file(buf.src_fd, registered_src);
if (IS_ERR(file))
return PTR_ERR(file);
ret = io_clone_buffers(ctx, file->private_data, &buf);
if (!registered_src)
fput(file);
return ret;
}