2024-03-27 20:59:09 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/mman.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/vmalloc.h>
|
|
|
|
#include <linux/io_uring.h>
|
|
|
|
#include <linux/io_uring_types.h>
|
|
|
|
#include <asm/shmparam.h>
|
|
|
|
|
|
|
|
#include "memmap.h"
|
|
|
|
#include "kbuf.h"
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
#include "rsrc.h"
|
2024-03-27 20:59:09 +00:00
|
|
|
|
|
|
|
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
|
|
|
|
size_t size, gfp_t gfp)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
int i, order;
|
|
|
|
|
|
|
|
order = get_order(size);
|
|
|
|
if (order > MAX_PAGE_ORDER)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
else if (order)
|
|
|
|
gfp |= __GFP_COMP;
|
|
|
|
|
|
|
|
page = alloc_pages(gfp, order);
|
|
|
|
if (!page)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
for (i = 0; i < nr_pages; i++)
|
|
|
|
pages[i] = page + i;
|
|
|
|
|
|
|
|
return page_address(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
|
|
|
|
gfp_t gfp)
|
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
pages[i] = alloc_page(gfp);
|
|
|
|
if (!pages[i])
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
err:
|
|
|
|
while (i--)
|
|
|
|
put_page(pages[i]);
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
|
|
|
|
|
|
|
void *io_pages_map(struct page ***out_pages, unsigned short *npages,
|
|
|
|
size_t size)
|
|
|
|
{
|
|
|
|
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
|
|
|
|
struct page **pages;
|
|
|
|
int nr_pages;
|
|
|
|
void *ret;
|
|
|
|
|
|
|
|
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
|
|
|
pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
|
|
|
|
if (!pages)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
|
|
|
|
if (!IS_ERR(ret))
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
|
|
|
|
if (!IS_ERR(ret)) {
|
|
|
|
done:
|
|
|
|
*out_pages = pages;
|
|
|
|
*npages = nr_pages;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
kvfree(pages);
|
|
|
|
*out_pages = NULL;
|
|
|
|
*npages = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
|
|
|
|
bool put_pages)
|
|
|
|
{
|
|
|
|
bool do_vunmap = false;
|
|
|
|
|
|
|
|
if (!ptr)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (put_pages && *npages) {
|
|
|
|
struct page **to_free = *pages;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only did vmap for the non-compound multiple page case.
|
|
|
|
* For the compound page, we just need to put the head.
|
|
|
|
*/
|
|
|
|
if (PageCompound(to_free[0]))
|
|
|
|
*npages = 1;
|
|
|
|
else if (*npages > 1)
|
|
|
|
do_vunmap = true;
|
|
|
|
for (i = 0; i < *npages; i++)
|
|
|
|
put_page(to_free[i]);
|
|
|
|
}
|
|
|
|
if (do_vunmap)
|
|
|
|
vunmap(ptr);
|
|
|
|
kvfree(*pages);
|
|
|
|
*pages = NULL;
|
|
|
|
*npages = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void io_pages_free(struct page ***pages, int npages)
|
|
|
|
{
|
|
|
|
struct page **page_array = *pages;
|
|
|
|
|
|
|
|
if (!page_array)
|
|
|
|
return;
|
|
|
|
|
|
|
|
unpin_user_pages(page_array, npages);
|
|
|
|
kvfree(page_array);
|
|
|
|
*pages = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
|
|
|
|
{
|
|
|
|
unsigned long start, end, nr_pages;
|
|
|
|
struct page **pages;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
|
|
|
start = uaddr >> PAGE_SHIFT;
|
|
|
|
nr_pages = end - start;
|
|
|
|
if (WARN_ON_ONCE(!nr_pages))
|
|
|
|
return ERR_PTR(-EINVAL);
|
2024-11-15 16:54:38 +00:00
|
|
|
if (WARN_ON_ONCE(nr_pages > INT_MAX))
|
|
|
|
return ERR_PTR(-EOVERFLOW);
|
2024-03-27 20:59:09 +00:00
|
|
|
|
|
|
|
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
|
|
|
|
if (!pages)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
|
|
|
|
pages);
|
|
|
|
/* success, mapped all pages */
|
|
|
|
if (ret == nr_pages) {
|
|
|
|
*npages = nr_pages;
|
|
|
|
return pages;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* partial map, or didn't map anything */
|
|
|
|
if (ret >= 0) {
|
|
|
|
/* if we did partial map, release any pages we did get */
|
|
|
|
if (ret)
|
|
|
|
unpin_user_pages(pages, ret);
|
|
|
|
ret = -EFAULT;
|
|
|
|
}
|
|
|
|
kvfree(pages);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Pin a page-aligned user range and map it into kernel address space.
 *
 * @pages:  set to the pinned page array on success
 * @npages: always cleared first, then set to the page count on success
 * @uaddr:  user address, must be page aligned
 * @size:   length in bytes, must be non-zero
 *
 * Returns the vmap() address on success. Returns ERR_PTR(-EINVAL) for
 * an unaligned address or zero size, the error from io_pin_pages() if
 * pinning fails, or ERR_PTR(-ENOMEM) if vmap() fails (the pages are
 * unpinned and freed again in that case).
 */
void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
		     unsigned long uaddr, size_t size)
{
	unsigned int nr_pages = 0;
	struct page **pinned;
	void *kaddr;

	*npages = 0;

	if (!size || (uaddr & (PAGE_SIZE - 1)))
		return ERR_PTR(-EINVAL);

	pinned = io_pin_pages(uaddr, size, &nr_pages);
	if (IS_ERR(pinned))
		return pinned;

	kaddr = vmap(pinned, nr_pages, VM_MAP, PAGE_KERNEL);
	if (!kaddr) {
		io_pages_free(&pinned, nr_pages);
		return ERR_PTR(-ENOMEM);
	}

	*pages = pinned;
	/* NOTE(review): the count is narrowed to unsigned short here */
	*npages = nr_pages;
	return kaddr;
}
|
|
|
|
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
|
|
|
|
{
|
|
|
|
if (mr->pages) {
|
|
|
|
unpin_user_pages(mr->pages, mr->nr_pages);
|
|
|
|
kvfree(mr->pages);
|
|
|
|
}
|
|
|
|
if (mr->vmap_ptr)
|
|
|
|
vunmap(mr->vmap_ptr);
|
|
|
|
if (mr->nr_pages && ctx->user)
|
|
|
|
__io_unaccount_mem(ctx->user, mr->nr_pages);
|
|
|
|
|
|
|
|
memset(mr, 0, sizeof(*mr));
|
|
|
|
}
|
|
|
|
|
|
|
|
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
|
|
|
|
struct io_uring_region_desc *reg)
|
|
|
|
{
|
|
|
|
int pages_accounted = 0;
|
|
|
|
struct page **pages;
|
|
|
|
int nr_pages, ret;
|
|
|
|
void *vptr;
|
|
|
|
u64 end;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
|
|
|
|
return -EFAULT;
|
|
|
|
if (memchr_inv(®->__resv, 0, sizeof(reg->__resv)))
|
|
|
|
return -EINVAL;
|
|
|
|
if (reg->flags != IORING_MEM_REGION_TYPE_USER)
|
|
|
|
return -EINVAL;
|
|
|
|
if (!reg->user_addr)
|
|
|
|
return -EFAULT;
|
|
|
|
if (!reg->size || reg->mmap_offset || reg->id)
|
|
|
|
return -EINVAL;
|
|
|
|
if ((reg->size >> PAGE_SHIFT) > INT_MAX)
|
|
|
|
return E2BIG;
|
|
|
|
if ((reg->user_addr | reg->size) & ~PAGE_MASK)
|
|
|
|
return -EINVAL;
|
|
|
|
if (check_add_overflow(reg->user_addr, reg->size, &end))
|
|
|
|
return -EOVERFLOW;
|
|
|
|
|
|
|
|
pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
|
|
|
|
if (IS_ERR(pages))
|
|
|
|
return PTR_ERR(pages);
|
|
|
|
|
|
|
|
if (ctx->user) {
|
|
|
|
ret = __io_account_mem(ctx->user, nr_pages);
|
|
|
|
if (ret)
|
|
|
|
goto out_free;
|
|
|
|
pages_accounted = nr_pages;
|
|
|
|
}
|
|
|
|
|
|
|
|
vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
|
|
|
|
if (!vptr)
|
|
|
|
goto out_free;
|
|
|
|
|
|
|
|
mr->pages = pages;
|
|
|
|
mr->vmap_ptr = vptr;
|
|
|
|
mr->nr_pages = nr_pages;
|
|
|
|
return 0;
|
|
|
|
out_free:
|
|
|
|
if (pages_accounted)
|
|
|
|
__io_unaccount_mem(ctx->user, pages_accounted);
|
|
|
|
io_pages_free(&pages, nr_pages);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2024-03-27 20:59:09 +00:00
|
|
|
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
|
|
|
|
size_t sz)
|
|
|
|
{
|
|
|
|
struct io_ring_ctx *ctx = file->private_data;
|
|
|
|
loff_t offset = pgoff << PAGE_SHIFT;
|
|
|
|
|
|
|
|
switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
|
|
|
|
case IORING_OFF_SQ_RING:
|
|
|
|
case IORING_OFF_CQ_RING:
|
|
|
|
/* Don't allow mmap if the ring was setup without it */
|
|
|
|
if (ctx->flags & IORING_SETUP_NO_MMAP)
|
|
|
|
return ERR_PTR(-EINVAL);
|
2024-10-24 16:52:02 +00:00
|
|
|
if (!ctx->rings)
|
|
|
|
return ERR_PTR(-EFAULT);
|
2024-03-27 20:59:09 +00:00
|
|
|
return ctx->rings;
|
|
|
|
case IORING_OFF_SQES:
|
|
|
|
/* Don't allow mmap if the ring was setup without it */
|
|
|
|
if (ctx->flags & IORING_SETUP_NO_MMAP)
|
|
|
|
return ERR_PTR(-EINVAL);
|
2024-10-24 16:52:02 +00:00
|
|
|
if (!ctx->sq_sqes)
|
|
|
|
return ERR_PTR(-EFAULT);
|
2024-03-27 20:59:09 +00:00
|
|
|
return ctx->sq_sqes;
|
|
|
|
case IORING_OFF_PBUF_RING: {
|
|
|
|
struct io_buffer_list *bl;
|
|
|
|
unsigned int bgid;
|
|
|
|
void *ptr;
|
|
|
|
|
|
|
|
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
|
|
|
|
bl = io_pbuf_get_bl(ctx, bgid);
|
|
|
|
if (IS_ERR(bl))
|
|
|
|
return bl;
|
|
|
|
ptr = bl->buf_ring;
|
|
|
|
io_put_bl(ctx, bl);
|
|
|
|
return ptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
|
|
|
|
struct page **pages, int npages)
|
|
|
|
{
|
|
|
|
unsigned long nr_pages = npages;
|
|
|
|
|
|
|
|
vm_flags_set(vma, VM_DONTEXPAND);
|
|
|
|
return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_MMU
|
|
|
|
|
|
|
|
__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
struct io_ring_ctx *ctx = file->private_data;
|
|
|
|
size_t sz = vma->vm_end - vma->vm_start;
|
|
|
|
long offset = vma->vm_pgoff << PAGE_SHIFT;
|
2024-05-29 15:38:38 +00:00
|
|
|
unsigned int npages;
|
2024-03-27 20:59:09 +00:00
|
|
|
void *ptr;
|
|
|
|
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 19:34:10 +00:00
|
|
|
guard(mutex)(&ctx->resize_lock);
|
|
|
|
|
2024-03-27 20:59:09 +00:00
|
|
|
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
|
|
|
|
if (IS_ERR(ptr))
|
|
|
|
return PTR_ERR(ptr);
|
|
|
|
|
|
|
|
switch (offset & IORING_OFF_MMAP_MASK) {
|
|
|
|
case IORING_OFF_SQ_RING:
|
|
|
|
case IORING_OFF_CQ_RING:
|
2024-05-29 15:38:38 +00:00
|
|
|
npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
|
|
|
|
return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
|
2024-03-27 20:59:09 +00:00
|
|
|
case IORING_OFF_SQES:
|
|
|
|
return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
|
|
|
|
ctx->n_sqe_pages);
|
|
|
|
case IORING_OFF_PBUF_RING:
|
|
|
|
return io_pbuf_mmap(file, vma);
|
|
|
|
}
|
|
|
|
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags)
|
|
|
|
{
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 19:34:10 +00:00
|
|
|
struct io_ring_ctx *ctx = filp->private_data;
|
2024-03-27 20:59:09 +00:00
|
|
|
void *ptr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do not allow to map to user-provided address to avoid breaking the
|
|
|
|
* aliasing rules. Userspace is not able to guess the offset address of
|
|
|
|
* kernel kmalloc()ed memory area.
|
|
|
|
*/
|
|
|
|
if (addr)
|
|
|
|
return -EINVAL;
|
|
|
|
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 19:34:10 +00:00
|
|
|
guard(mutex)(&ctx->resize_lock);
|
|
|
|
|
2024-03-27 20:59:09 +00:00
|
|
|
ptr = io_uring_validate_mmap_request(filp, pgoff, len);
|
|
|
|
if (IS_ERR(ptr))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Some architectures have strong cache aliasing requirements.
|
|
|
|
* For such architectures we need a coherent mapping which aliases
|
|
|
|
* kernel memory *and* userspace memory. To achieve that:
|
|
|
|
* - use a NULL file pointer to reference physical memory, and
|
|
|
|
* - use the kernel virtual address of the shared io_uring context
|
|
|
|
* (instead of the userspace-provided address, which has to be 0UL
|
|
|
|
* anyway).
|
|
|
|
* - use the same pgoff which the get_unmapped_area() uses to
|
|
|
|
* calculate the page colouring.
|
|
|
|
* For architectures without such aliasing requirements, the
|
|
|
|
* architecture will return any suitable mapping because addr is 0.
|
|
|
|
*/
|
|
|
|
filp = NULL;
|
|
|
|
flags |= MAP_SHARED;
|
|
|
|
pgoff = 0; /* has been translated to ptr above */
|
|
|
|
#ifdef SHM_COLOUR
|
|
|
|
addr = (uintptr_t) ptr;
|
|
|
|
pgoff = addr >> PAGE_SHIFT;
|
|
|
|
#else
|
|
|
|
addr = 0UL;
|
|
|
|
#endif
|
The usual shower of singleton fixes and minor series all over MM,
documented (hopefully adequately) in the respective changelogs. Notable
series include:
- Lucas Stach has provided some page-mapping
cleanup/consolidation/maintainability work in the series "mm/treewide:
Remove pXd_huge() API".
- In the series "Allow migrate on protnone reference with
MPOL_PREFERRED_MANY policy", Donet Tom has optimized mempolicy's
MPOL_PREFERRED_MANY mode, yielding almost doubled performance in one
test.
- In their series "Memory allocation profiling" Kent Overstreet and
Suren Baghdasaryan have contributed a means of determining (via
/proc/allocinfo) whereabouts in the kernel memory is being allocated:
number of calls and amount of memory.
- Matthew Wilcox has provided the series "Various significant MM
patches" which does a number of rather unrelated things, but in largely
similar code sites.
- In his series "mm: page_alloc: freelist migratetype hygiene" Johannes
Weiner has fixed the page allocator's handling of migratetype requests,
with resulting improvements in compaction efficiency.
- In the series "make the hugetlb migration strategy consistent" Baolin
Wang has fixed a hugetlb migration issue, which should improve hugetlb
allocation reliability.
- Liu Shixin has hit an I/O meltdown caused by readahead in a
memory-tight memcg. Addressed in the series "Fix I/O high when memory
almost met memcg limit".
- In the series "mm/filemap: optimize folio adding and splitting" Kairui
Song has optimized pagecache insertion, yielding ~10% performance
improvement in one test.
- Baoquan He has cleaned up and consolidated the early zone
initialization code in the series "mm/mm_init.c: refactor
free_area_init_core()".
- Baoquan has also redone some MM initializatio code in the series
"mm/init: minor clean up and improvement".
- MM helper cleanups from Christoph Hellwig in his series "remove
follow_pfn".
- More cleanups from Matthew Wilcox in the series "Various page->flags
cleanups".
- Vlastimil Babka has contributed maintainability improvements in the
series "memcg_kmem hooks refactoring".
- More folio conversions and cleanups in Matthew Wilcox's series
"Convert huge_zero_page to huge_zero_folio"
"khugepaged folio conversions"
"Remove page_idle and page_young wrappers"
"Use folio APIs in procfs"
"Clean up __folio_put()"
"Some cleanups for memory-failure"
"Remove page_mapping()"
"More folio compat code removal"
- David Hildenbrand chipped in with "fs/proc/task_mmu: convert hugetlb
functions to work on folis".
- Code consolidation and cleanup work related to GUP's handling of
hugetlbs in Peter Xu's series "mm/gup: Unify hugetlb, part 2".
- Rick Edgecombe has developed some fixes to stack guard gaps in the
series "Cover a guard gap corner case".
- Jinjiang Tu has fixed KSM's behaviour after a fork+exec in the series
"mm/ksm: fix ksm exec support for prctl".
- Baolin Wang has implemented NUMA balancing for multi-size THPs. This
is a simple first-cut implementation for now. The series is "support
multi-size THP numa balancing".
- Cleanups to vma handling helper functions from Matthew Wilcox in the
series "Unify vma_address and vma_pgoff_address".
- Some selftests maintenance work from Dev Jain in the series
"selftests/mm: mremap_test: Optimizations and style fixes".
- Improvements to the swapping of multi-size THPs from Ryan Roberts in
the series "Swap-out mTHP without splitting".
- Kefeng Wang has significantly optimized the handling of arm64's
permission page faults in the series
"arch/mm/fault: accelerate pagefault when badaccess"
"mm: remove arch's private VM_FAULT_BADMAP/BADACCESS"
- GUP cleanups from David Hildenbrand in "mm/gup: consistently call it
GUP-fast".
- hugetlb fault code cleanups from Vishal Moola in "Hugetlb fault path to
use struct vm_fault".
- selftests build fixes from John Hubbard in the series "Fix
selftests/mm build without requiring "make headers"".
- Memory tiering fixes/improvements from Ho-Ren (Jack) Chuang in the
series "Improved Memory Tier Creation for CPUless NUMA Nodes". Fixes
the initialization code so that migration between different memory types
works as intended.
- David Hildenbrand has improved follow_pte() and fixed an errant driver
in the series "mm: follow_pte() improvements and acrn follow_pte()
fixes".
- David also did some cleanup work on large folio mapcounts in his
series "mm: mapcount for large folios + page_mapcount() cleanups".
- Folio conversions in KSM in Alex Shi's series "transfer page to folio
in KSM".
- Barry Song has added some sysfs stats for monitoring multi-size THP's
in the series "mm: add per-order mTHP alloc and swpout counters".
- Some zswap cleanups from Yosry Ahmed in the series "zswap same-filled
and limit checking cleanups".
- Matthew Wilcox has been looking at buffer_head code and found the
documentation to be lacking. The series is "Improve buffer head
documentation".
- Multi-size THPs get more work, this time from Lance Yang. His series
"mm/madvise: enhance lazyfreeing with mTHP in madvise_free" optimizes
the freeing of these things.
- Kemeng Shi has added more userspace-visible writeback instrumentation
in the series "Improve visibility of writeback".
- Kemeng Shi then sent some maintenance work on top in the series "Fix
and cleanups to page-writeback".
- Matthew Wilcox reduces mmap_lock traffic in the anon vma code in the
series "Improve anon_vma scalability for anon VMAs". Intel's test bot
reported an improbable 3x improvement in one test.
- SeongJae Park adds some DAMON feature work in the series
"mm/damon: add a DAMOS filter type for page granularity access recheck"
"selftests/damon: add DAMOS quota goal test"
- Also some maintenance work in the series
"mm/damon/paddr: simplify page level access re-check for pageout"
"mm/damon: misc fixes and improvements"
- David Hildenbrand has disabled some known-to-fail selftests ni the
series "selftests: mm: cow: flag vmsplice() hugetlb tests as XFAIL".
- memcg metadata storage optimizations from Shakeel Butt in "memcg:
reduce memory consumption by memcg stats".
- DAX fixes and maintenance work from Vishal Verma in the series
"dax/bus.c: Fixups for dax-bus locking".
-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZkgQYwAKCRDdBJ7gKXxA
jrdKAP9WVJdpEcXxpoub/vVE0UWGtffr8foifi9bCwrQrGh5mgEAx7Yf0+d/oBZB
nvA4E0DcPrUAFy144FNM0NTCb7u9vAw=
=V3R/
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2024-05-17-19-19' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton:
"The usual shower of singleton fixes and minor series all over MM,
documented (hopefully adequately) in the respective changelogs.
Notable series include:
- Lucas Stach has provided some page-mapping cleanup/consolidation/
maintainability work in the series "mm/treewide: Remove pXd_huge()
API".
- In the series "Allow migrate on protnone reference with
MPOL_PREFERRED_MANY policy", Donet Tom has optimized mempolicy's
MPOL_PREFERRED_MANY mode, yielding almost doubled performance in
one test.
- In their series "Memory allocation profiling" Kent Overstreet and
Suren Baghdasaryan have contributed a means of determining (via
/proc/allocinfo) whereabouts in the kernel memory is being
allocated: number of calls and amount of memory.
- Matthew Wilcox has provided the series "Various significant MM
patches" which does a number of rather unrelated things, but in
largely similar code sites.
- In his series "mm: page_alloc: freelist migratetype hygiene"
Johannes Weiner has fixed the page allocator's handling of
migratetype requests, with resulting improvements in compaction
efficiency.
- In the series "make the hugetlb migration strategy consistent"
Baolin Wang has fixed a hugetlb migration issue, which should
improve hugetlb allocation reliability.
- Liu Shixin has hit an I/O meltdown caused by readahead in a
memory-tight memcg. Addressed in the series "Fix I/O high when
memory almost met memcg limit".
- In the series "mm/filemap: optimize folio adding and splitting"
Kairui Song has optimized pagecache insertion, yielding ~10%
performance improvement in one test.
- Baoquan He has cleaned up and consolidated the early zone
initialization code in the series "mm/mm_init.c: refactor
free_area_init_core()".
- Baoquan has also redone some MM initializatio code in the series
"mm/init: minor clean up and improvement".
- MM helper cleanups from Christoph Hellwig in his series "remove
follow_pfn".
- More cleanups from Matthew Wilcox in the series "Various
page->flags cleanups".
- Vlastimil Babka has contributed maintainability improvements in the
series "memcg_kmem hooks refactoring".
- More folio conversions and cleanups in Matthew Wilcox's series:
"Convert huge_zero_page to huge_zero_folio"
"khugepaged folio conversions"
"Remove page_idle and page_young wrappers"
"Use folio APIs in procfs"
"Clean up __folio_put()"
"Some cleanups for memory-failure"
"Remove page_mapping()"
"More folio compat code removal"
- David Hildenbrand chipped in with "fs/proc/task_mmu: convert
hugetlb functions to work on folis".
- Code consolidation and cleanup work related to GUP's handling of
hugetlbs in Peter Xu's series "mm/gup: Unify hugetlb, part 2".
- Rick Edgecombe has developed some fixes to stack guard gaps in the
series "Cover a guard gap corner case".
- Jinjiang Tu has fixed KSM's behaviour after a fork+exec in the
series "mm/ksm: fix ksm exec support for prctl".
- Baolin Wang has implemented NUMA balancing for multi-size THPs.
This is a simple first-cut implementation for now. The series is
"support multi-size THP numa balancing".
- Cleanups to vma handling helper functions from Matthew Wilcox in
the series "Unify vma_address and vma_pgoff_address".
- Some selftests maintenance work from Dev Jain in the series
"selftests/mm: mremap_test: Optimizations and style fixes".
- Improvements to the swapping of multi-size THPs from Ryan Roberts
in the series "Swap-out mTHP without splitting".
- Kefeng Wang has significantly optimized the handling of arm64's
permission page faults in the series
"arch/mm/fault: accelerate pagefault when badaccess"
"mm: remove arch's private VM_FAULT_BADMAP/BADACCESS"
- GUP cleanups from David Hildenbrand in "mm/gup: consistently call
it GUP-fast".
- hugetlb fault code cleanups from Vishal Moola in "Hugetlb fault
path to use struct vm_fault".
- selftests build fixes from John Hubbard in the series "Fix
selftests/mm build without requiring "make headers"".
- Memory tiering fixes/improvements from Ho-Ren (Jack) Chuang in the
series "Improved Memory Tier Creation for CPUless NUMA Nodes".
Fixes the initialization code so that migration between different
memory types works as intended.
- David Hildenbrand has improved follow_pte() and fixed an errant
driver in the series "mm: follow_pte() improvements and acrn
follow_pte() fixes".
- David also did some cleanup work on large folio mapcounts in his
series "mm: mapcount for large folios + page_mapcount() cleanups".
- Folio conversions in KSM in Alex Shi's series "transfer page to
folio in KSM".
- Barry Song has added some sysfs stats for monitoring multi-size
THP's in the series "mm: add per-order mTHP alloc and swpout
counters".
- Some zswap cleanups from Yosry Ahmed in the series "zswap
same-filled and limit checking cleanups".
- Matthew Wilcox has been looking at buffer_head code and found the
documentation to be lacking. The series is "Improve buffer head
documentation".
- Multi-size THPs get more work, this time from Lance Yang. His
series "mm/madvise: enhance lazyfreeing with mTHP in madvise_free"
optimizes the freeing of these things.
- Kemeng Shi has added more userspace-visible writeback
instrumentation in the series "Improve visibility of writeback".
- Kemeng Shi then sent some maintenance work on top in the series
"Fix and cleanups to page-writeback".
- Matthew Wilcox reduces mmap_lock traffic in the anon vma code in
the series "Improve anon_vma scalability for anon VMAs". Intel's
test bot reported an improbable 3x improvement in one test.
- SeongJae Park adds some DAMON feature work in the series
"mm/damon: add a DAMOS filter type for page granularity access recheck"
"selftests/damon: add DAMOS quota goal test"
- Also some maintenance work in the series
"mm/damon/paddr: simplify page level access re-check for pageout"
"mm/damon: misc fixes and improvements"
- David Hildenbrand has disabled some known-to-fail selftests ni the
series "selftests: mm: cow: flag vmsplice() hugetlb tests as
XFAIL".
- memcg metadata storage optimizations from Shakeel Butt in "memcg:
reduce memory consumption by memcg stats".
- DAX fixes and maintenance work from Vishal Verma in the series
"dax/bus.c: Fixups for dax-bus locking""
* tag 'mm-stable-2024-05-17-19-19' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (426 commits)
memcg, oom: cleanup unused memcg_oom_gfp_mask and memcg_oom_order
selftests/mm: hugetlb_madv_vs_map: avoid test skipping by querying hugepage size at runtime
mm/hugetlb: add missing VM_FAULT_SET_HINDEX in hugetlb_wp
mm/hugetlb: add missing VM_FAULT_SET_HINDEX in hugetlb_fault
selftests: cgroup: add tests to verify the zswap writeback path
mm: memcg: make alloc_mem_cgroup_per_node_info() return bool
mm/damon/core: fix return value from damos_wmark_metric_value
mm: do not update memcg stats for NR_{FILE/SHMEM}_PMDMAPPED
selftests: cgroup: remove redundant enabling of memory controller
Docs/mm/damon/maintainer-profile: allow posting patches based on damon/next tree
Docs/mm/damon/maintainer-profile: change the maintainer's timezone from PST to PT
Docs/mm/damon/design: use a list for supported filters
Docs/admin-guide/mm/damon/usage: fix wrong schemes effective quota update command
Docs/admin-guide/mm/damon/usage: fix wrong example of DAMOS filter matching sysfs file
selftests/damon: classify tests for functionalities and regressions
selftests/damon/_damon_sysfs: use 'is' instead of '==' for 'None'
selftests/damon/_damon_sysfs: find sysfs mount point from /proc/mounts
selftests/damon/_damon_sysfs: check errors from nr_schemes file reads
mm/damon/core: initialize ->esz_bp from damos_quota_init_priv()
selftests/damon: add a test for DAMOS quota goal
...
2024-05-19 16:21:03 +00:00
|
|
|
return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
|
2024-03-27 20:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#else /* !CONFIG_MMU */
|
|
|
|
|
|
|
|
int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
|
|
|
|
{
|
|
|
|
return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags)
|
|
|
|
{
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 19:34:10 +00:00
|
|
|
struct io_ring_ctx *ctx = file->private_data;
|
2024-03-27 20:59:09 +00:00
|
|
|
void *ptr;
|
|
|
|
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 19:34:10 +00:00
|
|
|
guard(mutex)(&ctx->resize_lock);
|
|
|
|
|
2024-03-27 20:59:09 +00:00
|
|
|
ptr = io_uring_validate_mmap_request(file, pgoff, len);
|
|
|
|
if (IS_ERR(ptr))
|
|
|
|
return PTR_ERR(ptr);
|
|
|
|
|
|
|
|
return (unsigned long) ptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* !CONFIG_MMU */
|