mirror of
https://github.com/torvalds/linux.git
synced 2024-12-13 22:53:20 +00:00
io_uring: improve registered buffer accounting for huge pages
io_uring accounts any registered buffer as pinned/locked memory, checks it against the limit, and fails the registration if the given user doesn't have a big enough limit to register the ranges specified. However, if huge pages are used, we are potentially under-accounting the memory in terms of what gets pinned on the vm side. This patch rectifies that by ensuring that we account the full size of a compound page, regardless of how much of it is being registered. Huge pages are not accounted multiple times - if multiple sections of a huge page are registered, then the page is only accounted once. Reported-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
14db84110d
commit
de2939388b
@ -190,6 +190,7 @@ struct io_mapped_ubuf {
|
||||
size_t len;
|
||||
struct bio_vec *bvec;
|
||||
unsigned int nr_bvecs;
|
||||
unsigned long acct_pages;
|
||||
};
|
||||
|
||||
struct fixed_file_table {
|
||||
@ -8002,7 +8003,8 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
|
||||
for (j = 0; j < imu->nr_bvecs; j++)
|
||||
unpin_user_page(imu->bvec[j].bv_page);
|
||||
|
||||
io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
|
||||
if (imu->acct_pages)
|
||||
io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
|
||||
kvfree(imu->bvec);
|
||||
imu->nr_bvecs = 0;
|
||||
}
|
||||
@ -8038,11 +8040,80 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Not super efficient, but this is just a registration time. And we do cache
|
||||
* the last compound head, so generally we'll only do a full search if we don't
|
||||
* match that one.
|
||||
*
|
||||
* We check if the given compound head page has already been accounted, to
|
||||
* avoid double accounting it. This allows us to account the full size of the
|
||||
* page, not just the constituent pages of a huge page.
|
||||
*/
|
||||
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
|
||||
int nr_pages, struct page *hpage)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
/* check current page array */
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
if (!PageCompound(pages[i]))
|
||||
continue;
|
||||
if (compound_head(pages[i]) == hpage)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* check previously registered pages */
|
||||
for (i = 0; i < ctx->nr_user_bufs; i++) {
|
||||
struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
|
||||
|
||||
for (j = 0; j < imu->nr_bvecs; j++) {
|
||||
if (!PageCompound(imu->bvec[j].bv_page))
|
||||
continue;
|
||||
if (compound_head(imu->bvec[j].bv_page) == hpage)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
|
||||
int nr_pages, struct io_mapped_ubuf *imu,
|
||||
struct page **last_hpage)
|
||||
{
|
||||
int i, ret;
|
||||
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
if (!PageCompound(pages[i])) {
|
||||
imu->acct_pages++;
|
||||
} else {
|
||||
struct page *hpage;
|
||||
|
||||
hpage = compound_head(pages[i]);
|
||||
if (hpage == *last_hpage)
|
||||
continue;
|
||||
*last_hpage = hpage;
|
||||
if (headpage_already_acct(ctx, pages, i, hpage))
|
||||
continue;
|
||||
imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
|
||||
}
|
||||
}
|
||||
|
||||
if (!imu->acct_pages)
|
||||
return 0;
|
||||
|
||||
ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED);
|
||||
if (ret)
|
||||
imu->acct_pages = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
unsigned nr_args)
|
||||
{
|
||||
struct vm_area_struct **vmas = NULL;
|
||||
struct page **pages = NULL;
|
||||
struct page *last_hpage = NULL;
|
||||
int i, j, got_pages = 0;
|
||||
int ret = -EINVAL;
|
||||
|
||||
@ -8085,10 +8156,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
start = ubuf >> PAGE_SHIFT;
|
||||
nr_pages = end - start;
|
||||
|
||||
ret = io_account_mem(ctx, nr_pages, ACCT_PINNED);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = 0;
|
||||
if (!pages || nr_pages > got_pages) {
|
||||
kvfree(vmas);
|
||||
@ -8100,7 +8167,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
GFP_KERNEL);
|
||||
if (!pages || !vmas) {
|
||||
ret = -ENOMEM;
|
||||
io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
|
||||
goto err;
|
||||
}
|
||||
got_pages = nr_pages;
|
||||
@ -8109,10 +8175,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
|
||||
GFP_KERNEL);
|
||||
ret = -ENOMEM;
|
||||
if (!imu->bvec) {
|
||||
io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
|
||||
if (!imu->bvec)
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
mmap_read_lock(current->mm);
|
||||
@ -8141,7 +8205,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
*/
|
||||
if (pret > 0)
|
||||
unpin_user_pages(pages, pret);
|
||||
io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
|
||||
kvfree(imu->bvec);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
|
||||
if (ret) {
|
||||
unpin_user_pages(pages, pret);
|
||||
kvfree(imu->bvec);
|
||||
goto err;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user