mirror of
https://github.com/torvalds/linux.git
synced 2024-12-11 05:33:09 +00:00
aebb02ce8b
Current code allows the page_reporting_order parameter to be changed via sysfs to any integer value. The new value is used immediately in page reporting code with no validation, which could cause incorrect behavior. Fix this by adding validation of the new value. Export this parameter for use in the driver that is calling the page_reporting_register(). This is needed by drivers like hv_balloon to know the order of the pages reported. Traditionally the values provided in the kernel boot line or subsequently changed via sysfs take priority therefore, if page_reporting_order parameter's value is set, it takes precedence over the value passed while registering with the driver. Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com> Reviewed-by: Michael Kelley <mikelley@microsoft.com> Acked-by: Andrew Morton <akpm@linux-foundation.org> Link: https://lore.kernel.org/r/1664517699-1085-2-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Wei Liu <wei.liu@kernel.org>
416 lines
12 KiB
C
416 lines
12 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/mm.h>
|
|
#include <linux/mmzone.h>
|
|
#include <linux/page_reporting.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/export.h>
|
|
#include <linux/module.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/scatterlist.h>
|
|
|
|
#include "page_reporting.h"
|
|
#include "internal.h"
|
|
|
|
/* Initialize to an unsupported value */
|
|
unsigned int page_reporting_order = -1;
|
|
|
|
static int page_order_update_notify(const char *val, const struct kernel_param *kp)
|
|
{
|
|
/*
|
|
* If param is set beyond this limit, order is set to default
|
|
* pageblock_order value
|
|
*/
|
|
return param_set_uint_minmax(val, kp, 0, MAX_ORDER-1);
|
|
}
|
|
|
|
static const struct kernel_param_ops page_reporting_param_ops = {
|
|
.set = &page_order_update_notify,
|
|
/*
|
|
* For the get op, use param_get_int instead of param_get_uint.
|
|
* This is to make sure that when unset the initialized value of
|
|
* -1 is shown correctly
|
|
*/
|
|
.get = ¶m_get_int,
|
|
};
|
|
|
|
module_param_cb(page_reporting_order, &page_reporting_param_ops,
|
|
&page_reporting_order, 0644);
|
|
MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");
|
|
|
|
/*
|
|
* This symbol is also a kernel parameter. Export the page_reporting_order
|
|
* symbol so that other drivers can access it to control order values without
|
|
* having to introduce another configurable parameter. Only one driver can
|
|
* register with the page_reporting driver for the service, so we have just
|
|
* one control parameter for the use case(which can be accessed in both
|
|
* drivers)
|
|
*/
|
|
EXPORT_SYMBOL_GPL(page_reporting_order);
|
|
|
|
#define PAGE_REPORTING_DELAY (2 * HZ)
|
|
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
|
|
|
|
enum {
|
|
PAGE_REPORTING_IDLE = 0,
|
|
PAGE_REPORTING_REQUESTED,
|
|
PAGE_REPORTING_ACTIVE
|
|
};
|
|
|
|
/* request page reporting */
|
|
static void
|
|
__page_reporting_request(struct page_reporting_dev_info *prdev)
|
|
{
|
|
unsigned int state;
|
|
|
|
/* Check to see if we are in desired state */
|
|
state = atomic_read(&prdev->state);
|
|
if (state == PAGE_REPORTING_REQUESTED)
|
|
return;
|
|
|
|
/*
|
|
* If reporting is already active there is nothing we need to do.
|
|
* Test against 0 as that represents PAGE_REPORTING_IDLE.
|
|
*/
|
|
state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
|
|
if (state != PAGE_REPORTING_IDLE)
|
|
return;
|
|
|
|
/*
|
|
* Delay the start of work to allow a sizable queue to build. For
|
|
* now we are limiting this to running no more than once every
|
|
* couple of seconds.
|
|
*/
|
|
schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
|
|
}
|
|
|
|
/* notify prdev of free page reporting request */
|
|
void __page_reporting_notify(void)
|
|
{
|
|
struct page_reporting_dev_info *prdev;
|
|
|
|
/*
|
|
* We use RCU to protect the pr_dev_info pointer. In almost all
|
|
* cases this should be present, however in the unlikely case of
|
|
* a shutdown this will be NULL and we should exit.
|
|
*/
|
|
rcu_read_lock();
|
|
prdev = rcu_dereference(pr_dev_info);
|
|
if (likely(prdev))
|
|
__page_reporting_request(prdev);
|
|
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
static void
|
|
page_reporting_drain(struct page_reporting_dev_info *prdev,
|
|
struct scatterlist *sgl, unsigned int nents, bool reported)
|
|
{
|
|
struct scatterlist *sg = sgl;
|
|
|
|
/*
|
|
* Drain the now reported pages back into their respective
|
|
* free lists/areas. We assume at least one page is populated.
|
|
*/
|
|
do {
|
|
struct page *page = sg_page(sg);
|
|
int mt = get_pageblock_migratetype(page);
|
|
unsigned int order = get_order(sg->length);
|
|
|
|
__putback_isolated_page(page, order, mt);
|
|
|
|
/* If the pages were not reported due to error skip flagging */
|
|
if (!reported)
|
|
continue;
|
|
|
|
/*
|
|
* If page was not comingled with another page we can
|
|
* consider the result to be "reported" since the page
|
|
* hasn't been modified, otherwise we will need to
|
|
* report on the new larger page when we make our way
|
|
* up to that higher order.
|
|
*/
|
|
if (PageBuddy(page) && buddy_order(page) == order)
|
|
__SetPageReported(page);
|
|
} while ((sg = sg_next(sg)));
|
|
|
|
/* reinitialize scatterlist now that it is empty */
|
|
sg_init_table(sgl, nents);
|
|
}
|
|
|
|
/*
|
|
* The page reporting cycle consists of 4 stages, fill, report, drain, and
|
|
* idle. We will cycle through the first 3 stages until we cannot obtain a
|
|
* full scatterlist of pages, in that case we will switch to idle.
|
|
*/
|
|
static int
|
|
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
|
|
unsigned int order, unsigned int mt,
|
|
struct scatterlist *sgl, unsigned int *offset)
|
|
{
|
|
struct free_area *area = &zone->free_area[order];
|
|
struct list_head *list = &area->free_list[mt];
|
|
unsigned int page_len = PAGE_SIZE << order;
|
|
struct page *page, *next;
|
|
long budget;
|
|
int err = 0;
|
|
|
|
/*
|
|
* Perform early check, if free area is empty there is
|
|
* nothing to process so we can skip this free_list.
|
|
*/
|
|
if (list_empty(list))
|
|
return err;
|
|
|
|
spin_lock_irq(&zone->lock);
|
|
|
|
/*
|
|
* Limit how many calls we will be making to the page reporting
|
|
* device for this list. By doing this we avoid processing any
|
|
* given list for too long.
|
|
*
|
|
* The current value used allows us enough calls to process over a
|
|
* sixteenth of the current list plus one additional call to handle
|
|
* any pages that may have already been present from the previous
|
|
* list processed. This should result in us reporting all pages on
|
|
* an idle system in about 30 seconds.
|
|
*
|
|
* The division here should be cheap since PAGE_REPORTING_CAPACITY
|
|
* should always be a power of 2.
|
|
*/
|
|
budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
|
|
|
|
/* loop through free list adding unreported pages to sg list */
|
|
list_for_each_entry_safe(page, next, list, lru) {
|
|
/* We are going to skip over the reported pages. */
|
|
if (PageReported(page))
|
|
continue;
|
|
|
|
/*
|
|
* If we fully consumed our budget then update our
|
|
* state to indicate that we are requesting additional
|
|
* processing and exit this list.
|
|
*/
|
|
if (budget < 0) {
|
|
atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
|
|
next = page;
|
|
break;
|
|
}
|
|
|
|
/* Attempt to pull page from list and place in scatterlist */
|
|
if (*offset) {
|
|
if (!__isolate_free_page(page, order)) {
|
|
next = page;
|
|
break;
|
|
}
|
|
|
|
/* Add page to scatter list */
|
|
--(*offset);
|
|
sg_set_page(&sgl[*offset], page, page_len, 0);
|
|
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* Make the first non-reported page in the free list
|
|
* the new head of the free list before we release the
|
|
* zone lock.
|
|
*/
|
|
if (!list_is_first(&page->lru, list))
|
|
list_rotate_to_front(&page->lru, list);
|
|
|
|
/* release lock before waiting on report processing */
|
|
spin_unlock_irq(&zone->lock);
|
|
|
|
/* begin processing pages in local list */
|
|
err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
|
|
|
|
/* reset offset since the full list was reported */
|
|
*offset = PAGE_REPORTING_CAPACITY;
|
|
|
|
/* update budget to reflect call to report function */
|
|
budget--;
|
|
|
|
/* reacquire zone lock and resume processing */
|
|
spin_lock_irq(&zone->lock);
|
|
|
|
/* flush reported pages from the sg list */
|
|
page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
|
|
|
|
/*
|
|
* Reset next to first entry, the old next isn't valid
|
|
* since we dropped the lock to report the pages
|
|
*/
|
|
next = list_first_entry(list, struct page, lru);
|
|
|
|
/* exit on error */
|
|
if (err)
|
|
break;
|
|
}
|
|
|
|
/* Rotate any leftover pages to the head of the freelist */
|
|
if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
|
|
list_rotate_to_front(&next->lru, list);
|
|
|
|
spin_unlock_irq(&zone->lock);
|
|
|
|
return err;
|
|
}
|
|
|
|
static int
|
|
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
|
|
struct scatterlist *sgl, struct zone *zone)
|
|
{
|
|
unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
|
|
unsigned long watermark;
|
|
int err = 0;
|
|
|
|
/* Generate minimum watermark to be able to guarantee progress */
|
|
watermark = low_wmark_pages(zone) +
|
|
(PAGE_REPORTING_CAPACITY << page_reporting_order);
|
|
|
|
/*
|
|
* Cancel request if insufficient free memory or if we failed
|
|
* to allocate page reporting statistics for the zone.
|
|
*/
|
|
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
|
|
return err;
|
|
|
|
/* Process each free list starting from lowest order/mt */
|
|
for (order = page_reporting_order; order < MAX_ORDER; order++) {
|
|
for (mt = 0; mt < MIGRATE_TYPES; mt++) {
|
|
/* We do not pull pages from the isolate free list */
|
|
if (is_migrate_isolate(mt))
|
|
continue;
|
|
|
|
err = page_reporting_cycle(prdev, zone, order, mt,
|
|
sgl, &offset);
|
|
if (err)
|
|
return err;
|
|
}
|
|
}
|
|
|
|
/* report the leftover pages before going idle */
|
|
leftover = PAGE_REPORTING_CAPACITY - offset;
|
|
if (leftover) {
|
|
sgl = &sgl[offset];
|
|
err = prdev->report(prdev, sgl, leftover);
|
|
|
|
/* flush any remaining pages out from the last report */
|
|
spin_lock_irq(&zone->lock);
|
|
page_reporting_drain(prdev, sgl, leftover, !err);
|
|
spin_unlock_irq(&zone->lock);
|
|
}
|
|
|
|
return err;
|
|
}
|
|
|
|
static void page_reporting_process(struct work_struct *work)
|
|
{
|
|
struct delayed_work *d_work = to_delayed_work(work);
|
|
struct page_reporting_dev_info *prdev =
|
|
container_of(d_work, struct page_reporting_dev_info, work);
|
|
int err = 0, state = PAGE_REPORTING_ACTIVE;
|
|
struct scatterlist *sgl;
|
|
struct zone *zone;
|
|
|
|
/*
|
|
* Change the state to "Active" so that we can track if there is
|
|
* anyone requests page reporting after we complete our pass. If
|
|
* the state is not altered by the end of the pass we will switch
|
|
* to idle and quit scheduling reporting runs.
|
|
*/
|
|
atomic_set(&prdev->state, state);
|
|
|
|
/* allocate scatterlist to store pages being reported on */
|
|
sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
|
|
if (!sgl)
|
|
goto err_out;
|
|
|
|
sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
|
|
|
|
for_each_zone(zone) {
|
|
err = page_reporting_process_zone(prdev, sgl, zone);
|
|
if (err)
|
|
break;
|
|
}
|
|
|
|
kfree(sgl);
|
|
err_out:
|
|
/*
|
|
* If the state has reverted back to requested then there may be
|
|
* additional pages to be processed. We will defer for 2s to allow
|
|
* more pages to accumulate.
|
|
*/
|
|
state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
|
|
if (state == PAGE_REPORTING_REQUESTED)
|
|
schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
|
|
}
|
|
|
|
static DEFINE_MUTEX(page_reporting_mutex);
|
|
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
|
|
|
|
int page_reporting_register(struct page_reporting_dev_info *prdev)
|
|
{
|
|
int err = 0;
|
|
|
|
mutex_lock(&page_reporting_mutex);
|
|
|
|
/* nothing to do if already in use */
|
|
if (rcu_access_pointer(pr_dev_info)) {
|
|
err = -EBUSY;
|
|
goto err_out;
|
|
}
|
|
|
|
/*
|
|
* If the page_reporting_order value is not set, we check if
|
|
* an order is provided from the driver that is performing the
|
|
* registration. If that is not provided either, we default to
|
|
* pageblock_order.
|
|
*/
|
|
|
|
if (page_reporting_order == -1) {
|
|
if (prdev->order > 0 && prdev->order <= MAX_ORDER)
|
|
page_reporting_order = prdev->order;
|
|
else
|
|
page_reporting_order = pageblock_order;
|
|
}
|
|
|
|
/* initialize state and work structures */
|
|
atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
|
|
INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
|
|
|
|
/* Begin initial flush of zones */
|
|
__page_reporting_request(prdev);
|
|
|
|
/* Assign device to allow notifications */
|
|
rcu_assign_pointer(pr_dev_info, prdev);
|
|
|
|
/* enable page reporting notification */
|
|
if (!static_key_enabled(&page_reporting_enabled)) {
|
|
static_branch_enable(&page_reporting_enabled);
|
|
pr_info("Free page reporting enabled\n");
|
|
}
|
|
err_out:
|
|
mutex_unlock(&page_reporting_mutex);
|
|
|
|
return err;
|
|
}
|
|
EXPORT_SYMBOL_GPL(page_reporting_register);
|
|
|
|
void page_reporting_unregister(struct page_reporting_dev_info *prdev)
|
|
{
|
|
mutex_lock(&page_reporting_mutex);
|
|
|
|
if (rcu_access_pointer(pr_dev_info) == prdev) {
|
|
/* Disable page reporting notification */
|
|
RCU_INIT_POINTER(pr_dev_info, NULL);
|
|
synchronize_rcu();
|
|
|
|
/* Flush any existing work, and lock it out */
|
|
cancel_delayed_work_sync(&prdev->work);
|
|
}
|
|
|
|
mutex_unlock(&page_reporting_mutex);
|
|
}
|
|
EXPORT_SYMBOL_GPL(page_reporting_unregister);
|