mm/page_alloc: introduce vm.percpu_pagelist_high_fraction
This introduces a new sysctl vm.percpu_pagelist_high_fraction. It is similar to the old vm.percpu_pagelist_fraction. The old sysctl increased both pcp->batch and pcp->high with the higher pcp->high potentially reducing zone->lock contention. However, the higher pcp->batch value also potentially increased allocation latency while the PCP was refilled. This sysctl only adjusts pcp->high so that zone->lock contention is potentially reduced but allocation latency during a PCP refill remains the same. # grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 649 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=8 # grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 35071 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=64 high: 4383 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=0 high: 649 batch: 63 [mgorman@techsingularity.net: fix documentation] Link: https://lkml.kernel.org/r/20210528151010.GQ30378@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman <mgorman@techsingularity.net> Acked-by: Dave Hansen <dave.hansen@linux.intel.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Hillf Danton <hdanton@sina.com> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
c49c2c47da
commit
74f4482209
@ -64,6 +64,7 @@ Currently, these files are in /proc/sys/vm:
|
||||
- overcommit_ratio
|
||||
- page-cluster
|
||||
- panic_on_oom
|
||||
- percpu_pagelist_high_fraction
|
||||
- stat_interval
|
||||
- stat_refresh
|
||||
- numa_stat
|
||||
@ -789,6 +790,26 @@ panic_on_oom=2+kdump gives you very strong tool to investigate
|
||||
why oom happens. You can get snapshot.
|
||||
|
||||
|
||||
percpu_pagelist_high_fraction
|
||||
=============================
|
||||
|
||||
This is the fraction of pages in each zone that can be stored on
|
||||
per-cpu page lists. It is an upper boundary that is divided depending
|
||||
on the number of online CPUs. The min value for this is 8 which means
|
||||
that we do not allow more than 1/8th of pages in each zone to be stored
|
||||
on per-cpu page lists. This entry only changes the value of hot per-cpu
|
||||
page lists. A user can specify a number like 100 to allocate 1/100th of
|
||||
each zone between per-cpu lists.
|
||||
|
||||
The batch value of each per-cpu page list remains the same regardless of
|
||||
the value of the high fraction so allocation latencies are unaffected.
|
||||
|
||||
The initial value is zero. With this value the kernel sets the pcp->high
|
||||
mark based on the low watermark for the zone and the number of local
|
||||
online CPUs. If the user writes '0' to this sysctl, it will revert to
|
||||
this default behavior.
|
||||
|
||||
|
||||
stat_interval
|
||||
=============
|
||||
|
||||
|
@ -1029,12 +1029,15 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
|
||||
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
|
||||
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
|
||||
size_t *, loff_t *);
|
||||
int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int,
|
||||
void *, size_t *, loff_t *);
|
||||
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
|
||||
void *, size_t *, loff_t *);
|
||||
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
|
||||
void *, size_t *, loff_t *);
|
||||
int numa_zonelist_order_handler(struct ctl_table *, int,
|
||||
void *, size_t *, loff_t *);
|
||||
extern int percpu_pagelist_high_fraction;
|
||||
extern char numa_zonelist_order[];
|
||||
#define NUMA_ZONELIST_ORDER_LEN 16
|
||||
|
||||
|
@ -2908,6 +2908,14 @@ static struct ctl_table vm_table[] = {
|
||||
.extra1 = SYSCTL_ONE,
|
||||
.extra2 = &one_thousand,
|
||||
},
|
||||
{
|
||||
.procname = "percpu_pagelist_high_fraction",
|
||||
.data = &percpu_pagelist_high_fraction,
|
||||
.maxlen = sizeof(percpu_pagelist_high_fraction),
|
||||
.mode = 0644,
|
||||
.proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
},
|
||||
{
|
||||
.procname = "page_lock_unfairness",
|
||||
.data = &sysctl_page_lock_unfairness,
|
||||
|
@ -120,6 +120,7 @@ typedef int __bitwise fpi_t;
|
||||
|
||||
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
|
||||
static DEFINE_MUTEX(pcp_batch_high_lock);
|
||||
/* Smallest non-zero vm.percpu_pagelist_high_fraction the sysctl handler accepts (zero selects the default sizing). */
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
|
||||
|
||||
struct pagesets {
|
||||
local_lock_t lock;
|
||||
@ -192,6 +193,7 @@ EXPORT_SYMBOL(_totalram_pages);
|
||||
unsigned long totalreserve_pages __read_mostly;
|
||||
unsigned long totalcma_pages __read_mostly;
|
||||
|
||||
int percpu_pagelist_high_fraction;
|
||||
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
|
||||
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
|
||||
EXPORT_SYMBOL(init_on_alloc);
|
||||
@ -6725,17 +6727,32 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
|
||||
#ifdef CONFIG_MMU
|
||||
int high;
|
||||
int nr_local_cpus;
|
||||
unsigned long total_pages;
|
||||
|
||||
if (!percpu_pagelist_high_fraction) {
|
||||
/*
|
||||
* By default, the high value of the pcp is based on the zone
|
||||
* low watermark so that if they are full then background
|
||||
* reclaim will not be started prematurely.
|
||||
*/
|
||||
total_pages = low_wmark_pages(zone);
|
||||
} else {
|
||||
/*
|
||||
* If percpu_pagelist_high_fraction is configured, the high
|
||||
* value is based on a fraction of the managed pages in the
|
||||
* zone.
|
||||
*/
|
||||
total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
|
||||
}
|
||||
|
||||
/*
|
||||
* The high value of the pcp is based on the zone low watermark
|
||||
* so that if they are full then background reclaim will not be
|
||||
* started prematurely. The value is split across all online CPUs
|
||||
* local to the zone. Note that early in boot that CPUs may not be
|
||||
* online yet and that during CPU hotplug that the cpumask is not
|
||||
* yet updated when a CPU is being onlined.
|
||||
* Split the high value across all online CPUs local to the zone. Note
|
||||
* that early in boot that CPUs may not be online yet and that during
|
||||
* CPU hotplug that the cpumask is not yet updated when a CPU is being
|
||||
* onlined.
|
||||
*/
|
||||
nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online;
|
||||
high = low_wmark_pages(zone) / nr_local_cpus;
|
||||
high = total_pages / nr_local_cpus;
|
||||
|
||||
/*
|
||||
* Ensure high is at least batch*4. The multiple is based on the
|
||||
@ -8500,6 +8517,44 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* percpu_pagelist_high_fraction_sysctl_handler - proc handler for the
|
||||
* vm.percpu_pagelist_high_fraction sysctl.
|
||||
*
|
||||
* The sysctl value is the fraction of the pages in each zone that may sit on
|
||||
* the hot per-cpu pagelists before they are flushed back to the buddy
|
||||
* allocator. Zero means "not configured".
|
||||
*
|
||||
* On a write, a non-zero value below MIN_PERCPU_PAGELIST_HIGH_FRACTION is
|
||||
* rejected: the previous value is restored and -EINVAL is returned. A write
|
||||
* that leaves the value unchanged does no further work. Any other accepted
|
||||
* write calls zone_set_pageset_high_and_batch() on every populated zone.
|
||||
* Reads only report the current value.
|
||||
*
|
||||
* Returns the result of proc_dointvec_minmax(), or -EINVAL for a rejected
|
||||
* value.
|
||||
*/
|
||||
int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
|
||||
int write, void *buffer, size_t *length, loff_t *ppos)
|
||||
{
|
||||
struct zone *zone;
|
||||
int old_percpu_pagelist_high_fraction;
|
||||
int ret;
|
||||
|
||||
|
||||
/* Hold the updater lock across parse, validation and per-zone update. */
mutex_lock(&pcp_batch_high_lock);
|
||||
/* Remember the current value so an invalid write can be undone. */
old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
|
||||
|
||||
|
||||
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
|
||||
/* Nothing more to do for a read or for a failed parse. */
if (!write || ret < 0)
|
||||
goto out;
|
||||
|
||||
|
||||
/* Sanity check to avoid pcp imbalance: zero is allowed, otherwise enforce the minimum. */
|
||||
if (percpu_pagelist_high_fraction &&
|
||||
percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
|
||||
percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
||||
/* Value unchanged: skip the per-zone update. */
|
||||
if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
|
||||
goto out;
|
||||
|
||||
|
||||
/* NOTE(review): presumably this re-derives pcp->high from the new fraction and the 0 is the cpu_online argument - confirm against zone_set_pageset_high_and_batch(). */
for_each_populated_zone(zone)
|
||||
zone_set_pageset_high_and_batch(zone, 0);
|
||||
out:
|
||||
mutex_unlock(&pcp_batch_high_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
|
||||
/*
|
||||
* Returns the number of pages that arch has reserved but
|
||||
|
Loading…
Reference in New Issue
Block a user