forked from Minki/linux
f183324133
From Roman ("percpu: partial chunk depopulation"): In our [Facebook] production experience the percpu memory allocator sometimes struggles to return memory to the system. A typical example is the creation of several thousand memory cgroups (each has several chunks of the percpu data used for vmstats, vmevents, ref counters etc). Deletion and complete releasing of these cgroups doesn't always lead to a shrinkage of the percpu memory, so that sometimes there are several GBs of memory wasted. The underlying problem is the fragmentation: to release an underlying chunk all percpu allocations should be released first. The percpu allocator tends to top up chunks to improve the utilization. It means new small-ish allocations (e.g. percpu ref counters) are placed onto almost filled old-ish chunks, effectively pinning them in memory. This patchset solves this problem by implementing a partial depopulation of percpu chunks: chunks with many empty pages are being asynchronously depopulated and the pages are returned to the system. To illustrate the problem the following script can be used: -- cd /sys/fs/cgroup mkdir percpu_test echo "+memory" > percpu_test/cgroup.subtree_control cat /proc/meminfo | grep Percpu for i in `seq 1 1000`; do mkdir percpu_test/cg_"${i}" for j in `seq 1 10`; do mkdir percpu_test/cg_"${i}"_"${j}" done done cat /proc/meminfo | grep Percpu for i in `seq 1 1000`; do for j in `seq 1 10`; do rmdir percpu_test/cg_"${i}"_"${j}" done done sleep 10 cat /proc/meminfo | grep Percpu for i in `seq 1 1000`; do rmdir percpu_test/cg_"${i}" done rmdir percpu_test -- It creates 11000 memory cgroups and removes 10 out of every 11. It prints the initial size of the percpu memory, the size after creating all cgroups and the size after deleting most of them. 
Results: vanilla: ./percpu_test.sh Percpu: 7488 kB Percpu: 481152 kB Percpu: 481152 kB with this patchset applied: ./percpu_test.sh Percpu: 7488 kB Percpu: 481408 kB Percpu: 135552 kB The total size of the percpu memory was reduced by a factor of more than 3.5. This patch: This patch implements partial depopulation of percpu chunks. As of now, a chunk can be depopulated only as a part of the final destruction, if there are no more outstanding allocations. However, to minimize memory waste it might be useful to depopulate a partially filled chunk, if a small number of outstanding allocations prevents the chunk from being fully reclaimed. This patch implements the following depopulation process: it scans over the chunk pages, looks for a range of empty and populated pages and performs the depopulation. To avoid races with new allocations, the chunk is isolated beforehand. After the depopulation the chunk is sidelined to a special list or freed. New allocations prefer using active chunks to sidelined chunks. If a sidelined chunk is used, it is reintegrated to the active lists. The depopulation is scheduled on the free path if the chunk is all of the following: 1) has more than 1/4 of total pages free and populated 2) the system has enough free percpu pages aside from this chunk 3) isn't the reserved chunk 4) isn't the first chunk If it's already depopulated but got free populated pages, it's a good target too. The chunk is moved to a special slot, pcpu_to_depopulate_slot, chunk->isolated is set, and the balance work item is scheduled. On isolation, these pages are removed from the pcpu_nr_empty_pop_pages. It is constantly placed back onto the to_depopulate_slot when it meets these qualifications. pcpu_reclaim_populated() iterates over the to_depopulate_slot until it becomes empty. The depopulation is performed in the reverse direction to keep populated pages close to the beginning. Depopulated chunks are sidelined to preferentially avoid them for new allocations. 
When no active chunk can satisfy a new allocation, sidelined chunks are first checked before creating a new chunk. Signed-off-by: Roman Gushchin <guro@fb.com> Co-developed-by: Dennis Zhou <dennis@kernel.org> Signed-off-by: Dennis Zhou <dennis@kernel.org> Tested-by: Pratik Sampat <psampat@linux.ibm.com> Signed-off-by: Dennis Zhou <dennis@kernel.org>
290 lines
7.9 KiB
C
290 lines
7.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _MM_PERCPU_INTERNAL_H
|
|
#define _MM_PERCPU_INTERNAL_H
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/percpu.h>
|
|
|
|
/*
 * There are two chunk types: root and memcg-aware.
 * Chunks of each type have a separate list of slots.
 *
 * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
 * used to store memcg membership data of a percpu object.  Obj_cgroups are
 * ref-counted pointers to a memory cgroup with an ability to switch dynamically
 * to the parent memory cgroup.  This allows reclaiming a deleted memory cgroup
 * without reclaiming all outstanding objects which hold a reference to it.
 *
 * PCPU_CHUNK_MEMCG only exists when CONFIG_MEMCG_KMEM is set, so
 * PCPU_NR_CHUNK_TYPES evaluates to 1 or 2 depending on the configuration.
 * PCPU_FAIL_ALLOC shares the value of PCPU_NR_CHUNK_TYPES, i.e. it is one
 * past the last valid chunk type.
 */
enum pcpu_chunk_type {
	PCPU_CHUNK_ROOT,
#ifdef CONFIG_MEMCG_KMEM
	PCPU_CHUNK_MEMCG,
#endif
	PCPU_NR_CHUNK_TYPES,
	PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
};

/*
 * pcpu_block_md is the metadata block struct.
 * Each chunk's bitmap is split into a number of full blocks.
 * All units are in terms of bits.
 *
 * The scan hint is the largest known contiguous area before the contig hint.
 * It is not necessarily the actual largest contig hint though.  There is an
 * invariant that the scan_hint_start > contig_hint_start iff
 * scan_hint == contig_hint.  This is necessary because when scanning forward,
 * we don't know if a new contig hint would be better than the current one.
 */
struct pcpu_block_md {
	int	scan_hint;		/* scan hint for block */
	int	scan_hint_start;	/* block relative starting
					   position of the scan hint */
	int	contig_hint;		/* contig hint for block */
	int	contig_hint_start;	/* block relative starting
					   position of the contig hint */
	int	left_free;		/* size of free space along
					   the left side of the block */
	int	right_free;		/* size of free space along
					   the right side of the block */
	int	first_free;		/* block position of first free */
	int	nr_bits;		/* total bits responsible for */
};

struct pcpu_chunk {
|
|
#ifdef CONFIG_PERCPU_STATS
|
|
int nr_alloc; /* # of allocations */
|
|
size_t max_alloc_size; /* largest allocation size */
|
|
#endif
|
|
|
|
struct list_head list; /* linked to pcpu_slot lists */
|
|
int free_bytes; /* free bytes in the chunk */
|
|
struct pcpu_block_md chunk_md;
|
|
void *base_addr; /* base address of this chunk */
|
|
|
|
unsigned long *alloc_map; /* allocation map */
|
|
unsigned long *bound_map; /* boundary map */
|
|
struct pcpu_block_md *md_blocks; /* metadata blocks */
|
|
|
|
void *data; /* chunk data */
|
|
bool immutable; /* no [de]population allowed */
|
|
bool isolated; /* isolated from active chunk
|
|
slots */
|
|
int start_offset; /* the overlap with the previous
|
|
region to have a page aligned
|
|
base_addr */
|
|
int end_offset; /* additional area required to
|
|
have the region end page
|
|
aligned */
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
struct obj_cgroup **obj_cgroups; /* vector of object cgroups */
|
|
#endif
|
|
|
|
int nr_pages; /* # of pages served by this chunk */
|
|
int nr_populated; /* # of populated pages */
|
|
int nr_empty_pop_pages; /* # of empty populated pages */
|
|
unsigned long populated[]; /* populated bitmap */
|
|
};
|
|
|
|
/*
 * Allocator state shared with the rest of the percpu code.  The objects are
 * only declared here; their definitions live outside this header.
 */

/* Lock the pcpu_stats_* helpers in this header either assert or acquire. */
extern spinlock_t pcpu_lock;

/*
 * Slot lists.  pcpu_chunk_list() indexes pcpu_chunk_lists with a stride of
 * pcpu_nr_slots per chunk type.
 * NOTE(review): pcpu_sidelined_slot and pcpu_to_depopulate_slot are
 * presumably slot indices within one type's lists - confirm against the
 * definitions.
 */
extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_sidelined_slot;
extern int pcpu_to_depopulate_slot;
extern int pcpu_nr_empty_pop_pages[];

extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;

/**
|
|
* pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
|
|
* @chunk: chunk of interest
|
|
*
|
|
* This conversion is from the number of physical pages that the chunk
|
|
* serves to the number of bitmap blocks used.
|
|
*/
|
|
static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
|
|
{
|
|
return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
|
|
}
|
|
|
|
/**
|
|
* pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
|
|
* @pages: number of physical pages
|
|
*
|
|
* This conversion is from physical pages to the number of bits
|
|
* required in the bitmap.
|
|
*/
|
|
static inline int pcpu_nr_pages_to_map_bits(int pages)
|
|
{
|
|
return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
|
|
}
|
|
|
|
/**
|
|
* pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
|
|
* @chunk: chunk of interest
|
|
*
|
|
* This conversion is from the number of physical pages that the chunk
|
|
* serves to the number of bits in the bitmap.
|
|
*/
|
|
static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
|
|
{
|
|
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
|
|
}
|
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
|
|
{
|
|
if (chunk->obj_cgroups)
|
|
return PCPU_CHUNK_MEMCG;
|
|
return PCPU_CHUNK_ROOT;
|
|
}
|
|
|
|
static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
|
|
{
|
|
return chunk_type == PCPU_CHUNK_MEMCG;
|
|
}
|
|
|
|
#else
|
|
static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
|
|
{
|
|
return PCPU_CHUNK_ROOT;
|
|
}
|
|
|
|
static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
|
|
{
|
|
return false;
|
|
}
|
|
#endif
|
|
|
|
static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
|
|
{
|
|
return &pcpu_chunk_lists[pcpu_nr_slots *
|
|
pcpu_is_memcg_chunk(chunk_type)];
|
|
}
|
|
|
|
#ifdef CONFIG_PERCPU_STATS
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
struct percpu_stats {
|
|
u64 nr_alloc; /* lifetime # of allocations */
|
|
u64 nr_dealloc; /* lifetime # of deallocations */
|
|
u64 nr_cur_alloc; /* current # of allocations */
|
|
u64 nr_max_alloc; /* max # of live allocations */
|
|
u32 nr_chunks; /* current # of live chunks */
|
|
u32 nr_max_chunks; /* max # of live chunks */
|
|
size_t min_alloc_size; /* min allocaiton size */
|
|
size_t max_alloc_size; /* max allocation size */
|
|
};
|
|
|
|
/* Statistics state; only declared here, defined outside this header. */
extern struct percpu_stats pcpu_stats;
extern struct pcpu_alloc_info pcpu_stats_ai;

/*
|
|
* For debug purposes. We don't care about the flexible array.
|
|
*/
|
|
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
|
|
{
|
|
memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));
|
|
|
|
/* initialize min_alloc_size to unit_size */
|
|
pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
|
|
}
|
|
|
|
/*
|
|
* pcpu_stats_area_alloc - increment area allocation stats
|
|
* @chunk: the location of the area being allocated
|
|
* @size: size of area to allocate in bytes
|
|
*
|
|
* CONTEXT:
|
|
* pcpu_lock.
|
|
*/
|
|
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
|
|
{
|
|
lockdep_assert_held(&pcpu_lock);
|
|
|
|
pcpu_stats.nr_alloc++;
|
|
pcpu_stats.nr_cur_alloc++;
|
|
pcpu_stats.nr_max_alloc =
|
|
max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc);
|
|
pcpu_stats.min_alloc_size =
|
|
min(pcpu_stats.min_alloc_size, size);
|
|
pcpu_stats.max_alloc_size =
|
|
max(pcpu_stats.max_alloc_size, size);
|
|
|
|
chunk->nr_alloc++;
|
|
chunk->max_alloc_size = max(chunk->max_alloc_size, size);
|
|
}
|
|
|
|
/*
|
|
* pcpu_stats_area_dealloc - decrement allocation stats
|
|
* @chunk: the location of the area being deallocated
|
|
*
|
|
* CONTEXT:
|
|
* pcpu_lock.
|
|
*/
|
|
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
|
|
{
|
|
lockdep_assert_held(&pcpu_lock);
|
|
|
|
pcpu_stats.nr_dealloc++;
|
|
pcpu_stats.nr_cur_alloc--;
|
|
|
|
chunk->nr_alloc--;
|
|
}
|
|
|
|
/*
|
|
* pcpu_stats_chunk_alloc - increment chunk stats
|
|
*/
|
|
static inline void pcpu_stats_chunk_alloc(void)
|
|
{
|
|
unsigned long flags;
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
|
|
|
pcpu_stats.nr_chunks++;
|
|
pcpu_stats.nr_max_chunks =
|
|
max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks);
|
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
|
}
|
|
|
|
/*
|
|
* pcpu_stats_chunk_dealloc - decrement chunk stats
|
|
*/
|
|
static inline void pcpu_stats_chunk_dealloc(void)
|
|
{
|
|
unsigned long flags;
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
|
|
|
pcpu_stats.nr_chunks--;
|
|
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
|
}
|
|
|
|
#else
|
|
|
|
/*
 * No-op stand-ins used when CONFIG_PERCPU_STATS is disabled, so callers can
 * invoke the pcpu_stats_*() hooks unconditionally.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
}

static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
}

static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
}

static inline void pcpu_stats_chunk_alloc(void)
{
}

static inline void pcpu_stats_chunk_dealloc(void)
{
}

#endif /* !CONFIG_PERCPU_STATS */
|
|
|
|
#endif
|