mirror of
https://github.com/torvalds/linux.git
synced 2024-11-30 16:11:38 +00:00
3c7be18ac9
Percpu memory is becoming more and more widely used by various subsystems, and the total amount of memory controlled by the percpu allocator can make up a good part of the total memory. As an example, bpf maps can consume a lot of percpu memory, and they are created by a user. Also, some cgroup internals (e.g. memory controller statistics) can be quite large. On a machine with many CPUs and a large number of cgroups they can consume hundreds of megabytes. So the lack of memcg accounting is creating a breach in the memory isolation. Similar to the slab memory, percpu memory should be accounted by default. To implement the percpu accounting it's possible to take the slab memory accounting as a model to follow. Let's introduce two types of percpu chunks: root and memcg. What makes memcg chunks different is an additional space allocated to store memcg membership information. If __GFP_ACCOUNT is passed on allocation, a memcg chunk should be used. If it's possible to charge the corresponding size to the target memory cgroup, allocation is performed, and the memcg ownership data is recorded. System-wide allocations are performed using root chunks, so there is no additional memory overhead. To implement a fast reparenting of percpu memory on memcg removal, we don't store mem_cgroup pointers directly: instead we use the obj_cgroup API, introduced for slab accounting. 
[akpm@linux-foundation.org: fix CONFIG_MEMCG_KMEM=n build errors and warning] [akpm@linux-foundation.org: move unreachable code, per Roman] [cuibixuan@huawei.com: mm/percpu: fix 'defined but not used' warning] Link: http://lkml.kernel.org/r/6d41b939-a741-b521-a7a2-e7296ec16219@huawei.com Signed-off-by: Roman Gushchin <guro@fb.com> Signed-off-by: Bixuan Cui <cuibixuan@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Shakeel Butt <shakeelb@google.com> Acked-by: Dennis Zhou <dennis@kernel.org> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Tejun Heo <tj@kernel.org> Cc: Tobin C. Harding <tobin@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Waiman Long <longman@redhat.com> Cc: Bixuan Cui <cuibixuan@huawei.com> Cc: Michal Koutný <mkoutny@suse.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Link: http://lkml.kernel.org/r/20200623184515.4132564-3-guro@fb.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
121 lines
3.0 KiB
C
121 lines
3.0 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* mm/percpu-km.c - kernel memory based chunk allocation
|
|
*
|
|
* Copyright (C) 2010 SUSE Linux Products GmbH
|
|
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
|
|
*
|
|
* Chunks are allocated as a contiguous kernel memory using gfp
|
|
* allocation. This is to be used on nommu architectures.
|
|
*
|
|
* To use percpu-km,
|
|
*
|
|
* - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig.
|
|
*
|
|
* - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's
|
|
* not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work
|
|
* fine.
|
|
*
|
|
* - NUMA is not supported. When setting up the first chunk,
|
|
* @cpu_distance_fn should be NULL or report all CPUs to be nearer
|
|
* than or at LOCAL_DISTANCE.
|
|
*
|
|
* - It's best if the chunk size is power of two multiple of
|
|
* PAGE_SIZE. Because each chunk is allocated as a contiguous
|
|
* kernel memory block using alloc_pages(), memory will be wasted if
|
|
* chunk size is not aligned. percpu-km code will whine about it.
|
|
*/
|
|
|
|
#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
|
|
#error "contiguous percpu allocation is incompatible with paged first chunk"
|
|
#endif
|
|
|
|
#include <linux/log2.h>
|
|
|
|
/**
 * pcpu_populate_chunk - populate a page range of a chunk (no-op here)
 * @chunk: chunk of interest (unused)
 * @page_start: first page index of the range (unused)
 * @page_end: end page index of the range (unused)
 * @gfp: allocation flags (unused)
 *
 * This implementation does no work for the requested range.
 * NOTE(review): presumably nothing is needed because pcpu_create_chunk()
 * reports the whole chunk as populated at creation time - confirm against
 * the callers in the core percpu allocator.
 *
 * Return: always 0.
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start,
			       int page_end, gfp_t gfp)
{
	return 0;
}
|
|
|
|
/**
 * pcpu_depopulate_chunk - depopulate a page range of a chunk (no-op here)
 * @chunk: chunk of interest (unused)
 * @page_start: first page index of the range (unused)
 * @page_end: end page index of the range (unused)
 *
 * Intentionally empty: this implementation releases nothing for the
 * requested range.  The backing pages are only given back as a whole in
 * pcpu_destroy_chunk().
 */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start,
				  int page_end)
{
	/* nothing to do */
}
|
|
|
|
static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
|
|
gfp_t gfp)
|
|
{
|
|
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
|
|
struct pcpu_chunk *chunk;
|
|
struct page *pages;
|
|
unsigned long flags;
|
|
int i;
|
|
|
|
chunk = pcpu_alloc_chunk(type, gfp);
|
|
if (!chunk)
|
|
return NULL;
|
|
|
|
pages = alloc_pages(gfp, order_base_2(nr_pages));
|
|
if (!pages) {
|
|
pcpu_free_chunk(chunk);
|
|
return NULL;
|
|
}
|
|
|
|
for (i = 0; i < nr_pages; i++)
|
|
pcpu_set_page_chunk(nth_page(pages, i), chunk);
|
|
|
|
chunk->data = pages;
|
|
chunk->base_addr = page_address(pages);
|
|
|
|
spin_lock_irqsave(&pcpu_lock, flags);
|
|
pcpu_chunk_populated(chunk, 0, nr_pages);
|
|
spin_unlock_irqrestore(&pcpu_lock, flags);
|
|
|
|
pcpu_stats_chunk_alloc();
|
|
trace_percpu_create_chunk(chunk->base_addr);
|
|
|
|
return chunk;
|
|
}
|
|
|
|
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
|
|
{
|
|
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
|
|
|
|
if (!chunk)
|
|
return;
|
|
|
|
pcpu_stats_chunk_dealloc();
|
|
trace_percpu_destroy_chunk(chunk->base_addr);
|
|
|
|
if (chunk->data)
|
|
__free_pages(chunk->data, order_base_2(nr_pages));
|
|
pcpu_free_chunk(chunk);
|
|
}
|
|
|
|
/**
 * pcpu_addr_to_page - translate an address to its struct page
 * @addr: address to translate
 *
 * Thin wrapper around virt_to_page().
 *
 * Return: the struct page that virt_to_page() yields for @addr.
 */
static struct page *pcpu_addr_to_page(void *addr)
{
	struct page *page = virt_to_page(addr);

	return page;
}
|
|
|
|
/**
 * pcpu_verify_alloc_info - check that @ai is usable by percpu-km
 * @ai: allocation info to verify
 *
 * Rejects any layout with other than exactly one group.  For an accepted
 * layout it also warns when the number of pages needed by the group is not
 * a power of two, since the difference up to the next power of two is
 * reported as wasted per chunk.
 *
 * Return: 0 if @ai is acceptable, -EINVAL if it does not have exactly one
 * group.
 */
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
{
	size_t used_pages;
	size_t backing_pages;

	/* all units must be in a single group */
	if (ai->nr_groups != 1) {
		pr_crit("can't handle more than one group\n");
		return -EINVAL;
	}

	used_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT;
	backing_pages = roundup_pow_of_two(used_pages);

	/* anything beyond used_pages is allocated but never handed out */
	if (backing_pages > used_pages)
		pr_warn("wasting %zu pages per chunk\n",
			backing_pages - used_pages);

	return 0;
}
|