From 24c6a097b5a270e05c6e99a99da66b91be81fd7d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:22 +0000 Subject: [PATCH 01/34] slub: Reflow ___slab_alloc() The get_partial() interface used in ___slab_alloc() may return a single object in the "kmem_cache_debug(s)" case, in which we will just return the "freelist" object. Move this handling up to prepare for later changes. And the "pfmemalloc_match()" part is not needed for node partial slab, since we already check this in the get_partial_node(). Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 63d281dfacdb..0b0fdc8c189f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3216,8 +3216,21 @@ new_objects: pc.slab = &slab; pc.orig_size = orig_size; freelist = get_partial(s, node, &pc); - if (freelist) - goto check_new_slab; + if (freelist) { + if (kmem_cache_debug(s)) { + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the + * tracking info and return the object. + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + + goto retry_load_slab; + } slub_put_cpu_ptr(s->cpu_slab); slab = new_slab(s, gfpflags, node); @@ -3253,20 +3266,6 @@ new_objects: inc_slabs_node(s, slab_nid(slab), slab->objects); -check_new_slab: - - if (kmem_cache_debug(s)) { - /* - * For debug caches here we had to go through - * alloc_single_from_partial() so just store the tracking info - * and return the object - */ - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr); - - return freelist; - } - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { /* * For !pfmemalloc_match() case we don't load freelist so that From 43c4c349149c77f27c8e5801755a7b8883a70ebe Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:23 +0000 Subject: [PATCH 02/34] slub: Change get_partial() interfaces to return slab We need all get_partial() related interfaces to return a slab, instead of returning the freelist (or object). Use the partial_context.object to return back freelist or object for now. This patch shouldn't have any functional changes. Suggested-by: Vlastimil Babka Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 63 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0b0fdc8c189f..03384cd965c5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -204,9 +204,9 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); /* Structure holding parameters for get_partial() call chain */ struct partial_context { - struct slab **slab; gfp_t flags; unsigned int orig_size; + void *object; }; static inline bool kmem_cache_debug(struct kmem_cache *s) @@ -2269,10 +2269,11 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ -static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct partial_context *pc) +static struct slab *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_context *pc) { - struct slab *slab, *slab2; + struct slab *slab, *slab2, *partial = NULL; void *object = NULL; unsigned long flags; unsigned int partial_slabs = 0; @@ -2288,27 +2289,28 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { - void *t; - if (!pfmemalloc_match(slab, pc->flags)) continue; if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { object = alloc_single_from_partial(s, n, slab, pc->orig_size); - if (object) + if (object) { + partial = slab; + pc->object = object; break; + } continue; } - t = acquire_slab(s, n, slab, object == NULL); - if (!t) + object = acquire_slab(s, n, slab, object == NULL); + if (!object) break; - if (!object) { - *pc->slab = slab; + if (!partial) { + partial = slab; + pc->object = object; stat(s, ALLOC_FROM_PARTIAL); - object = t; } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); @@ -2324,20 +2326,21 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, } spin_unlock_irqrestore(&n->list_lock, flags); - return object; + return partial; } /* * Get a slab from somewhere. Search in increasing NUMA distances. */ -static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) +static struct slab *get_any_partial(struct kmem_cache *s, + struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(pc->flags); - void *object; + struct slab *slab; unsigned int cpuset_mems_cookie; /* @@ -2372,8 +2375,8 @@ static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, pc); - if (object) { + slab = get_partial_node(s, n, pc); + if (slab) { /* * Don't check read_mems_allowed_retry() * here - if mems_allowed was updated in @@ -2381,7 +2384,7 @@ static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) * between allocation and the cpuset * update */ - return object; + return slab; } } } @@ -2393,17 +2396,18 @@ static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) /* * Get a partial slab, lock it and return it. */ -static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc) +static struct slab *get_partial(struct kmem_cache *s, int node, + struct partial_context *pc) { - void *object; + struct slab *slab; int searchnode = node; if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), pc); - if (object || node != NUMA_NO_NODE) - return object; + slab = get_partial_node(s, get_node(s, searchnode), pc); + if (slab || node != NUMA_NO_NODE) + return slab; return get_any_partial(s, pc); } @@ -3213,10 +3217,10 @@ new_slab: new_objects: pc.flags = gfpflags; - pc.slab = &slab; pc.orig_size = orig_size; - freelist = get_partial(s, node, &pc); - if (freelist) { + slab = get_partial(s, node, &pc); + if (slab) { + freelist = pc.object; if (kmem_cache_debug(s)) { /* * For debug caches here we had to go through @@ -3408,12 +3412,11 @@ static void *__slab_alloc_node(struct kmem_cache *s, void *object; pc.flags = gfpflags; - pc.slab = &slab; pc.orig_size = orig_size; - object = get_partial(s, node, &pc); + slab = get_partial(s, node, &pc); - if (object) - return object; + if (slab) + return pc.object; slab = new_slab(s, gfpflags, node); if (unlikely(!slab)) { From 8a399e2f60037ed07a55278e39b20e43dea4f0c2 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:24 +0000 Subject: [PATCH 03/34] slub: Keep track of whether slub is on the per-node partial list Now we rely on the "frozen" bit to see if we should manipulate the slab->slab_list, which will be changed in the following patch. Instead we introduce another way to keep track of whether slub is on the per-node partial list, here we reuse the PG_workingset bit. We have to use the atomic set_bit() and clear_bit() variants and change slab_unlock() to bit_spin_unlock() because when cmpxchg is not available and PG_lock is used, there may be concurrent operations on the two bits. Thanks to Mark Brown for reporting a hang and testing of a previous version where the non-atomic operations were used. Suggested-by: Matthew Wilcox Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 03384cd965c5..6efcbf79fd2d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -522,7 +522,7 @@ static __always_inline void slab_unlock(struct slab *slab) struct page *page = slab_page(slab); VM_BUG_ON_PAGE(PageTail(page), page); - __bit_spin_unlock(PG_locked, &page->flags); + bit_spin_unlock(PG_locked, &page->flags); } static inline bool @@ -2116,6 +2116,25 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) free_slab(s, slab); } +/* + * SLUB reuses PG_workingset bit to keep track of whether it's on + * the per-node partial list. + */ +static inline bool slab_test_node_partial(const struct slab *slab) +{ + return folio_test_workingset((struct folio *)slab_folio(slab)); +} + +static inline void slab_set_node_partial(struct slab *slab) +{ + set_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); +} + +static inline void slab_clear_node_partial(struct slab *slab) +{ + clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); +} + /* * Management of partially allocated slabs. */ @@ -2127,6 +2146,7 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) list_add_tail(&slab->slab_list, &n->partial); else list_add(&slab->slab_list, &n->partial); + slab_set_node_partial(slab); } static inline void add_partial(struct kmem_cache_node *n, @@ -2141,6 +2161,7 @@ static inline void remove_partial(struct kmem_cache_node *n, { lockdep_assert_held(&n->list_lock); list_del(&slab->slab_list); + slab_clear_node_partial(slab); n->nr_partial--; } @@ -4833,6 +4854,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); + slab_clear_node_partial(slab); n->nr_partial--; dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) From 422e7d54375889484b66962d1dcbc392a6bd9e7a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:25 +0000 Subject: [PATCH 04/34] slub: Prepare __slab_free() for unfrozen partial slab out of node partial list Now the partially empty slub will be frozen when taken out of node partial list, so the __slab_free() will know from "was_frozen" that the partially empty slab is not on node partial list and is a cpu or cpu partial slab of some cpu. But we will change this, make partial slabs leave the node partial list with unfrozen state, so we need to change __slab_free() to use the new slab_test_node_partial() we just introduced. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 6efcbf79fd2d..18f18fbbd97e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3631,6 +3631,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long counters; struct kmem_cache_node *n = NULL; unsigned long flags; + bool on_node_partial; stat(s, FREE_SLOWPATH); @@ -3678,6 +3679,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, */ spin_lock_irqsave(&n->list_lock, flags); + on_node_partial = slab_test_node_partial(slab); } } @@ -3706,6 +3708,15 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; } + /* + * This slab was partially empty but not on the per-node partial list, + * in which case we shouldn't manipulate its list, just return. + */ + if (prior && !on_node_partial) { + spin_unlock_irqrestore(&n->list_lock, flags); + return; + } + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; From 213094b5d1af7e6ab294a6d8f3b50cafb72642ae Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:26 +0000 Subject: [PATCH 05/34] slub: Introduce freeze_slab() We will have unfrozen slabs out of the node partial list later, so we need a freeze_slab() function to freeze the partial slab and get its freelist. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 18f18fbbd97e..253626ef9f37 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3098,6 +3098,33 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) return freelist; } +/* + * Freeze the partial slab and return the pointer to the freelist. + */ +static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) +{ + struct slab new; + unsigned long counters; + void *freelist; + + do { + freelist = slab->freelist; + counters = slab->counters; + + new.counters = counters; + VM_BUG_ON(new.frozen); + + new.inuse = slab->objects; + new.frozen = 1; + + } while (!slab_update_freelist(s, slab, + freelist, counters, + NULL, new.counters, + "freeze_slab")); + + return freelist; +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. From 8cd3fa428b56352beaa38df756c1d3f1556f5514 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:27 +0000 Subject: [PATCH 06/34] slub: Delay freezing of partial slabs Now we will freeze slabs when moving them out of node partial list to cpu partial list, this method needs two cmpxchg_double operations: 1. freeze slab (acquire_slab()) under the node list_lock 2. get_freelist() when pick used in ___slab_alloc() Actually we don't need to freeze when moving slabs out of node partial list, we can delay freezing to when use slab freelist in ___slab_alloc(), so we can save one cmpxchg_double(). And there are other good points: - The moving of slabs between node partial list and cpu partial list becomes simpler, since we don't need to freeze or unfreeze at all. - The node list_lock contention would be less, since we don't need to freeze any slab under the node list_lock. We can achieve this because there is no concurrent path would manipulate the partial slab list except the __slab_free() path, which is now serialized by slab_test_node_partial() under the list_lock. Since the slab returned by get_partial() interfaces is not frozen anymore and no freelist is returned in the partial_context, so we need to use the introduced freeze_slab() to freeze it and get its freelist. Similarly, the slabs on the CPU partial list are not frozen anymore, we need to freeze_slab() on it before use. We can now delete acquire_slab() as it became unused. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 115 ++++++++++++------------------------------------------ 1 file changed, 24 insertions(+), 91 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 253626ef9f37..5a5102a4c273 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2166,7 +2166,7 @@ static inline void remove_partial(struct kmem_cache_node *n, } /* - * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * Called only for kmem_cache_debug() caches instead of remove_partial(), with a * slab from the n->partial list. Remove only a single object from the slab, do * the alloc_debug_processing() checks and leave the slab on the list, or move * it to full list if it was the last free object. @@ -2234,51 +2234,6 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, return object; } -/* - * Remove slab from the partial list, freeze it and - * return the pointer to the freelist. - * - * Returns a list of objects or NULL if it fails. - */ -static inline void *acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct slab *slab, - int mode) -{ - void *freelist; - unsigned long counters; - struct slab new; - - lockdep_assert_held(&n->list_lock); - - /* - * Zap the freelist and set the frozen bit. - * The old freelist is the list of objects for the - * per cpu allocation list. - */ - freelist = slab->freelist; - counters = slab->counters; - new.counters = counters; - if (mode) { - new.inuse = slab->objects; - new.freelist = NULL; - } else { - new.freelist = freelist; - } - - VM_BUG_ON(new.frozen); - new.frozen = 1; - - if (!__slab_update_freelist(s, slab, - freelist, counters, - new.freelist, new.counters, - "acquire_slab")) - return NULL; - - remove_partial(n, slab); - WARN_ON(!freelist); - return freelist; -} - #ifdef CONFIG_SLUB_CPU_PARTIAL static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); #else @@ -2295,7 +2250,6 @@ static struct slab *get_partial_node(struct kmem_cache *s, struct partial_context *pc) { struct slab *slab, *slab2, *partial = NULL; - void *object = NULL; unsigned long flags; unsigned int partial_slabs = 0; @@ -2314,7 +2268,7 @@ static struct slab *get_partial_node(struct kmem_cache *s, continue; if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - object = alloc_single_from_partial(s, n, slab, + void *object = alloc_single_from_partial(s, n, slab, pc->orig_size); if (object) { partial = slab; @@ -2324,13 +2278,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, continue; } - object = acquire_slab(s, n, slab, object == NULL); - if (!object) - break; + remove_partial(n, slab); if (!partial) { partial = slab; - pc->object = object; stat(s, ALLOC_FROM_PARTIAL); } else { put_cpu_partial(s, slab, 0); @@ -2629,9 +2580,6 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) unsigned long flags = 0; while (partial_slab) { - struct slab new; - struct slab old; - slab = partial_slab; partial_slab = slab->next; @@ -2644,23 +2592,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) spin_lock_irqsave(&n->list_lock, flags); } - do { - - old.freelist = slab->freelist; - old.counters = slab->counters; - VM_BUG_ON(!old.frozen); - - new.counters = old.counters; - new.freelist = old.freelist; - - new.frozen = 0; - - } while (!__slab_update_freelist(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")); - - if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { slab->next = slab_to_discard; slab_to_discard = slab; } else { @@ -3167,7 +3099,6 @@ reread_slab: node = NUMA_NO_NODE; goto new_slab; } -redo: if (unlikely(!node_match(slab, node))) { /* @@ -3243,7 +3174,8 @@ deactivate_slab: new_slab: - if (slub_percpu_partial(c)) { +#ifdef CONFIG_SLUB_CPU_PARTIAL + while (slub_percpu_partial(c)) { local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->slab)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); @@ -3255,12 +3187,22 @@ new_slab: goto new_objects; } - slab = c->slab = slub_percpu_partial(c); + slab = slub_percpu_partial(c); slub_set_percpu_partial(c, slab); local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); - goto redo; + + if (unlikely(!node_match(slab, node) || + !pfmemalloc_match(slab, gfpflags))) { + slab->next = NULL; + __unfreeze_partials(s, slab); + continue; + } + + freelist = freeze_slab(s, slab); + goto retry_load_slab; } +#endif new_objects: @@ -3268,8 +3210,8 @@ new_objects: pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { - freelist = pc.object; if (kmem_cache_debug(s)) { + freelist = pc.object; /* * For debug caches here we had to go through * alloc_single_from_partial() so just store the @@ -3281,6 +3223,7 @@ new_objects: return freelist; } + freelist = freeze_slab(s, slab); goto retry_load_slab; } @@ -3682,18 +3625,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, was_frozen = new.frozen; new.inuse -= cnt; if ((!new.inuse || !prior) && !was_frozen) { - - if (kmem_cache_has_cpu_partial(s) && !prior) { - - /* - * Slab was on no list before and will be - * partially empty - * We can defer the list move and instead - * freeze it. - */ - new.frozen = 1; - - } else { /* Needs to be taken off a list */ + /* Needs to be taken off a list */ + if (!kmem_cache_has_cpu_partial(s) || prior) { n = get_node(s, slab_nid(slab)); /* @@ -3723,9 +3656,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * activity can be necessary. */ stat(s, FREE_FROZEN); - } else if (new.frozen) { + } else if (kmem_cache_has_cpu_partial(s) && !prior) { /* - * If we just froze the slab then put it onto the + * If we started with a full slab then put it onto the * per cpu partial list. */ put_cpu_partial(s, slab, 1); From 00eb60c28815e22690834b2e3951ded0cd300b8d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:28 +0000 Subject: [PATCH 07/34] slub: Optimize deactivate_slab() Since the introduce of unfrozen slabs on cpu partial list, we don't need to synchronize the slab frozen state under the node list_lock. The caller of deactivate_slab() and the caller of __slab_free() won't manipulate the slab list concurrently. So we can get node list_lock in the last stage if we really need to manipulate the slab list in this path. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 81 +++++++++++++++++++------------------------------------ 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 5a5102a4c273..47655f2fe55a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2468,10 +2468,8 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); int free_delta = 0; - enum slab_modes mode = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; @@ -2509,65 +2507,40 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, /* * Stage two: Unfreeze the slab while splicing the per-cpu * freelist to the head of slab's freelist. - * - * Ensure that the slab is unfrozen while the list presence - * reflects the actual number of objects during unfreeze. - * - * We first perform cmpxchg holding lock and insert to list - * when it succeed. If there is mismatch then the slab is not - * unfrozen and number of objects in the slab may have changed. - * Then release lock and retry cmpxchg again. */ -redo: + do { + old.freelist = READ_ONCE(slab->freelist); + old.counters = READ_ONCE(slab->counters); + VM_BUG_ON(!old.frozen); - old.freelist = READ_ONCE(slab->freelist); - old.counters = READ_ONCE(slab->counters); - VM_BUG_ON(!old.frozen); - - /* Determine target state of the slab */ - new.counters = old.counters; - if (freelist_tail) { - new.inuse -= free_delta; - set_freepointer(s, freelist_tail, old.freelist); - new.freelist = freelist; - } else - new.freelist = old.freelist; - - new.frozen = 0; + /* Determine target state of the slab */ + new.counters = old.counters; + new.frozen = 0; + if (freelist_tail) { + new.inuse -= free_delta; + set_freepointer(s, freelist_tail, old.freelist); + new.freelist = freelist; + } else { + new.freelist = old.freelist; + } + } while (!slab_update_freelist(s, slab, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + /* + * Stage three: Manipulate the slab list based on the updated state. + */ if (!new.inuse && n->nr_partial >= s->min_partial) { - mode = M_FREE; - } else if (new.freelist) { - mode = M_PARTIAL; - /* - * Taking the spinlock removes the possibility that - * acquire_slab() will see a slab that is frozen - */ - spin_lock_irqsave(&n->list_lock, flags); - } else { - mode = M_FULL_NOLIST; - } - - - if (!slab_update_freelist(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")) { - if (mode == M_PARTIAL) - spin_unlock_irqrestore(&n->list_lock, flags); - goto redo; - } - - - if (mode == M_PARTIAL) { - add_partial(n, slab, tail); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, tail); - } else if (mode == M_FREE) { stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); - } else if (mode == M_FULL_NOLIST) { + } else if (new.freelist) { + spin_lock_irqsave(&n->list_lock, flags); + add_partial(n, slab, tail); + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, tail); + } else { stat(s, DEACTIVATE_FULL); } } From 21316fdc799932ff43fa00a6d6a45b16dbd77844 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:29 +0000 Subject: [PATCH 08/34] slub: Rename all *unfreeze_partials* functions to *put_partials* Since all partial slabs on the CPU partial list are not frozen anymore, we don't unfreeze when moving cpu partial slabs to node partial list, it's better to rename these functions. Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 47655f2fe55a..fe5fcf074dfd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2546,7 +2546,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } #ifdef CONFIG_SLUB_CPU_PARTIAL -static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) +static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { struct kmem_cache_node *n = NULL, *n2 = NULL; struct slab *slab, *slab_to_discard = NULL; @@ -2588,9 +2588,9 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) } /* - * Unfreeze all the cpu partial slabs. + * Put all the cpu partial slabs to the node partial list. */ -static void unfreeze_partials(struct kmem_cache *s) +static void put_partials(struct kmem_cache *s) { struct slab *partial_slab; unsigned long flags; @@ -2601,11 +2601,11 @@ static void unfreeze_partials(struct kmem_cache *s) local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (partial_slab) - __unfreeze_partials(s, partial_slab); + __put_partials(s, partial_slab); } -static void unfreeze_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) +static void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { struct slab *partial_slab; @@ -2613,7 +2613,7 @@ static void unfreeze_partials_cpu(struct kmem_cache *s, c->partial = NULL; if (partial_slab) - __unfreeze_partials(s, partial_slab); + __put_partials(s, partial_slab); } /* @@ -2626,7 +2626,7 @@ static void unfreeze_partials_cpu(struct kmem_cache *s, static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) { struct slab *oldslab; - struct slab *slab_to_unfreeze = NULL; + struct slab *slab_to_put = NULL; unsigned long flags; int slabs = 0; @@ -2641,7 +2641,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) * per node partial list. Postpone the actual unfreezing * outside of the critical section. */ - slab_to_unfreeze = oldslab; + slab_to_put = oldslab; oldslab = NULL; } else { slabs = oldslab->slabs; @@ -2657,17 +2657,17 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) local_unlock_irqrestore(&s->cpu_slab->lock, flags); - if (slab_to_unfreeze) { - __unfreeze_partials(s, slab_to_unfreeze); + if (slab_to_put) { + __put_partials(s, slab_to_put); stat(s, CPU_PARTIAL_DRAIN); } } #else /* CONFIG_SLUB_CPU_PARTIAL */ -static inline void unfreeze_partials(struct kmem_cache *s) { } -static inline void unfreeze_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) { } +static inline void put_partials(struct kmem_cache *s) { } +static inline void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } #endif /* CONFIG_SLUB_CPU_PARTIAL */ @@ -2709,7 +2709,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) stat(s, CPUSLAB_FLUSH); } - unfreeze_partials_cpu(s, c); + put_partials_cpu(s, c); } struct slub_flush_work { @@ -2737,7 +2737,7 @@ static void flush_cpu_slab(struct work_struct *w) if (c->slab) flush_slab(s, c); - unfreeze_partials(s); + put_partials(s); } static bool has_cpu_slab(int cpu, struct kmem_cache *s) @@ -3168,7 +3168,7 @@ new_slab: if (unlikely(!node_match(slab, node) || !pfmemalloc_match(slab, gfpflags))) { slab->next = NULL; - __unfreeze_partials(s, slab); + __put_partials(s, slab); continue; } From 31bda717d7777b8b6cf542af2730651ad6bb4839 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 2 Nov 2023 03:23:30 +0000 Subject: [PATCH 09/34] slub: Update frozen slabs documentations in the source The current updated scheme (which this series implemented) is: - node partial slabs: PG_Workingset && !frozen - cpu partial slabs: !PG_Workingset && !frozen - cpu slabs: !PG_Workingset && frozen - full slabs: !PG_Workingset && !frozen The most important change is that "frozen" bit is not set for the cpu partial slabs anymore, __slab_free() will grab node list_lock then check by !PG_Workingset that it's not on a node partial list. And the "frozen" bit is still kept for the cpu slabs for performance, since we don't need to grab node list_lock to check whether the PG_Workingset is set or not if the "frozen" bit is set in __slab_free(). Update related documentations and comments in the source. Signed-off-by: Chengming Zhou Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Christoph Lameter (Ampere) Signed-off-by: Vlastimil Babka --- mm/slub.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index fe5fcf074dfd..4fc203a4fa03 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -76,13 +76,28 @@ * * Frozen slabs * - * If a slab is frozen then it is exempt from list management. It is not - * on any list except per cpu partial list. The processor that froze the + * If a slab is frozen then it is exempt from list management. It is + * the cpu slab which is actively allocated from by the processor that + * froze it and it is not on any list. The processor that froze the * slab is the one who can perform list operations on the slab. Other * processors may put objects onto the freelist but the processor that * froze the slab is the only one that can retrieve the objects from the * slab's freelist. * + * CPU partial slabs + * + * The partially empty slabs cached on the CPU partial list are used + * for performance reasons, which speeds up the allocation process. + * These slabs are not frozen, but are also exempt from list management, + * by clearing the PG_workingset flag when moving out of the node + * partial list. Please see __slab_free() for more details. + * + * To sum up, the current scheme is: + * - node partial slab: PG_Workingset && !frozen + * - cpu partial slab: !PG_Workingset && !frozen + * - cpu slab: !PG_Workingset && frozen + * - full slab: !PG_Workingset && !frozen + * * list_lock * * The list_lock protects the partial and full list on each node and @@ -2617,8 +2632,7 @@ static void put_partials_cpu(struct kmem_cache *s, } /* - * Put a slab that was just frozen (in __slab_free|get_partial_node) into a - * partial slab slot if available. + * Put a slab into a partial slab slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. From 0445ee000498ec1a5b1ed31bf35816cbeaef5e1e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 20 Nov 2023 17:11:10 +0100 Subject: [PATCH 10/34] mm/slab, docs: switch mm-api docs generation from slab.c to slub.c The SLAB implementation is going to be removed, and mm-api.rst currently uses mm/slab.c to obtain kerneldocs for some API functions. Switch it to mm/slub.c and move the relevant kerneldocs of exported functions from one to the other. The rest of kerneldocs in slab.c is for static SLAB implementation-specific functions that don't have counterparts in slub.c and thus can be simply removed with the implementation. Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- Documentation/core-api/mm-api.rst | 2 +- mm/slab.c | 21 --------------------- mm/slub.c | 21 +++++++++++++++++++++ 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 2d091c873d1e..af8151db88b2 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -37,7 +37,7 @@ The Slab Cache .. kernel-doc:: include/linux/slab.h :internal: -.. kernel-doc:: mm/slab.c +.. kernel-doc:: mm/slub.c :export: .. kernel-doc:: mm/slab_common.c diff --git a/mm/slab.c b/mm/slab.c index 9ad3d0f2d1a5..37efe3241f9c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3491,19 +3491,6 @@ error: } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -/** - * kmem_cache_alloc_node - Allocate an object on the specified node - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * @nodeid: node number of the target node. - * - * Identical to kmem_cache_alloc but it will allocate memory on the given - * node, which can improve the performance for cpu bound structures. - * - * Fallback to other node is possible if __GFP_THISNODE is not set. - * - * Return: pointer to the new object or %NULL in case of error - */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); @@ -3564,14 +3551,6 @@ void __kmem_cache_free(struct kmem_cache *cachep, void *objp, __do_kmem_cache_free(cachep, objp, caller); } -/** - * kmem_cache_free - Deallocate an object - * @cachep: The cache the allocation was from. - * @objp: The previously allocated object. - * - * Free an object which was previously allocated from this - * cache. - */ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { cachep = cache_from_obj(cachep, objp); diff --git a/mm/slub.c b/mm/slub.c index 63d281dfacdb..3e01731783df 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3518,6 +3518,19 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, caller, orig_size); } +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @s: The cache to allocate from. + * @gfpflags: See kmalloc(). + * @node: node number of the target node. + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. + * + * Return: pointer to the new object or %NULL in case of error + */ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); @@ -3822,6 +3835,14 @@ void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); } +/** + * kmem_cache_free - Deallocate an object + * @s: The cache the allocation was from. + * @x: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. + */ void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); From 2a19be61a65157b9c6c25e831392cdefbd0a8940 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 15:43:03 +0200 Subject: [PATCH 11/34] mm/slab: remove CONFIG_SLAB from all Kconfig and Makefile Remove CONFIG_SLAB, CONFIG_DEBUG_SLAB, CONFIG_SLAB_DEPRECATED and everything in Kconfig files and mm/Makefile that depends on those. Since SLUB is the only remaining allocator, remove the allocator choice, make CONFIG_SLUB a "def_bool y" for now and remove all explicit dependencies on SLUB or SLAB as it's now always enabled. Make every option's verbose name and description refer to "the slab allocator" without refering to the specific implementation. Do not rename the CONFIG_ option names yet. Everything under #ifdef CONFIG_SLAB, and mm/slab.c is now dead code, all code under #ifdef CONFIG_SLUB is now always compiled. Reviewed-by: Kees Cook Reviewed-by: Christoph Lameter Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- arch/arm64/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/x86/Kconfig | 2 +- lib/Kconfig.debug | 1 - lib/Kconfig.kasan | 11 ++------ lib/Kconfig.kfence | 2 +- lib/Kconfig.kmsan | 2 +- mm/Kconfig | 68 ++++++++++------------------------------------ mm/Kconfig.debug | 16 +++-------- mm/Makefile | 6 +--- 10 files changed, 28 insertions(+), 84 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7b071a00425d..325b7140b576 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -154,7 +154,7 @@ config ARM64 select HAVE_MOVE_PUD select HAVE_PCI select HAVE_ACPI_APEI if (ACPI && EFI) - select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE select HAVE_ARCH_COMPILER_H diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3bec98d20283..afa42a6f2e09 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -146,7 +146,7 @@ config S390 select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI - select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3762f41bb092..3f460f334d4e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -169,7 +169,7 @@ config X86 select HAS_IOPORT select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI - select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_HUGE_VMALLOC if X86_64 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cc7d53d9dc01..e1765face106 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1985,7 +1985,6 @@ config FAULT_INJECTION config FAILSLAB bool "Fault-injection capability for kmalloc" depends on FAULT_INJECTION - depends on SLAB || SLUB help Provide fault-injection capability for kmalloc. diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index fdca89c05745..97e1fdbb5910 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -37,7 +37,7 @@ menuconfig KASAN (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)) && \ CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \ HAVE_ARCH_KASAN_HW_TAGS - depends on (SLUB && SYSFS && !SLUB_TINY) || (SLAB && !DEBUG_SLAB) + depends on SYSFS && !SLUB_TINY select STACKDEPOT_ALWAYS_INIT help Enables KASAN (Kernel Address Sanitizer) - a dynamic memory safety @@ -78,7 +78,7 @@ config KASAN_GENERIC bool "Generic KASAN" depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS - select SLUB_DEBUG if SLUB + select SLUB_DEBUG select CONSTRUCTORS help Enables Generic KASAN. @@ -89,13 +89,11 @@ config KASAN_GENERIC overhead of ~50% for dynamic allocations. The performance slowdown is ~x3. - (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) - config KASAN_SW_TAGS bool "Software Tag-Based KASAN" depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS - select SLUB_DEBUG if SLUB + select SLUB_DEBUG select CONSTRUCTORS help Enables Software Tag-Based KASAN. @@ -110,12 +108,9 @@ config KASAN_SW_TAGS May potentially introduce problems related to pointer casting and comparison, as it embeds a tag into the top byte of each pointer. - (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) - config KASAN_HW_TAGS bool "Hardware Tag-Based KASAN" depends on HAVE_ARCH_KASAN_HW_TAGS - depends on SLUB help Enables Hardware Tag-Based KASAN. diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index 459dda9ef619..6fbbebec683a 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -5,7 +5,7 @@ config HAVE_ARCH_KFENCE menuconfig KFENCE bool "KFENCE: low-overhead sampling-based memory safety error detector" - depends on HAVE_ARCH_KFENCE && (SLAB || SLUB) + depends on HAVE_ARCH_KFENCE select STACKTRACE select IRQ_WORK help diff --git a/lib/Kconfig.kmsan b/lib/Kconfig.kmsan index ef2c8f256c57..0541d7b079cc 100644 --- a/lib/Kconfig.kmsan +++ b/lib/Kconfig.kmsan @@ -11,7 +11,7 @@ config HAVE_KMSAN_COMPILER config KMSAN bool "KMSAN: detector of uninitialized values use" depends on HAVE_ARCH_KMSAN && HAVE_KMSAN_COMPILER - depends on SLUB && DEBUG_KERNEL && !KASAN && !KCSAN + depends on DEBUG_KERNEL && !KASAN && !KCSAN depends on !PREEMPT_RT select STACKDEPOT select STACKDEPOT_ALWAYS_INIT diff --git a/mm/Kconfig b/mm/Kconfig index 89971a894b60..4636870499bb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -226,52 +226,17 @@ config ZSMALLOC_CHAIN_SIZE For more information, see zsmalloc documentation. -menu "SLAB allocator options" - -choice - prompt "Choose SLAB allocator" - default SLUB - help - This option allows to select a slab allocator. - -config SLAB_DEPRECATED - bool "SLAB (DEPRECATED)" - depends on !PREEMPT_RT - help - Deprecated and scheduled for removal in a few cycles. Replaced by - SLUB. - - If you cannot migrate to SLUB, please contact linux-mm@kvack.org - and the people listed in the SLAB ALLOCATOR section of MAINTAINERS - file, explaining why. - - The regular slab allocator that is established and known to work - well in all environments. It organizes cache hot objects in - per cpu and per node queues. +menu "Slab allocator options" config SLUB - bool "SLUB (Unqueued Allocator)" - help - SLUB is a slab allocator that minimizes cache line usage - instead of managing queues of cached objects (SLAB approach). - Per cpu caching is realized using slabs of objects instead - of queues of objects. SLUB can use memory efficiently - and has enhanced diagnostics. SLUB is the default choice for - a slab allocator. - -endchoice - -config SLAB - bool - default y - depends on SLAB_DEPRECATED + def_bool y config SLUB_TINY - bool "Configure SLUB for minimal memory footprint" - depends on SLUB && EXPERT + bool "Configure for minimal memory footprint" + depends on EXPERT select SLAB_MERGE_DEFAULT help - Configures the SLUB allocator in a way to achieve minimal memory + Configures the slab allocator in a way to achieve minimal memory footprint, sacrificing scalability, debugging and other features. This is intended only for the smallest system that had used the SLOB allocator and is not recommended for systems with more than @@ -282,7 +247,6 @@ config SLUB_TINY config SLAB_MERGE_DEFAULT bool "Allow slab caches to be merged" default y - depends on SLAB || SLUB help For reduced kernel memory fragmentation, slab caches can be merged when they share the same size and other characteristics. @@ -296,7 +260,7 @@ config SLAB_MERGE_DEFAULT config SLAB_FREELIST_RANDOM bool "Randomize slab freelist" - depends on SLAB || (SLUB && !SLUB_TINY) + depends on !SLUB_TINY help Randomizes the freelist order used on creating new pages. This security feature reduces the predictability of the kernel slab @@ -304,21 +268,19 @@ config SLAB_FREELIST_RANDOM config SLAB_FREELIST_HARDENED bool "Harden slab freelist metadata" - depends on SLAB || (SLUB && !SLUB_TINY) + depends on !SLUB_TINY help Many kernel heap attacks try to target slab cache metadata and other infrastructure. This options makes minor performance sacrifices to harden the kernel slab allocator against common - freelist exploit methods. Some slab implementations have more - sanity-checking than others. This option is most effective with - CONFIG_SLUB. + freelist exploit methods. config SLUB_STATS default n - bool "Enable SLUB performance statistics" - depends on SLUB && SYSFS && !SLUB_TINY + bool "Enable performance statistics" + depends on SYSFS && !SLUB_TINY help - SLUB statistics are useful to debug SLUBs allocation behavior in + The statistics are useful to debug slab allocation behavior in order find ways to optimize the allocator. This should never be enabled for production use since keeping statistics slows down the allocator by a few percentage points. The slabinfo command @@ -328,8 +290,8 @@ config SLUB_STATS config SLUB_CPU_PARTIAL default y - depends on SLUB && SMP && !SLUB_TINY - bool "SLUB per cpu partial cache" + depends on SMP && !SLUB_TINY + bool "Enable per cpu partial caches" help Per cpu partial caches accelerate objects allocation and freeing that is local to a processor at the price of more indeterminism @@ -339,7 +301,7 @@ config SLUB_CPU_PARTIAL config RANDOM_KMALLOC_CACHES default n - depends on SLUB && !SLUB_TINY + depends on !SLUB_TINY bool "Randomize slab caches for normal kmalloc" help A hardening feature that creates multiple copies of slab caches for @@ -354,7 +316,7 @@ config RANDOM_KMALLOC_CACHES limited degree of memory and CPU overhead that relates to hardware and system workload. -endmenu # SLAB allocator options +endmenu # Slab allocator options config SHUFFLE_PAGE_ALLOCATOR bool "Page allocator randomization" diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 018a5bd2f576..321ab379994f 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -45,18 +45,10 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT Enable debug page memory allocations by default? This value can be overridden by debug_pagealloc=off|on. -config DEBUG_SLAB - bool "Debug slab memory allocations" - depends on DEBUG_KERNEL && SLAB - help - Say Y here to have the kernel do limited verification on memory - allocation as well as poisoning memory on free to catch use of freed - memory. This can make kmalloc/kfree-intensive workloads much slower. - config SLUB_DEBUG default y bool "Enable SLUB debugging support" if EXPERT - depends on SLUB && SYSFS && !SLUB_TINY + depends on SYSFS && !SLUB_TINY select STACKDEPOT if STACKTRACE_SUPPORT help SLUB has extensive debug support features. Disabling these can @@ -66,7 +58,7 @@ config SLUB_DEBUG config SLUB_DEBUG_ON bool "SLUB debugging on by default" - depends on SLUB && SLUB_DEBUG + depends on SLUB_DEBUG select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT default n help @@ -231,8 +223,8 @@ config DEBUG_KMEMLEAK allocations. See Documentation/dev-tools/kmemleak.rst for more details. - Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances - of finding leaks due to the slab objects poisoning. + Enabling SLUB_DEBUG may increase the chances of finding leaks + due to the slab objects poisoning. In order to access the kmemleak file, debugfs needs to be mounted (usually at /sys/kernel/debug). diff --git a/mm/Makefile b/mm/Makefile index 33873c8aedb3..e4b5b75aaec9 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -4,7 +4,6 @@ # KASAN_SANITIZE_slab_common.o := n -KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n KCSAN_SANITIZE_kmemleak.o := n @@ -12,7 +11,6 @@ KCSAN_SANITIZE_kmemleak.o := n # the same word but accesses to different bits of that word. Re-enable KCSAN # for these when we have more consensus on what to do about them. KCSAN_SANITIZE_slab_common.o := n -KCSAN_SANITIZE_slab.o := n KCSAN_SANITIZE_slub.o := n KCSAN_SANITIZE_page_alloc.o := n # But enable explicit instrumentation for memory barriers. @@ -22,7 +20,6 @@ KCSAN_INSTRUMENT_BARRIERS := y # flaky coverage that is not a function of syscall inputs. E.g. slab is out of # free pages, or a task is migrated between nodes. KCOV_INSTRUMENT_slab_common.o := n -KCOV_INSTRUMENT_slab.o := n KCOV_INSTRUMENT_slub.o := n KCOV_INSTRUMENT_page_alloc.o := n KCOV_INSTRUMENT_debug-pagealloc.o := n @@ -66,6 +63,7 @@ obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o obj-y += $(memory-hotplug-y) +obj-y += slub.o ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o @@ -82,8 +80,6 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += page_poison.o -obj-$(CONFIG_SLAB) += slab.o -obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_KMSAN) += kmsan/ From 72786c0a3dc5d4151469f512909049a0b17ada3d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:17:16 +0200 Subject: [PATCH 12/34] KASAN: remove code paths guarded by CONFIG_SLAB With SLAB removed and SLUB the only remaining allocator, we can clean up some code that was depending on the choice. Reviewed-by: Kees Cook Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/kasan/common.c | 13 ++----------- mm/kasan/kasan.h | 3 +-- mm/kasan/quarantine.c | 7 ------- 3 files changed, 3 insertions(+), 20 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 256930da578a..5d95219e69d7 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -153,10 +153,6 @@ void __kasan_poison_object_data(struct kmem_cache *cache, void *object) * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be * accessed after being freed. We preassign tags for objects in these * caches as well. - * 3. For SLAB allocator we can't preassign tags randomly since the freelist - * is stored as an array of indexes instead of a linked list. Assign tags - * based on objects indexes, so that objects that are next to each other - * get different tags. */ static inline u8 assign_tag(struct kmem_cache *cache, const void *object, bool init) @@ -171,17 +167,12 @@ static inline u8 assign_tag(struct kmem_cache *cache, if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) return init ? KASAN_TAG_KERNEL : kasan_random_tag(); - /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ -#ifdef CONFIG_SLAB - /* For SLAB assign tags based on the object index in the freelist. */ - return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object); -#else /* - * For SLUB assign a random tag during slab creation, otherwise reuse + * For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU, + * assign a random tag during slab creation, otherwise reuse * the already assigned tag. */ return init ? kasan_random_tag() : get_tag(object); -#endif } void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8b06bab5c406..eef50233640a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -373,8 +373,7 @@ void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object); -#if defined(CONFIG_KASAN_GENERIC) && \ - (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) +#ifdef CONFIG_KASAN_GENERIC bool kasan_quarantine_put(struct kmem_cache *cache, void *object); void kasan_quarantine_reduce(void); void kasan_quarantine_remove_cache(struct kmem_cache *cache); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index ca4529156735..138c57b836f2 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -144,10 +144,6 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) { void *object = qlink_to_object(qlink, cache); struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); - unsigned long flags; - - if (IS_ENABLED(CONFIG_SLAB)) - local_irq_save(flags); /* * If init_on_free is enabled and KASAN's free metadata is stored in @@ -166,9 +162,6 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; ___cache_free(cache, object, _THIS_IP_); - - if (IS_ENABLED(CONFIG_SLAB)) - local_irq_restore(flags); } static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) From a745b067db0f0711974063deb78c6639c24ec5bf Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:28:44 +0200 Subject: [PATCH 13/34] KFENCE: cleanup kfence_guarded_alloc() after CONFIG_SLAB removal Some struct slab fields are initialized differently for SLAB and SLUB so we can simplify with SLUB being the only remaining allocator. Reviewed-by: Kees Cook Reviewed-by: Marco Elver Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/kfence/core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 3872528d0963..8350f5c06f2e 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -463,11 +463,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g /* Set required slab fields. */ slab = virt_to_slab((void *)meta->addr); slab->slab_cache = cache; -#if defined(CONFIG_SLUB) slab->objects = 1; -#elif defined(CONFIG_SLAB) - slab->s_mem = addr; -#endif /* Memory initialization. */ set_canary(meta); From bc3dcb850f1818528fcafb10dd38a4590d9119e3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:33:43 +0200 Subject: [PATCH 14/34] mm/memcontrol: remove CONFIG_SLAB #ifdef guards With SLAB removed, these are never true anymore so we can clean up. Reviewed-by: Kees Cook Acked-by: Michal Hocko Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 774bd6e21e27..947fb50eba31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5149,7 +5149,7 @@ out_kfree: return ret; } -#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) static int mem_cgroup_slab_show(struct seq_file *m, void *p) { /* @@ -5258,8 +5258,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#if defined(CONFIG_MEMCG_KMEM) && \ - (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) { .name = "kmem.slabinfo", .seq_show = mem_cgroup_slab_show, From 70da1d01edf6da3fde1df98b2125a77083a0fb82 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:36:55 +0200 Subject: [PATCH 15/34] cpu/hotplug: remove CPUHP_SLAB_PREPARE hooks The CPUHP_SLAB_PREPARE hooks are only used by SLAB which is removed. SLUB defines them as NULL, so we can remove those altogether. Acked-by: Thomas Gleixner Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/cpuhotplug.h | 1 - include/linux/slab.h | 8 -------- kernel/cpu.c | 5 ----- 3 files changed, 14 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d305db70674b..07cb8f7030b6 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -108,7 +108,6 @@ enum cpuhp_state { CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, - CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_CPUIDLE_COUPLED_PREPARE, diff --git a/include/linux/slab.h b/include/linux/slab.h index d6d6ffeeb9a2..34e43cddc520 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -788,12 +788,4 @@ size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); -#if defined(CONFIG_SMP) && defined(CONFIG_SLAB) -int slab_prepare_cpu(unsigned int cpu); -int slab_dead_cpu(unsigned int cpu); -#else -#define slab_prepare_cpu NULL -#define slab_dead_cpu NULL -#endif - #endif /* _LINUX_SLAB_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 9e4c6780adde..530b026d95a1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2125,11 +2125,6 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = relay_prepare_cpu, .teardown.single = NULL, }, - [CPUHP_SLAB_PREPARE] = { - .name = "slab:prepare", - .startup.single = slab_prepare_cpu, - .teardown.single = slab_dead_cpu, - }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, From a9e0b9f27266d46ed6e73aac8d0844602cd0cb93 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 17:43:38 +0200 Subject: [PATCH 16/34] mm/slab: remove CONFIG_SLAB code from slab common code In slab_common.c and slab.h headers, we can now remove all code behind CONFIG_SLAB and CONFIG_DEBUG_SLAB ifdefs, and remove all CONFIG_SLUB ifdefs. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 14 ++------- mm/slab.h | 69 ++++---------------------------------------- mm/slab_common.c | 22 ++------------ 3 files changed, 9 insertions(+), 96 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 34e43cddc520..b2015d0e01ad 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -24,7 +24,7 @@ /* * Flags to pass to kmem_cache_create(). - * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. + * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op */ /* DEBUG: Perform (expensive) checks on alloc/free */ #define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) @@ -302,25 +302,15 @@ static inline unsigned int arch_slab_minalign(void) * Kmalloc array related definitions */ -#ifdef CONFIG_SLAB /* - * SLAB and SLUB directly allocates requests fitting in to an order-1 page + * SLUB directly allocates requests fitting in to an order-1 page * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW -#define KMALLOC_SHIFT_LOW 5 -#endif -#endif - -#ifdef CONFIG_SLUB -#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) -#ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif -#endif /* Maximum allocatable size */ #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) diff --git a/mm/slab.h b/mm/slab.h index 3d07fb428393..014c36ea51fa 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -42,21 +42,6 @@ typedef union { struct slab { unsigned long __page_flags; -#if defined(CONFIG_SLAB) - - struct kmem_cache *slab_cache; - union { - struct { - struct list_head slab_list; - void *freelist; /* array of free object indexes */ - void *s_mem; /* first object */ - }; - struct rcu_head rcu_head; - }; - unsigned int active; - -#elif defined(CONFIG_SLUB) - struct kmem_cache *slab_cache; union { struct { @@ -91,10 +76,6 @@ struct slab { }; unsigned int __unused; -#else -#error "Unexpected slab allocator configured" -#endif - atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; @@ -111,7 +92,7 @@ SLAB_MATCH(memcg_data, memcg_data); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); -#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB) +#if defined(system_has_freelist_aba) static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); #endif @@ -228,13 +209,7 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#ifdef CONFIG_SLAB -#include -#endif - -#ifdef CONFIG_SLUB #include -#endif #include #include @@ -320,26 +295,16 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s) SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) -#if defined(CONFIG_DEBUG_SLAB) -#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) -#elif defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #else #define SLAB_DEBUG_FLAGS (0) #endif -#if defined(CONFIG_SLAB) -#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ - SLAB_ACCOUNT | SLAB_NO_MERGE) -#elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | SLAB_ACCOUNT | \ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) -#else -#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) -#endif /* Common flags available with current configuration */ #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) @@ -672,18 +637,14 @@ size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) { -#ifndef CONFIG_SLUB - return s->object_size; - -#else /* CONFIG_SLUB */ -# ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SLUB_DEBUG /* * Debugging requires use of the padding between object * and whatever may come after it. */ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->object_size; -# endif +#endif if (s->flags & SLAB_KASAN) return s->object_size; /* @@ -697,7 +658,6 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * Else we can use all the padding etc for the allocation */ return s->size; -#endif } static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, @@ -775,23 +735,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, * The slab lists for all objects. */ struct kmem_cache_node { -#ifdef CONFIG_SLAB - raw_spinlock_t list_lock; - struct list_head slabs_partial; /* partial list first, better asm code */ - struct list_head slabs_full; - struct list_head slabs_free; - unsigned long total_slabs; /* length of all slab lists */ - unsigned long free_slabs; /* length of free slab list only */ - unsigned long free_objects; - unsigned int free_limit; - unsigned int colour_next; /* Per-node cache coloring */ - struct array_cache *shared; /* shared per node */ - struct alien_cache **alien; /* on other nodes */ - unsigned long next_reap; /* updated without locking */ - int free_touched; /* updated without locking */ -#endif - -#ifdef CONFIG_SLUB spinlock_t list_lock; unsigned long nr_partial; struct list_head partial; @@ -800,8 +743,6 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif -#endif - }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -818,7 +759,7 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) if ((__n = get_node(__s, __node))) -#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else static inline void dump_unreclaimable_slab(void) diff --git a/mm/slab_common.c b/mm/slab_common.c index 8d431193c273..63b8411db7ce 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -71,10 +71,8 @@ static int __init setup_slab_merge(char *str) return 1; } -#ifdef CONFIG_SLUB __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); __setup_param("slub_merge", slub_merge, setup_slab_merge, 0); -#endif __setup("slab_nomerge", setup_slab_nomerge); __setup("slab_merge", setup_slab_merge); @@ -197,10 +195,6 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, if (s->size - size >= sizeof(void *)) continue; - if (IS_ENABLED(CONFIG_SLAB) && align && - (align > s->align || s->align % align)) - continue; - return s; } return NULL; @@ -1222,12 +1216,8 @@ void cache_random_seq_destroy(struct kmem_cache *cachep) } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ -#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) -#ifdef CONFIG_SLAB -#define SLABINFO_RIGHTS (0600) -#else +#ifdef CONFIG_SLUB_DEBUG #define SLABINFO_RIGHTS (0400) -#endif static void print_slabinfo_header(struct seq_file *m) { @@ -1235,18 +1225,10 @@ static void print_slabinfo_header(struct seq_file *m) * Output format version, so at least we can change it * without _too_ many complaints. */ -#ifdef CONFIG_DEBUG_SLAB - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); -#else seq_puts(m, "slabinfo - version: 2.1\n"); -#endif seq_puts(m, "# name "); seq_puts(m, " : tunables "); seq_puts(m, " : slabdata "); -#ifdef CONFIG_DEBUG_SLAB - seq_puts(m, " : globalstat "); - seq_puts(m, " : cpustat "); -#endif seq_putc(m, '\n'); } @@ -1370,7 +1352,7 @@ static int __init slab_proc_init(void) } module_init(slab_proc_init); -#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ +#endif /* CONFIG_SLUB_DEBUG */ static __always_inline __realloc_size(2) void * __do_krealloc(const void *p, size_t new_size, gfp_t flags) From 8c20b29db5087684c2d46ca5a33b504b0743c85e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 18:54:46 +0200 Subject: [PATCH 17/34] mm/mempool/dmapool: remove CONFIG_DEBUG_SLAB ifdefs CONFIG_DEBUG_SLAB is going away with CONFIG_SLAB, so remove dead ifdefs in mempool and dmapool code. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/dmapool.c | 2 +- mm/mempool.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index a151a21e571b..f0bfc6c490f4 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -36,7 +36,7 @@ #include #include -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +#ifdef CONFIG_SLUB_DEBUG_ON #define DMAPOOL_DEBUG 1 #endif diff --git a/mm/mempool.c b/mm/mempool.c index 734bcf5afbb7..4759be0ff9de 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -20,7 +20,7 @@ #include #include "slab.h" -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +#ifdef CONFIG_SLUB_DEBUG_ON static void poison_error(mempool_t *pool, void *element, size_t size, size_t byte) { @@ -95,14 +95,14 @@ static void poison_element(mempool_t *pool, void *element) kunmap_atomic(addr); } } -#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +#else /* CONFIG_SLUB_DEBUG_ON */ static inline void check_element(mempool_t *pool, void *element) { } static inline void poison_element(mempool_t *pool, void *element) { } -#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +#endif /* CONFIG_SLUB_DEBUG_ON */ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { From 16a1d968358aa9e897ce995fa45cb15d55a0e83d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 20:43:43 +0200 Subject: [PATCH 18/34] mm/slab: remove mm/slab.c and slab_def.h Remove the SLAB implementation. Update CREDITS. Also update and properly sort the SLOB entry there. RIP SLAB allocator (1996 - 2024) Reviewed-by: Kees Cook Acked-by: Christoph Lameter Acked-by: David Rientjes Tested-by: David Rientjes Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- CREDITS | 12 +- include/linux/slab_def.h | 124 -- mm/slab.c | 4005 -------------------------------------- 3 files changed, 8 insertions(+), 4133 deletions(-) delete mode 100644 include/linux/slab_def.h delete mode 100644 mm/slab.c diff --git a/CREDITS b/CREDITS index f33a33fd2371..e9a094a93287 100644 --- a/CREDITS +++ b/CREDITS @@ -9,10 +9,6 @@ Linus ---------- -N: Matt Mackal -E: mpm@selenic.com -D: SLOB slab allocator - N: Matti Aarnio E: mea@nic.funet.fi D: Alpha systems hacking, IPv6 and other network related stuff @@ -1572,6 +1568,10 @@ S: Ampferstr. 50 / 4 S: 6020 Innsbruck S: Austria +N: Mark Hemment +E: markhe@nextd.demon.co.uk +D: SLAB allocator implementation + N: Richard Henderson E: rth@twiddle.net E: rth@cygnus.com @@ -2437,6 +2437,10 @@ D: work on suspend-to-ram/disk, killing duplicates from ioctl32, D: Altera SoCFPGA and Nokia N900 support. S: Czech Republic +N: Olivia Mackall +E: olivia@selenic.com +D: SLOB slab allocator + N: Paul Mackerras E: paulus@samba.org D: PPP driver diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h deleted file mode 100644 index a61e7d55d0d3..000000000000 --- a/include/linux/slab_def.h +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SLAB_DEF_H -#define _LINUX_SLAB_DEF_H - -#include -#include - -/* - * Definitions unique to the original Linux SLAB allocator. - */ - -struct kmem_cache { - struct array_cache __percpu *cpu_cache; - -/* 1) Cache tunables. Protected by slab_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int size; - struct reciprocal_value reciprocal_buffer_size; -/* 2) touched by every alloc & free from the backend */ - - slab_flags_t flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 3) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. GFP_DMA */ - gfp_t allocflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - unsigned int freelist_size; - - /* constructor func */ - void (*ctor)(void *obj); - -/* 4) cache creation/removal */ - const char *name; - struct list_head list; - int refcount; - int object_size; - int align; - -/* 5) statistics */ -#ifdef CONFIG_DEBUG_SLAB - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; - - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. 'size' contains the total - * object size including these internal fields, while 'obj_offset' - * and 'object_size' contain the offset to the user object and its - * size. - */ - int obj_offset; -#endif /* CONFIG_DEBUG_SLAB */ - -#ifdef CONFIG_KASAN_GENERIC - struct kasan_cache kasan_info; -#endif - -#ifdef CONFIG_SLAB_FREELIST_RANDOM - unsigned int *random_seq; -#endif - -#ifdef CONFIG_HARDENED_USERCOPY - unsigned int useroffset; /* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ -#endif - - struct kmem_cache_node *node[MAX_NUMNODES]; -}; - -static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, - void *x) -{ - void *object = x - (x - slab->s_mem) % cache->size; - void *last_object = slab->s_mem + (cache->num - 1) * cache->size; - - if (unlikely(object > last_object)) - return last_object; - else - return object; -} - -/* - * We want to avoid an expensive divide : (offset / cache->size) - * Using the fact that size is a constant for a particular cache, - * we can replace (offset / cache->size) by - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) -{ - u32 offset = (obj - slab->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -} - -static inline int objs_per_slab(const struct kmem_cache *cache, - const struct slab *slab) -{ - if (is_kfence_address(slab_address(slab))) - return 1; - return cache->num; -} - -#endif /* _LINUX_SLAB_DEF_H */ diff --git a/mm/slab.c b/mm/slab.c deleted file mode 100644 index 37efe3241f9c..000000000000 --- a/mm/slab.c +++ /dev/null @@ -1,4005 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/mm/slab.c - * Written by Mark Hemment, 1996/97. - * (markhe@nextd.demon.co.uk) - * - * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli - * - * Major cleanup, different bufctl logic, per-cpu arrays - * (c) 2000 Manfred Spraul - * - * Cleanup, make the head arrays unconditional, preparation for NUMA - * (c) 2002 Manfred Spraul - * - * An implementation of the Slab Allocator as described in outline in; - * UNIX Internals: The New Frontiers by Uresh Vahalia - * Pub: Prentice Hall ISBN 0-13-101908-2 - * or with a little more detail in; - * The Slab Allocator: An Object-Caching Kernel Memory Allocator - * Jeff Bonwick (Sun Microsystems). - * Presented at: USENIX Summer 1994 Technical Conference - * - * The memory is organized in caches, one cache for each object type. - * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) - * Each cache consists out of many slabs (they are small (usually one - * page long) and always contiguous), and each slab contains multiple - * initialized objects. - * - * This means, that your constructor is used only for newly allocated - * slabs and you must pass objects with the same initializations to - * kmem_cache_free. - * - * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, - * normal). If you need a special memory type, then must create a new - * cache for that memory type. - * - * In order to reduce fragmentation, the slabs are sorted in 3 groups: - * full slabs with 0 free objects - * partial slabs - * empty slabs with no allocated objects - * - * If partial slabs exist, then new allocations come from these slabs, - * otherwise from empty slabs or new slabs are allocated. - * - * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache - * during kmem_cache_destroy(). The caller must prevent concurrent allocs. - * - * Each cache has a short per-cpu head array, most allocs - * and frees go into that array, and if that array overflows, then 1/2 - * of the entries in the array are given back into the global cache. - * The head array is strictly LIFO and should improve the cache hit rates. - * On SMP, it additionally reduces the spinlock operations. - * - * The c_cpuarray may not be read with enabled local interrupts - - * it's changed with a smp_call_function(). - * - * SMP synchronization: - * constructors and destructors are called without any locking. - * Several members in struct kmem_cache and struct slab never change, they - * are accessed without any locking. - * The per-cpu arrays are never accessed from the wrong cpu, no locking, - * and local interrupts are disabled so slab code is preempt-safe. - * The non-constant members are protected with a per-cache irq spinlock. - * - * Many thanks to Mark Hemment, who wrote another per-cpu slab patch - * in 2000 - many ideas in the current implementation are derived from - * his patch. - * - * Further notes from the original documentation: - * - * 11 April '97. Started multi-threading - markhe - * The global cache-chain is protected by the mutex 'slab_mutex'. - * The sem is only needed when accessing/extending the cache-chain, which - * can never happen inside an interrupt (kmem_cache_create(), - * kmem_cache_shrink() and kmem_cache_reap()). - * - * At present, each engine can be growing a cache. This should be blocked. - * - * 15 March 2005. NUMA slab allocator. - * Shai Fultheim . - * Shobhit Dayal - * Alok N Kataria - * Christoph Lameter - * - * Modified the slab allocator to be node aware on NUMA systems. - * Each node has its own list of partial, free and full slabs. - * All object allocations for a node occur from node specific slab lists. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include - -#include "internal.h" - -#include "slab.h" - -/* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. - * 0 for faster, smaller code (especially in the critical paths). - * - * STATS - 1 to collect stats for /proc/slabinfo. - * 0 for faster, smaller code (especially in the critical paths). - * - * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) - */ - -#ifdef CONFIG_DEBUG_SLAB -#define DEBUG 1 -#define STATS 1 -#define FORCED_DEBUG 1 -#else -#define DEBUG 0 -#define STATS 0 -#define FORCED_DEBUG 0 -#endif - -/* Shouldn't this be in a header file somewhere? */ -#define BYTES_PER_WORD sizeof(void *) -#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) - -#ifndef ARCH_KMALLOC_FLAGS -#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN -#endif - -#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ - <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) - -#if FREELIST_BYTE_INDEX -typedef unsigned char freelist_idx_t; -#else -typedef unsigned short freelist_idx_t; -#endif - -#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) - -/* - * struct array_cache - * - * Purpose: - * - LIFO ordering, to hand out cache-warm objects from _alloc - * - reduce the number of linked list operations - * - reduce spinlock operations - * - * The limit is stored in the per-cpu structure to reduce the data cache - * footprint. - * - */ -struct array_cache { - unsigned int avail; - unsigned int limit; - unsigned int batchcount; - unsigned int touched; - void *entry[]; /* - * Must have this definition in here for the proper - * alignment of array_cache. Also simplifies accessing - * the entries. - */ -}; - -struct alien_cache { - spinlock_t lock; - struct array_cache ac; -}; - -/* - * Need this for bootstrapping a per node allocator. - */ -#define NUM_INIT_LISTS (2 * MAX_NUMNODES) -static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; -#define CACHE_CACHE 0 -#define SIZE_NODE (MAX_NUMNODES) - -static int drain_freelist(struct kmem_cache *cache, - struct kmem_cache_node *n, int tofree); -static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node, struct list_head *list); -static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); -static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); -static void cache_reap(struct work_struct *unused); - -static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, - void **list); -static inline void fixup_slab_list(struct kmem_cache *cachep, - struct kmem_cache_node *n, struct slab *slab, - void **list); - -#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) - -static void kmem_cache_node_init(struct kmem_cache_node *parent) -{ - INIT_LIST_HEAD(&parent->slabs_full); - INIT_LIST_HEAD(&parent->slabs_partial); - INIT_LIST_HEAD(&parent->slabs_free); - parent->total_slabs = 0; - parent->free_slabs = 0; - parent->shared = NULL; - parent->alien = NULL; - parent->colour_next = 0; - raw_spin_lock_init(&parent->list_lock); - parent->free_objects = 0; - parent->free_touched = 0; -} - -#define MAKE_LIST(cachep, listp, slab, nodeid) \ - do { \ - INIT_LIST_HEAD(listp); \ - list_splice(&get_node(cachep, nodeid)->slab, listp); \ - } while (0) - -#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ - do { \ - MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ - MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ - MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ - } while (0) - -#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U) -#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) -#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) -#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) - -#define BATCHREFILL_LIMIT 16 -/* - * Optimization question: fewer reaps means less probability for unnecessary - * cpucache drain/refill cycles. - * - * OTOH the cpuarrays can contain lots of objects, - * which could lock up otherwise freeable slabs. - */ -#define REAPTIMEOUT_AC (2*HZ) -#define REAPTIMEOUT_NODE (4*HZ) - -#if STATS -#define STATS_INC_ACTIVE(x) ((x)->num_active++) -#define STATS_DEC_ACTIVE(x) ((x)->num_active--) -#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) -#define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_ADD_REAPED(x, y) ((x)->reaped += (y)) -#define STATS_SET_HIGH(x) \ - do { \ - if ((x)->num_active > (x)->high_mark) \ - (x)->high_mark = (x)->num_active; \ - } while (0) -#define STATS_INC_ERR(x) ((x)->errors++) -#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) -#define STATS_INC_NODEFREES(x) ((x)->node_frees++) -#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) -#define STATS_SET_FREEABLE(x, i) \ - do { \ - if ((x)->max_freeable < i) \ - (x)->max_freeable = i; \ - } while (0) -#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) -#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) -#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) -#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) -#else -#define STATS_INC_ACTIVE(x) do { } while (0) -#define STATS_DEC_ACTIVE(x) do { } while (0) -#define STATS_INC_ALLOCED(x) do { } while (0) -#define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0) -#define STATS_SET_HIGH(x) do { } while (0) -#define STATS_INC_ERR(x) do { } while (0) -#define STATS_INC_NODEALLOCS(x) do { } while (0) -#define STATS_INC_NODEFREES(x) do { } while (0) -#define STATS_INC_ACOVERFLOW(x) do { } while (0) -#define STATS_SET_FREEABLE(x, i) do { } while (0) -#define STATS_INC_ALLOCHIT(x) do { } while (0) -#define STATS_INC_ALLOCMISS(x) do { } while (0) -#define STATS_INC_FREEHIT(x) do { } while (0) -#define STATS_INC_FREEMISS(x) do { } while (0) -#endif - -#if DEBUG - -/* - * memory layout of objects: - * 0 : objp - * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that - * the end of an object is aligned with the end of the real - * allocation. Catches writes behind the end of the allocation. - * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: - * redzone word. - * cachep->obj_offset: The real object. - * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->size - 1* BYTES_PER_WORD: last caller address - * [BYTES_PER_WORD long] - */ -static int obj_offset(struct kmem_cache *cachep) -{ - return cachep->obj_offset; -} - -static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) -{ - BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long long *) (objp + obj_offset(cachep) - - sizeof(unsigned long long)); -} - -static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) -{ - BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - if (cachep->flags & SLAB_STORE_USER) - return (unsigned long long *)(objp + cachep->size - - sizeof(unsigned long long) - - REDZONE_ALIGN); - return (unsigned long long *) (objp + cachep->size - - sizeof(unsigned long long)); -} - -static void **dbg_userword(struct kmem_cache *cachep, void *objp) -{ - BUG_ON(!(cachep->flags & SLAB_STORE_USER)); - return (void **)(objp + cachep->size - BYTES_PER_WORD); -} - -#else - -#define obj_offset(x) 0 -#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) -#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) -#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) - -#endif - -/* - * Do not go above this order unless 0 objects fit into the slab or - * overridden on the command line. - */ -#define SLAB_MAX_ORDER_HI 1 -#define SLAB_MAX_ORDER_LO 0 -static int slab_max_order = SLAB_MAX_ORDER_LO; -static bool slab_max_order_set __initdata; - -static inline void *index_to_obj(struct kmem_cache *cache, - const struct slab *slab, unsigned int idx) -{ - return slab->s_mem + cache->size * idx; -} - -#define BOOT_CPUCACHE_ENTRIES 1 -/* internal cache of cache description objs */ -static struct kmem_cache kmem_cache_boot = { - .batchcount = 1, - .limit = BOOT_CPUCACHE_ENTRIES, - .shared = 1, - .size = sizeof(struct kmem_cache), - .name = "kmem_cache", -}; - -static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); - -static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) -{ - return this_cpu_ptr(cachep->cpu_cache); -} - -/* - * Calculate the number of objects and left-over bytes for a given buffer size. - */ -static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, - slab_flags_t flags, size_t *left_over) -{ - unsigned int num; - size_t slab_size = PAGE_SIZE << gfporder; - - /* - * The slab management structure can be either off the slab or - * on it. For the latter case, the memory allocated for a - * slab is used for: - * - * - @buffer_size bytes for each object - * - One freelist_idx_t for each object - * - * We don't need to consider alignment of freelist because - * freelist will be at the end of slab page. The objects will be - * at the correct alignment. - * - * If the slab management structure is off the slab, then the - * alignment will already be calculated into the size. Because - * the slabs are all pages aligned, the objects will be at the - * correct alignment when allocated. - */ - if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { - num = slab_size / buffer_size; - *left_over = slab_size % buffer_size; - } else { - num = slab_size / (buffer_size + sizeof(freelist_idx_t)); - *left_over = slab_size % - (buffer_size + sizeof(freelist_idx_t)); - } - - return num; -} - -#if DEBUG -#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) - -static void __slab_error(const char *function, struct kmem_cache *cachep, - char *msg) -{ - pr_err("slab error in %s(): cache `%s': %s\n", - function, cachep->name, msg); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); -} -#endif - -/* - * By default on NUMA we use alien caches to stage the freeing of - * objects allocated from other nodes. This causes massive memory - * inefficiencies when using fake NUMA setup to split memory into a - * large number of small nodes, so it can be disabled on the command - * line - */ - -static int use_alien_caches __read_mostly = 1; -static int __init noaliencache_setup(char *s) -{ - use_alien_caches = 0; - return 1; -} -__setup("noaliencache", noaliencache_setup); - -static int __init slab_max_order_setup(char *str) -{ - get_option(&str, &slab_max_order); - slab_max_order = slab_max_order < 0 ? 0 : - min(slab_max_order, MAX_ORDER); - slab_max_order_set = true; - - return 1; -} -__setup("slab_max_order=", slab_max_order_setup); - -#ifdef CONFIG_NUMA -/* - * Special reaping functions for NUMA systems called from cache_reap(). - * These take care of doing round robin flushing of alien caches (containing - * objects freed on different nodes from which they were allocated) and the - * flushing of remote pcps by calling drain_node_pages. - */ -static DEFINE_PER_CPU(unsigned long, slab_reap_node); - -static void init_reap_node(int cpu) -{ - per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), - node_online_map); -} - -static void next_reap_node(void) -{ - int node = __this_cpu_read(slab_reap_node); - - node = next_node_in(node, node_online_map); - __this_cpu_write(slab_reap_node, node); -} - -#else -#define init_reap_node(cpu) do { } while (0) -#define next_reap_node(void) do { } while (0) -#endif - -/* - * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz - * via the workqueue/eventd. - * Add the CPU number into the expiration time to minimize the possibility of - * the CPUs getting into lockstep and contending for the global cache chain - * lock. - */ -static void start_cpu_timer(int cpu) -{ - struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); - - if (reap_work->work.func == NULL) { - init_reap_node(cpu); - INIT_DEFERRABLE_WORK(reap_work, cache_reap); - schedule_delayed_work_on(cpu, reap_work, - __round_jiffies_relative(HZ, cpu)); - } -} - -static void init_arraycache(struct array_cache *ac, int limit, int batch) -{ - if (ac) { - ac->avail = 0; - ac->limit = limit; - ac->batchcount = batch; - ac->touched = 0; - } -} - -static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount, gfp_t gfp) -{ - size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); - struct array_cache *ac = NULL; - - ac = kmalloc_node(memsize, gfp, node); - /* - * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transferred to another - * cache the pointers are not cleared and they could be counted as - * valid references during a kmemleak scan. Therefore, kmemleak must - * not scan such objects. - */ - kmemleak_no_scan(ac); - init_arraycache(ac, entries, batchcount); - return ac; -} - -static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, - struct slab *slab, void *objp) -{ - struct kmem_cache_node *n; - int slab_node; - LIST_HEAD(list); - - slab_node = slab_nid(slab); - n = get_node(cachep, slab_node); - - raw_spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, slab_node, &list); - raw_spin_unlock(&n->list_lock); - - slabs_destroy(cachep, &list); -} - -/* - * Transfer objects in one arraycache to another. - * Locking must be handled by the caller. - * - * Return the number of entries transferred. - */ -static int transfer_objects(struct array_cache *to, - struct array_cache *from, unsigned int max) -{ - /* Figure out how many entries to transfer */ - int nr = min3(from->avail, max, to->limit - to->avail); - - if (!nr) - return 0; - - memcpy(to->entry + to->avail, from->entry + from->avail - nr, - sizeof(void *) *nr); - - from->avail -= nr; - to->avail += nr; - return nr; -} - -/* &alien->lock must be held by alien callers. */ -static __always_inline void __free_one(struct array_cache *ac, void *objp) -{ - /* Avoid trivial double-free. */ - if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && - WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp)) - return; - ac->entry[ac->avail++] = objp; -} - -#ifndef CONFIG_NUMA - -#define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, n) do { } while (0) - -static inline struct alien_cache **alloc_alien_cache(int node, - int limit, gfp_t gfp) -{ - return NULL; -} - -static inline void free_alien_cache(struct alien_cache **ac_ptr) -{ -} - -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) -{ - return 0; -} - -static inline gfp_t gfp_exact_node(gfp_t flags) -{ - return flags & ~__GFP_NOFAIL; -} - -#else /* CONFIG_NUMA */ - -static struct alien_cache *__alloc_alien_cache(int node, int entries, - int batch, gfp_t gfp) -{ - size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); - struct alien_cache *alc = NULL; - - alc = kmalloc_node(memsize, gfp, node); - if (alc) { - kmemleak_no_scan(alc); - init_arraycache(&alc->ac, entries, batch); - spin_lock_init(&alc->lock); - } - return alc; -} - -static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) -{ - struct alien_cache **alc_ptr; - int i; - - if (limit > 1) - limit = 12; - alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); - if (!alc_ptr) - return NULL; - - for_each_node(i) { - if (i == node || !node_online(i)) - continue; - alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); - if (!alc_ptr[i]) { - for (i--; i >= 0; i--) - kfree(alc_ptr[i]); - kfree(alc_ptr); - return NULL; - } - } - return alc_ptr; -} - -static void free_alien_cache(struct alien_cache **alc_ptr) -{ - int i; - - if (!alc_ptr) - return; - for_each_node(i) - kfree(alc_ptr[i]); - kfree(alc_ptr); -} - -static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node, - struct list_head *list) -{ - struct kmem_cache_node *n = get_node(cachep, node); - - if (ac->avail) { - raw_spin_lock(&n->list_lock); - /* - * Stuff objects into the remote nodes shared array first. - * That way we could avoid the overhead of putting the objects - * into the free lists and getting them back later. - */ - if (n->shared) - transfer_objects(n->shared, ac, ac->limit); - - free_block(cachep, ac->entry, ac->avail, node, list); - ac->avail = 0; - raw_spin_unlock(&n->list_lock); - } -} - -/* - * Called from cache_reap() to regularly drain alien caches round robin. - */ -static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) -{ - int node = __this_cpu_read(slab_reap_node); - - if (n->alien) { - struct alien_cache *alc = n->alien[node]; - struct array_cache *ac; - - if (alc) { - ac = &alc->ac; - if (ac->avail && spin_trylock_irq(&alc->lock)) { - LIST_HEAD(list); - - __drain_alien_cache(cachep, ac, node, &list); - spin_unlock_irq(&alc->lock); - slabs_destroy(cachep, &list); - } - } - } -} - -static void drain_alien_cache(struct kmem_cache *cachep, - struct alien_cache **alien) -{ - int i = 0; - struct alien_cache *alc; - struct array_cache *ac; - unsigned long flags; - - for_each_online_node(i) { - alc = alien[i]; - if (alc) { - LIST_HEAD(list); - - ac = &alc->ac; - spin_lock_irqsave(&alc->lock, flags); - __drain_alien_cache(cachep, ac, i, &list); - spin_unlock_irqrestore(&alc->lock, flags); - slabs_destroy(cachep, &list); - } - } -} - -static int __cache_free_alien(struct kmem_cache *cachep, void *objp, - int node, int slab_node) -{ - struct kmem_cache_node *n; - struct alien_cache *alien = NULL; - struct array_cache *ac; - LIST_HEAD(list); - - n = get_node(cachep, node); - STATS_INC_NODEFREES(cachep); - if (n->alien && n->alien[slab_node]) { - alien = n->alien[slab_node]; - ac = &alien->ac; - spin_lock(&alien->lock); - if (unlikely(ac->avail == ac->limit)) { - STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, ac, slab_node, &list); - } - __free_one(ac, objp); - spin_unlock(&alien->lock); - slabs_destroy(cachep, &list); - } else { - n = get_node(cachep, slab_node); - raw_spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, slab_node, &list); - raw_spin_unlock(&n->list_lock); - slabs_destroy(cachep, &list); - } - return 1; -} - -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) -{ - int slab_node = slab_nid(virt_to_slab(objp)); - int node = numa_mem_id(); - /* - * Make sure we are not freeing an object from another node to the array - * cache on this cpu. - */ - if (likely(node == slab_node)) - return 0; - - return __cache_free_alien(cachep, objp, node, slab_node); -} - -/* - * Construct gfp mask to allocate from a specific node but do not reclaim or - * warn about failures. - */ -static inline gfp_t gfp_exact_node(gfp_t flags) -{ - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); -} -#endif - -static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) -{ - struct kmem_cache_node *n; - - /* - * Set up the kmem_cache_node for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - n = get_node(cachep, node); - if (n) { - raw_spin_lock_irq(&n->list_lock); - n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + - cachep->num; - raw_spin_unlock_irq(&n->list_lock); - - return 0; - } - - n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - if (!n) - return -ENOMEM; - - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - - n->free_limit = - (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - - /* - * The kmem_cache_nodes don't come and go as CPUs - * come and go. slab_mutex provides sufficient - * protection here. - */ - cachep->node[node] = n; - - return 0; -} - -#if defined(CONFIG_NUMA) || defined(CONFIG_SMP) -/* - * Allocates and initializes node for a node on each slab cache, used for - * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node - * will be allocated off-node since memory is not yet online for the new node. - * When hotplugging memory or a cpu, existing nodes are not replaced if - * already in use. - * - * Must hold slab_mutex. - */ -static int init_cache_node_node(int node) -{ - int ret; - struct kmem_cache *cachep; - - list_for_each_entry(cachep, &slab_caches, list) { - ret = init_cache_node(cachep, node, GFP_KERNEL); - if (ret) - return ret; - } - - return 0; -} -#endif - -static int setup_kmem_cache_node(struct kmem_cache *cachep, - int node, gfp_t gfp, bool force_change) -{ - int ret = -ENOMEM; - struct kmem_cache_node *n; - struct array_cache *old_shared = NULL; - struct array_cache *new_shared = NULL; - struct alien_cache **new_alien = NULL; - LIST_HEAD(list); - - if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit, gfp); - if (!new_alien) - goto fail; - } - - if (cachep->shared) { - new_shared = alloc_arraycache(node, - cachep->shared * cachep->batchcount, 0xbaadf00d, gfp); - if (!new_shared) - goto fail; - } - - ret = init_cache_node(cachep, node, gfp); - if (ret) - goto fail; - - n = get_node(cachep, node); - raw_spin_lock_irq(&n->list_lock); - if (n->shared && force_change) { - free_block(cachep, n->shared->entry, - n->shared->avail, node, &list); - n->shared->avail = 0; - } - - if (!n->shared || force_change) { - old_shared = n->shared; - n->shared = new_shared; - new_shared = NULL; - } - - if (!n->alien) { - n->alien = new_alien; - new_alien = NULL; - } - - raw_spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); - - /* - * To protect lockless access to n->shared during irq disabled context. - * If n->shared isn't NULL in irq disabled context, accessing to it is - * guaranteed to be valid until irq is re-enabled, because it will be - * freed after synchronize_rcu(). - */ - if (old_shared && force_change) - synchronize_rcu(); - -fail: - kfree(old_shared); - kfree(new_shared); - free_alien_cache(new_alien); - - return ret; -} - -#ifdef CONFIG_SMP - -static void cpuup_canceled(long cpu) -{ - struct kmem_cache *cachep; - struct kmem_cache_node *n = NULL; - int node = cpu_to_mem(cpu); - const struct cpumask *mask = cpumask_of_node(node); - - list_for_each_entry(cachep, &slab_caches, list) { - struct array_cache *nc; - struct array_cache *shared; - struct alien_cache **alien; - LIST_HEAD(list); - - n = get_node(cachep, node); - if (!n) - continue; - - raw_spin_lock_irq(&n->list_lock); - - /* Free limit for this kmem_cache_node */ - n->free_limit -= cachep->batchcount; - - /* cpu is dead; no one can alloc from it. */ - nc = per_cpu_ptr(cachep->cpu_cache, cpu); - free_block(cachep, nc->entry, nc->avail, node, &list); - nc->avail = 0; - - if (!cpumask_empty(mask)) { - raw_spin_unlock_irq(&n->list_lock); - goto free_slab; - } - - shared = n->shared; - if (shared) { - free_block(cachep, shared->entry, - shared->avail, node, &list); - n->shared = NULL; - } - - alien = n->alien; - n->alien = NULL; - - raw_spin_unlock_irq(&n->list_lock); - - kfree(shared); - if (alien) { - drain_alien_cache(cachep, alien); - free_alien_cache(alien); - } - -free_slab: - slabs_destroy(cachep, &list); - } - /* - * In the previous loop, all the objects were freed to - * the respective cache's slabs, now we can go ahead and - * shrink each nodelist to its limit. - */ - list_for_each_entry(cachep, &slab_caches, list) { - n = get_node(cachep, node); - if (!n) - continue; - drain_freelist(cachep, n, INT_MAX); - } -} - -static int cpuup_prepare(long cpu) -{ - struct kmem_cache *cachep; - int node = cpu_to_mem(cpu); - int err; - - /* - * We need to do this right in the beginning since - * alloc_arraycache's are going to use this list. - * kmalloc_node allows us to add the slab to the right - * kmem_cache_node and not this cpu's kmem_cache_node - */ - err = init_cache_node_node(node); - if (err < 0) - goto bad; - - /* - * Now we can go ahead with allocating the shared arrays and - * array caches - */ - list_for_each_entry(cachep, &slab_caches, list) { - err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false); - if (err) - goto bad; - } - - return 0; -bad: - cpuup_canceled(cpu); - return -ENOMEM; -} - -int slab_prepare_cpu(unsigned int cpu) -{ - int err; - - mutex_lock(&slab_mutex); - err = cpuup_prepare(cpu); - mutex_unlock(&slab_mutex); - return err; -} - -/* - * This is called for a failed online attempt and for a successful - * offline. - * - * Even if all the cpus of a node are down, we don't free the - * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and - * a kmalloc allocation from another cpu for memory from the node of - * the cpu going down. The kmem_cache_node structure is usually allocated from - * kmem_cache_create() and gets destroyed at kmem_cache_destroy(). - */ -int slab_dead_cpu(unsigned int cpu) -{ - mutex_lock(&slab_mutex); - cpuup_canceled(cpu); - mutex_unlock(&slab_mutex); - return 0; -} -#endif - -static int slab_online_cpu(unsigned int cpu) -{ - start_cpu_timer(cpu); - return 0; -} - -static int slab_offline_cpu(unsigned int cpu) -{ - /* - * Shutdown cache reaper. Note that the slab_mutex is held so - * that if cache_reap() is invoked it cannot do anything - * expensive but will only modify reap_work and reschedule the - * timer. - */ - cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); - /* Now the cache_reaper is guaranteed to be not running. */ - per_cpu(slab_reap_work, cpu).work.func = NULL; - return 0; -} - -#if defined(CONFIG_NUMA) -/* - * Drains freelist for a node on each slab cache, used for memory hot-remove. - * Returns -EBUSY if all objects cannot be drained so that the node is not - * removed. - * - * Must hold slab_mutex. - */ -static int __meminit drain_cache_node_node(int node) -{ - struct kmem_cache *cachep; - int ret = 0; - - list_for_each_entry(cachep, &slab_caches, list) { - struct kmem_cache_node *n; - - n = get_node(cachep, node); - if (!n) - continue; - - drain_freelist(cachep, n, INT_MAX); - - if (!list_empty(&n->slabs_full) || - !list_empty(&n->slabs_partial)) { - ret = -EBUSY; - break; - } - } - return ret; -} - -static int __meminit slab_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int ret = 0; - int nid; - - nid = mnb->status_change_nid; - if (nid < 0) - goto out; - - switch (action) { - case MEM_GOING_ONLINE: - mutex_lock(&slab_mutex); - ret = init_cache_node_node(nid); - mutex_unlock(&slab_mutex); - break; - case MEM_GOING_OFFLINE: - mutex_lock(&slab_mutex); - ret = drain_cache_node_node(nid); - mutex_unlock(&slab_mutex); - break; - case MEM_ONLINE: - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - case MEM_CANCEL_OFFLINE: - break; - } -out: - return notifier_from_errno(ret); -} -#endif /* CONFIG_NUMA */ - -/* - * swap the static kmem_cache_node with kmalloced memory - */ -static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, - int nodeid) -{ - struct kmem_cache_node *ptr; - - ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); - BUG_ON(!ptr); - - memcpy(ptr, list, sizeof(struct kmem_cache_node)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - raw_spin_lock_init(&ptr->list_lock); - - MAKE_ALL_LISTS(cachep, ptr, nodeid); - cachep->node[nodeid] = ptr; -} - -/* - * For setting up all the kmem_cache_node for cache whose buffer_size is same as - * size of kmem_cache_node. - */ -static void __init set_up_node(struct kmem_cache *cachep, int index) -{ - int node; - - for_each_online_node(node) { - cachep->node[node] = &init_kmem_cache_node[index + node]; - cachep->node[node]->next_reap = jiffies + - REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - } -} - -/* - * Initialisation. Called after the page allocator have been initialised and - * before smp_init(). - */ -void __init kmem_cache_init(void) -{ - int i; - - kmem_cache = &kmem_cache_boot; - - if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) - use_alien_caches = 0; - - for (i = 0; i < NUM_INIT_LISTS; i++) - kmem_cache_node_init(&init_kmem_cache_node[i]); - - /* - * Fragmentation resistance on low memory - only use bigger - * page orders on machines with more than 32MB of memory if - * not overridden on the command line. - */ - if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) - slab_max_order = SLAB_MAX_ORDER_HI; - - /* Bootstrap is tricky, because several objects are allocated - * from caches that do not exist yet: - * 1) initialize the kmem_cache cache: it contains the struct - * kmem_cache structures of all caches, except kmem_cache itself: - * kmem_cache is statically allocated. - * Initially an __init data area is used for the head array and the - * kmem_cache_node structures, it's replaced with a kmalloc allocated - * array at the end of the bootstrap. - * 2) Create the first kmalloc cache. - * The struct kmem_cache for the new cache is allocated normally. - * An __init data area is used for the head array. - * 3) Create the remaining kmalloc caches, with minimally sized - * head arrays. - * 4) Replace the __init data head arrays for kmem_cache and the first - * kmalloc cache with kmalloc allocated arrays. - * 5) Replace the __init data for kmem_cache_node for kmem_cache and - * the other cache's with kmalloc allocated memory. - * 6) Resize the head arrays of the kmalloc caches to their final sizes. - */ - - /* 1) create the kmem_cache */ - - /* - * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids - */ - create_boot_cache(kmem_cache, "kmem_cache", - offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *), - SLAB_HWCACHE_ALIGN, 0, 0); - list_add(&kmem_cache->list, &slab_caches); - slab_state = PARTIAL; - - /* - * Initialize the caches that provide memory for the kmem_cache_node - * structures first. Without this, further allocations will bug. - */ - new_kmalloc_cache(INDEX_NODE, KMALLOC_NORMAL, ARCH_KMALLOC_FLAGS); - slab_state = PARTIAL_NODE; - setup_kmalloc_cache_index_table(); - - /* 5) Replace the bootstrap kmem_cache_node */ - { - int nid; - - for_each_online_node(nid) { - init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - - init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], - &init_kmem_cache_node[SIZE_NODE + nid], nid); - } - } - - create_kmalloc_caches(ARCH_KMALLOC_FLAGS); -} - -void __init kmem_cache_init_late(void) -{ - struct kmem_cache *cachep; - - /* 6) resize the head arrays to their final sizes */ - mutex_lock(&slab_mutex); - list_for_each_entry(cachep, &slab_caches, list) - if (enable_cpucache(cachep, GFP_NOWAIT)) - BUG(); - mutex_unlock(&slab_mutex); - - /* Done! */ - slab_state = FULL; - -#ifdef CONFIG_NUMA - /* - * Register a memory hotplug callback that initializes and frees - * node. - */ - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); -#endif - - /* - * The reap timers are started later, with a module init call: That part - * of the kernel is not yet operational. - */ -} - -static int __init cpucache_init(void) -{ - int ret; - - /* - * Register the timers that return unneeded pages to the page allocator - */ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online", - slab_online_cpu, slab_offline_cpu); - WARN_ON(ret < 0); - - return 0; -} -__initcall(cpucache_init); - -static noinline void -slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) -{ -#if DEBUG - struct kmem_cache_node *n; - unsigned long flags; - int node; - static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) - return; - - pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", - nodeid, gfpflags, &gfpflags); - pr_warn(" cache: %s, object size: %d, order: %d\n", - cachep->name, cachep->size, cachep->gfporder); - - for_each_kmem_cache_node(cachep, node, n) { - unsigned long total_slabs, free_slabs, free_objs; - - raw_spin_lock_irqsave(&n->list_lock, flags); - total_slabs = n->total_slabs; - free_slabs = n->free_slabs; - free_objs = n->free_objects; - raw_spin_unlock_irqrestore(&n->list_lock, flags); - - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", - node, total_slabs - free_slabs, total_slabs, - (total_slabs * cachep->num) - free_objs, - total_slabs * cachep->num); - } -#endif -} - -/* - * Interface to system's page allocator. No need to hold the - * kmem_cache_node ->list_lock. - * - * If we requested dmaable memory, we will get it. Even if we - * did not request dmaable memory, we might get it, but that - * would be relatively rare and ignorable. - */ -static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, - int nodeid) -{ - struct folio *folio; - struct slab *slab; - - flags |= cachep->allocflags; - - folio = (struct folio *) __alloc_pages_node(nodeid, flags, cachep->gfporder); - if (!folio) { - slab_out_of_memory(cachep, flags, nodeid); - return NULL; - } - - slab = folio_slab(folio); - - account_slab(slab, cachep->gfporder, cachep, flags); - __folio_set_slab(folio); - /* Make the flag visible before any changes to folio->mapping */ - smp_wmb(); - /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (sk_memalloc_socks() && folio_is_pfmemalloc(folio)) - slab_set_pfmemalloc(slab); - - return slab; -} - -/* - * Interface to system's page release. - */ -static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) -{ - int order = cachep->gfporder; - struct folio *folio = slab_folio(slab); - - BUG_ON(!folio_test_slab(folio)); - __slab_clear_pfmemalloc(slab); - page_mapcount_reset(&folio->page); - folio->mapping = NULL; - /* Make the mapping reset visible before clearing the flag */ - smp_wmb(); - __folio_clear_slab(folio); - - mm_account_reclaimed_pages(1 << order); - unaccount_slab(slab, order, cachep); - __free_pages(&folio->page, order); -} - -static void kmem_rcu_free(struct rcu_head *head) -{ - struct kmem_cache *cachep; - struct slab *slab; - - slab = container_of(head, struct slab, rcu_head); - cachep = slab->slab_cache; - - kmem_freepages(cachep, slab); -} - -#if DEBUG -static inline bool is_debug_pagealloc_cache(struct kmem_cache *cachep) -{ - return debug_pagealloc_enabled_static() && OFF_SLAB(cachep) && - ((cachep->size % PAGE_SIZE) == 0); -} - -#ifdef CONFIG_DEBUG_PAGEALLOC -static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) -{ - if (!is_debug_pagealloc_cache(cachep)) - return; - - __kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); -} - -#else -static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map) {} - -#endif - -static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) -{ - int size = cachep->object_size; - addr = &((char *)addr)[obj_offset(cachep)]; - - memset(addr, val, size); - *(unsigned char *)(addr + size - 1) = POISON_END; -} - -static void dump_line(char *data, int offset, int limit) -{ - int i; - unsigned char error = 0; - int bad_count = 0; - - pr_err("%03x: ", offset); - for (i = 0; i < limit; i++) { - if (data[offset + i] != POISON_FREE) { - error = data[offset + i]; - bad_count++; - } - } - print_hex_dump(KERN_CONT, "", 0, 16, 1, - &data[offset], limit, 1); - - if (bad_count == 1) { - error ^= POISON_FREE; - if (!(error & (error - 1))) { - pr_err("Single bit error detected. Probably bad RAM.\n"); -#ifdef CONFIG_X86 - pr_err("Run memtest86+ or a similar memory test tool.\n"); -#else - pr_err("Run a memory test tool.\n"); -#endif - } - } -} -#endif - -#if DEBUG - -static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) -{ - int i, size; - char *realobj; - - if (cachep->flags & SLAB_RED_ZONE) { - pr_err("Redzone: 0x%llx/0x%llx\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); - } - - if (cachep->flags & SLAB_STORE_USER) - pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp)); - realobj = (char *)objp + obj_offset(cachep); - size = cachep->object_size; - for (i = 0; i < size && lines; i += 16, lines--) { - int limit; - limit = 16; - if (i + limit > size) - limit = size - i; - dump_line(realobj, i, limit); - } -} - -static void check_poison_obj(struct kmem_cache *cachep, void *objp) -{ - char *realobj; - int size, i; - int lines = 0; - - if (is_debug_pagealloc_cache(cachep)) - return; - - realobj = (char *)objp + obj_offset(cachep); - size = cachep->object_size; - - for (i = 0; i < size; i++) { - char exp = POISON_FREE; - if (i == size - 1) - exp = POISON_END; - if (realobj[i] != exp) { - int limit; - /* Mismatch ! */ - /* Print header */ - if (lines == 0) { - pr_err("Slab corruption (%s): %s start=%px, len=%d\n", - print_tainted(), cachep->name, - realobj, size); - print_objinfo(cachep, objp, 0); - } - /* Hexdump the affected line */ - i = (i / 16) * 16; - limit = 16; - if (i + limit > size) - limit = size - i; - dump_line(realobj, i, limit); - i += 16; - lines++; - /* Limit to 5 lines */ - if (lines > 5) - break; - } - } - if (lines != 0) { - /* Print some data about the neighboring objects, if they - * exist: - */ - struct slab *slab = virt_to_slab(objp); - unsigned int objnr; - - objnr = obj_to_index(cachep, slab, objp); - if (objnr) { - objp = index_to_obj(cachep, slab, objnr - 1); - realobj = (char *)objp + obj_offset(cachep); - pr_err("Prev obj: start=%px, len=%d\n", realobj, size); - print_objinfo(cachep, objp, 2); - } - if (objnr + 1 < cachep->num) { - objp = index_to_obj(cachep, slab, objnr + 1); - realobj = (char *)objp + obj_offset(cachep); - pr_err("Next obj: start=%px, len=%d\n", realobj, size); - print_objinfo(cachep, objp, 2); - } - } -} -#endif - -#if DEBUG -static void slab_destroy_debugcheck(struct kmem_cache *cachep, - struct slab *slab) -{ - int i; - - if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { - poison_obj(cachep, slab->freelist - obj_offset(cachep), - POISON_FREE); - } - - for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slab, i); - - if (cachep->flags & SLAB_POISON) { - check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1); - } - if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "start of a freed object was overwritten"); - if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "end of a freed object was overwritten"); - } - } -} -#else -static void slab_destroy_debugcheck(struct kmem_cache *cachep, - struct slab *slab) -{ -} -#endif - -/** - * slab_destroy - destroy and release all objects in a slab - * @cachep: cache pointer being destroyed - * @slab: slab being destroyed - * - * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. The - * kmem_cache_node ->list_lock is not held/needed. - */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slab) -{ - void *freelist; - - freelist = slab->freelist; - slab_destroy_debugcheck(cachep, slab); - if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) - call_rcu(&slab->rcu_head, kmem_rcu_free); - else - kmem_freepages(cachep, slab); - - /* - * From now on, we don't use freelist - * although actual page can be freed in rcu context - */ - if (OFF_SLAB(cachep)) - kfree(freelist); -} - -/* - * Update the size of the caches before calling slabs_destroy as it may - * recursively call kfree. - */ -static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) -{ - struct slab *slab, *n; - - list_for_each_entry_safe(slab, n, list, slab_list) { - list_del(&slab->slab_list); - slab_destroy(cachep, slab); - } -} - -/** - * calculate_slab_order - calculate size (page order) of slabs - * @cachep: pointer to the cache that is being created - * @size: size of objects to be created in this cache. - * @flags: slab allocation flags - * - * Also calculates the number of objects per slab. - * - * This could be made much more intelligent. For now, try to avoid using - * high order pages for slabs. When the gfp() functions are more friendly - * towards high-order requests, this should be changed. - * - * Return: number of left-over bytes in a slab - */ -static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, slab_flags_t flags) -{ - size_t left_over = 0; - int gfporder; - - for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { - unsigned int num; - size_t remainder; - - num = cache_estimate(gfporder, size, flags, &remainder); - if (!num) - continue; - - /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ - if (num > SLAB_OBJ_MAX_NUM) - break; - - if (flags & CFLGS_OFF_SLAB) { - struct kmem_cache *freelist_cache; - size_t freelist_size; - size_t freelist_cache_size; - - freelist_size = num * sizeof(freelist_idx_t); - if (freelist_size > KMALLOC_MAX_CACHE_SIZE) { - freelist_cache_size = PAGE_SIZE << get_order(freelist_size); - } else { - freelist_cache = kmalloc_slab(freelist_size, 0u, _RET_IP_); - if (!freelist_cache) - continue; - freelist_cache_size = freelist_cache->size; - - /* - * Needed to avoid possible looping condition - * in cache_grow_begin() - */ - if (OFF_SLAB(freelist_cache)) - continue; - } - - /* check if off slab has enough benefit */ - if (freelist_cache_size > cachep->size / 2) - continue; - } - - /* Found something acceptable - save it away */ - cachep->num = num; - cachep->gfporder = gfporder; - left_over = remainder; - - /* - * A VFS-reclaimable slab tends to have most allocations - * as GFP_NOFS and we really don't want to have to be allocating - * higher-order pages when we are unable to shrink dcache. - */ - if (flags & SLAB_RECLAIM_ACCOUNT) - break; - - /* - * Large number of objects is good, but very large slabs are - * currently bad for the gfp()s. - */ - if (gfporder >= slab_max_order) - break; - - /* - * Acceptable internal fragmentation? - */ - if (left_over * 8 <= (PAGE_SIZE << gfporder)) - break; - } - return left_over; -} - -static struct array_cache __percpu *alloc_kmem_cache_cpus( - struct kmem_cache *cachep, int entries, int batchcount) -{ - int cpu; - size_t size; - struct array_cache __percpu *cpu_cache; - - size = sizeof(void *) * entries + sizeof(struct array_cache); - cpu_cache = __alloc_percpu(size, sizeof(void *)); - - if (!cpu_cache) - return NULL; - - for_each_possible_cpu(cpu) { - init_arraycache(per_cpu_ptr(cpu_cache, cpu), - entries, batchcount); - } - - return cpu_cache; -} - -static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) -{ - if (slab_state >= FULL) - return enable_cpucache(cachep, gfp); - - cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1); - if (!cachep->cpu_cache) - return 1; - - if (slab_state == DOWN) { - /* Creation of first cache (kmem_cache). */ - set_up_node(kmem_cache, CACHE_CACHE); - } else if (slab_state == PARTIAL) { - /* For kmem_cache_node */ - set_up_node(cachep, SIZE_NODE); - } else { - int node; - - for_each_online_node(node) { - cachep->node[node] = kmalloc_node( - sizeof(struct kmem_cache_node), gfp, node); - BUG_ON(!cachep->node[node]); - kmem_cache_node_init(cachep->node[node]); - } - } - - cachep->node[numa_mem_id()]->next_reap = - jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; - cachep->batchcount = 1; - cachep->limit = BOOT_CPUCACHE_ENTRIES; - return 0; -} - -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name) -{ - return flags; -} - -struct kmem_cache * -__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)) -{ - struct kmem_cache *cachep; - - cachep = find_mergeable(size, align, flags, name, ctor); - if (cachep) { - cachep->refcount++; - - /* - * Adjust the object sizes so that we clear - * the complete object on kzalloc. - */ - cachep->object_size = max_t(int, cachep->object_size, size); - } - return cachep; -} - -static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, - size_t size, slab_flags_t flags) -{ - size_t left; - - cachep->num = 0; - - /* - * If slab auto-initialization on free is enabled, store the freelist - * off-slab, so that its contents don't end up in one of the allocated - * objects. - */ - if (unlikely(slab_want_init_on_free(cachep))) - return false; - - if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) - return false; - - left = calculate_slab_order(cachep, size, - flags | CFLGS_OBJFREELIST_SLAB); - if (!cachep->num) - return false; - - if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) - return false; - - cachep->colour = left / cachep->colour_off; - - return true; -} - -static bool set_off_slab_cache(struct kmem_cache *cachep, - size_t size, slab_flags_t flags) -{ - size_t left; - - cachep->num = 0; - - /* - * Always use on-slab management when SLAB_NOLEAKTRACE - * to avoid recursive calls into kmemleak. - */ - if (flags & SLAB_NOLEAKTRACE) - return false; - - /* - * Size is large, assume best to place the slab management obj - * off-slab (should allow better packing of objs). - */ - left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); - if (!cachep->num) - return false; - - /* - * If the slab has been placed off-slab, and we have enough space then - * move it on-slab. This is at the expense of any extra colouring. - */ - if (left >= cachep->num * sizeof(freelist_idx_t)) - return false; - - cachep->colour = left / cachep->colour_off; - - return true; -} - -static bool set_on_slab_cache(struct kmem_cache *cachep, - size_t size, slab_flags_t flags) -{ - size_t left; - - cachep->num = 0; - - left = calculate_slab_order(cachep, size, flags); - if (!cachep->num) - return false; - - cachep->colour = left / cachep->colour_off; - - return true; -} - -/* - * __kmem_cache_create - Create a cache. - * @cachep: cache management descriptor - * @flags: SLAB flags - * - * Returns zero on success, nonzero on failure. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - */ -int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) -{ - size_t ralign = BYTES_PER_WORD; - gfp_t gfp; - int err; - unsigned int size = cachep->size; - -#if DEBUG -#if FORCED_DEBUG - /* - * Enable redzoning and last user accounting, except for caches with - * large objects, if the increased size would increase the object size - * above the next power of two: caches with object sizes just above a - * power of two have a significant amount of internal fragmentation. - */ - if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + - 2 * sizeof(unsigned long long))) - flags |= SLAB_RED_ZONE | SLAB_STORE_USER; - if (!(flags & SLAB_TYPESAFE_BY_RCU)) - flags |= SLAB_POISON; -#endif -#endif - - /* - * Check that size is in terms of words. This is needed to avoid - * unaligned accesses for some archs when redzoning is used, and makes - * sure any on-slab bufctl's are also correctly aligned. - */ - size = ALIGN(size, BYTES_PER_WORD); - - if (flags & SLAB_RED_ZONE) { - ralign = REDZONE_ALIGN; - /* If redzoning, ensure that the second redzone is suitably - * aligned, by adjusting the object size accordingly. */ - size = ALIGN(size, REDZONE_ALIGN); - } - - /* 3) caller mandated alignment */ - if (ralign < cachep->align) { - ralign = cachep->align; - } - /* disable debug if necessary */ - if (ralign > __alignof__(unsigned long long)) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); - /* - * 4) Store it. - */ - cachep->align = ralign; - cachep->colour_off = cache_line_size(); - /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < cachep->align) - cachep->colour_off = cachep->align; - - if (slab_is_available()) - gfp = GFP_KERNEL; - else - gfp = GFP_NOWAIT; - -#if DEBUG - - /* - * Both debugging options require word-alignment which is calculated - * into align above. - */ - if (flags & SLAB_RED_ZONE) { - /* add space for red zone words */ - cachep->obj_offset += sizeof(unsigned long long); - size += 2 * sizeof(unsigned long long); - } - if (flags & SLAB_STORE_USER) { - /* user store requires one word storage behind the end of - * the real object. But if the second red zone needs to be - * aligned to 64 bits, we must allow that much space. - */ - if (flags & SLAB_RED_ZONE) - size += REDZONE_ALIGN; - else - size += BYTES_PER_WORD; - } -#endif - - kasan_cache_create(cachep, &size, &flags); - - size = ALIGN(size, cachep->align); - /* - * We should restrict the number of objects in a slab to implement - * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. - */ - if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) - size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); - -#if DEBUG - /* - * To activate debug pagealloc, off-slab management is necessary - * requirement. In early phase of initialization, small sized slab - * doesn't get initialized so it would not be possible. So, we need - * to check size >= 256. It guarantees that all necessary small - * sized slab is initialized in current slab initialization sequence. - */ - if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) && - size >= 256 && cachep->object_size > cache_line_size()) { - if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { - size_t tmp_size = ALIGN(size, PAGE_SIZE); - - if (set_off_slab_cache(cachep, tmp_size, flags)) { - flags |= CFLGS_OFF_SLAB; - cachep->obj_offset += tmp_size - size; - size = tmp_size; - goto done; - } - } - } -#endif - - if (set_objfreelist_slab_cache(cachep, size, flags)) { - flags |= CFLGS_OBJFREELIST_SLAB; - goto done; - } - - if (set_off_slab_cache(cachep, size, flags)) { - flags |= CFLGS_OFF_SLAB; - goto done; - } - - if (set_on_slab_cache(cachep, size, flags)) - goto done; - - return -E2BIG; - -done: - cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); - cachep->flags = flags; - cachep->allocflags = __GFP_COMP; - if (flags & SLAB_CACHE_DMA) - cachep->allocflags |= GFP_DMA; - if (flags & SLAB_CACHE_DMA32) - cachep->allocflags |= GFP_DMA32; - if (flags & SLAB_RECLAIM_ACCOUNT) - cachep->allocflags |= __GFP_RECLAIMABLE; - cachep->size = size; - cachep->reciprocal_buffer_size = reciprocal_value(size); - -#if DEBUG - /* - * If we're going to use the generic kernel_map_pages() - * poisoning, then it's going to smash the contents of - * the redzone and userword anyhow, so switch them off. - */ - if (IS_ENABLED(CONFIG_PAGE_POISONING) && - (cachep->flags & SLAB_POISON) && - is_debug_pagealloc_cache(cachep)) - cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); -#endif - - err = setup_cpu_cache(cachep, gfp); - if (err) { - __kmem_cache_release(cachep); - return err; - } - - return 0; -} - -#if DEBUG -static void check_irq_off(void) -{ - BUG_ON(!irqs_disabled()); -} - -static void check_irq_on(void) -{ - BUG_ON(irqs_disabled()); -} - -static void check_mutex_acquired(void) -{ - BUG_ON(!mutex_is_locked(&slab_mutex)); -} - -static void check_spinlock_acquired(struct kmem_cache *cachep) -{ -#ifdef CONFIG_SMP - check_irq_off(); - assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); -#endif -} - -static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) -{ -#ifdef CONFIG_SMP - check_irq_off(); - assert_raw_spin_locked(&get_node(cachep, node)->list_lock); -#endif -} - -#else -#define check_irq_off() do { } while(0) -#define check_irq_on() do { } while(0) -#define check_mutex_acquired() do { } while(0) -#define check_spinlock_acquired(x) do { } while(0) -#define check_spinlock_acquired_node(x, y) do { } while(0) -#endif - -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int node, bool free_all, struct list_head *list) -{ - int tofree; - - if (!ac || !ac->avail) - return; - - tofree = free_all ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) - tofree = (ac->avail + 1) / 2; - - free_block(cachep, ac->entry, tofree, node, list); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); -} - -static void do_drain(void *arg) -{ - struct kmem_cache *cachep = arg; - struct array_cache *ac; - int node = numa_mem_id(); - struct kmem_cache_node *n; - LIST_HEAD(list); - - check_irq_off(); - ac = cpu_cache_get(cachep); - n = get_node(cachep, node); - raw_spin_lock(&n->list_lock); - free_block(cachep, ac->entry, ac->avail, node, &list); - raw_spin_unlock(&n->list_lock); - ac->avail = 0; - slabs_destroy(cachep, &list); -} - -static void drain_cpu_caches(struct kmem_cache *cachep) -{ - struct kmem_cache_node *n; - int node; - LIST_HEAD(list); - - on_each_cpu(do_drain, cachep, 1); - check_irq_on(); - for_each_kmem_cache_node(cachep, node, n) - if (n->alien) - drain_alien_cache(cachep, n->alien); - - for_each_kmem_cache_node(cachep, node, n) { - raw_spin_lock_irq(&n->list_lock); - drain_array_locked(cachep, n->shared, node, true, &list); - raw_spin_unlock_irq(&n->list_lock); - - slabs_destroy(cachep, &list); - } -} - -/* - * Remove slabs from the list of free slabs. - * Specify the number of slabs to drain in tofree. - * - * Returns the actual number of slabs released. - */ -static int drain_freelist(struct kmem_cache *cache, - struct kmem_cache_node *n, int tofree) -{ - struct list_head *p; - int nr_freed; - struct slab *slab; - - nr_freed = 0; - while (nr_freed < tofree && !list_empty(&n->slabs_free)) { - - raw_spin_lock_irq(&n->list_lock); - p = n->slabs_free.prev; - if (p == &n->slabs_free) { - raw_spin_unlock_irq(&n->list_lock); - goto out; - } - - slab = list_entry(p, struct slab, slab_list); - list_del(&slab->slab_list); - n->free_slabs--; - n->total_slabs--; - /* - * Safe to drop the lock. The slab is no longer linked - * to the cache. - */ - n->free_objects -= cache->num; - raw_spin_unlock_irq(&n->list_lock); - slab_destroy(cache, slab); - nr_freed++; - - cond_resched(); - } -out: - return nr_freed; -} - -bool __kmem_cache_empty(struct kmem_cache *s) -{ - int node; - struct kmem_cache_node *n; - - for_each_kmem_cache_node(s, node, n) - if (!list_empty(&n->slabs_full) || - !list_empty(&n->slabs_partial)) - return false; - return true; -} - -int __kmem_cache_shrink(struct kmem_cache *cachep) -{ - int ret = 0; - int node; - struct kmem_cache_node *n; - - drain_cpu_caches(cachep); - - check_irq_on(); - for_each_kmem_cache_node(cachep, node, n) { - drain_freelist(cachep, n, INT_MAX); - - ret += !list_empty(&n->slabs_full) || - !list_empty(&n->slabs_partial); - } - return (ret ? 1 : 0); -} - -int __kmem_cache_shutdown(struct kmem_cache *cachep) -{ - return __kmem_cache_shrink(cachep); -} - -void __kmem_cache_release(struct kmem_cache *cachep) -{ - int i; - struct kmem_cache_node *n; - - cache_random_seq_destroy(cachep); - - free_percpu(cachep->cpu_cache); - - /* NUMA: free the node structures */ - for_each_kmem_cache_node(cachep, i, n) { - kfree(n->shared); - free_alien_cache(n->alien); - kfree(n); - cachep->node[i] = NULL; - } -} - -/* - * Get the memory for a slab management obj. - * - * For a slab cache when the slab descriptor is off-slab, the - * slab descriptor can't come from the same cache which is being created, - * Because if it is the case, that means we defer the creation of - * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. - * And we eventually call down to __kmem_cache_create(), which - * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. - * This is a "chicken-and-egg" problem. - * - * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, - * which are all initialized during kmem_cache_init(). - */ -static void *alloc_slabmgmt(struct kmem_cache *cachep, - struct slab *slab, int colour_off, - gfp_t local_flags, int nodeid) -{ - void *freelist; - void *addr = slab_address(slab); - - slab->s_mem = addr + colour_off; - slab->active = 0; - - if (OBJFREELIST_SLAB(cachep)) - freelist = NULL; - else if (OFF_SLAB(cachep)) { - /* Slab management obj is off-slab. */ - freelist = kmalloc_node(cachep->freelist_size, - local_flags, nodeid); - } else { - /* We will use last bytes at the slab for freelist */ - freelist = addr + (PAGE_SIZE << cachep->gfporder) - - cachep->freelist_size; - } - - return freelist; -} - -static inline freelist_idx_t get_free_obj(struct slab *slab, unsigned int idx) -{ - return ((freelist_idx_t *) slab->freelist)[idx]; -} - -static inline void set_free_obj(struct slab *slab, - unsigned int idx, freelist_idx_t val) -{ - ((freelist_idx_t *)(slab->freelist))[idx] = val; -} - -static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab) -{ -#if DEBUG - int i; - - for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slab, i); - - if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = NULL; - - if (cachep->flags & SLAB_RED_ZONE) { - *dbg_redzone1(cachep, objp) = RED_INACTIVE; - *dbg_redzone2(cachep, objp) = RED_INACTIVE; - } - /* - * Constructors are not allowed to allocate memory from the same - * cache which they are a constructor for. Otherwise, deadlock. - * They must also be threaded. - */ - if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { - kasan_unpoison_object_data(cachep, - objp + obj_offset(cachep)); - cachep->ctor(objp + obj_offset(cachep)); - kasan_poison_object_data( - cachep, objp + obj_offset(cachep)); - } - - if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the end of an object"); - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the start of an object"); - } - /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) { - poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0); - } - } -#endif -} - -#ifdef CONFIG_SLAB_FREELIST_RANDOM -/* Hold information during a freelist initialization */ -struct freelist_init_state { - unsigned int pos; - unsigned int *list; - unsigned int count; -}; - -/* - * Initialize the state based on the randomization method available. - * return true if the pre-computed list is available, false otherwise. - */ -static bool freelist_state_initialize(struct freelist_init_state *state, - struct kmem_cache *cachep, - unsigned int count) -{ - bool ret; - if (!cachep->random_seq) { - ret = false; - } else { - state->list = cachep->random_seq; - state->count = count; - state->pos = get_random_u32_below(count); - ret = true; - } - return ret; -} - -/* Get the next entry on the list and randomize it using a random shift */ -static freelist_idx_t next_random_slot(struct freelist_init_state *state) -{ - if (state->pos >= state->count) - state->pos = 0; - return state->list[state->pos++]; -} - -/* Swap two freelist entries */ -static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b) -{ - swap(((freelist_idx_t *) slab->freelist)[a], - ((freelist_idx_t *) slab->freelist)[b]); -} - -/* - * Shuffle the freelist initialization state based on pre-computed lists. - * return true if the list was successfully shuffled, false otherwise. - */ -static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab) -{ - unsigned int objfreelist = 0, i, rand, count = cachep->num; - struct freelist_init_state state; - bool precomputed; - - if (count < 2) - return false; - - precomputed = freelist_state_initialize(&state, cachep, count); - - /* Take a random entry as the objfreelist */ - if (OBJFREELIST_SLAB(cachep)) { - if (!precomputed) - objfreelist = count - 1; - else - objfreelist = next_random_slot(&state); - slab->freelist = index_to_obj(cachep, slab, objfreelist) + - obj_offset(cachep); - count--; - } - - /* - * On early boot, generate the list dynamically. - * Later use a pre-computed list for speed. - */ - if (!precomputed) { - for (i = 0; i < count; i++) - set_free_obj(slab, i, i); - - /* Fisher-Yates shuffle */ - for (i = count - 1; i > 0; i--) { - rand = get_random_u32_below(i + 1); - swap_free_obj(slab, i, rand); - } - } else { - for (i = 0; i < count; i++) - set_free_obj(slab, i, next_random_slot(&state)); - } - - if (OBJFREELIST_SLAB(cachep)) - set_free_obj(slab, cachep->num - 1, objfreelist); - - return true; -} -#else -static inline bool shuffle_freelist(struct kmem_cache *cachep, - struct slab *slab) -{ - return false; -} -#endif /* CONFIG_SLAB_FREELIST_RANDOM */ - -static void cache_init_objs(struct kmem_cache *cachep, - struct slab *slab) -{ - int i; - void *objp; - bool shuffled; - - cache_init_objs_debug(cachep, slab); - - /* Try to randomize the freelist if enabled */ - shuffled = shuffle_freelist(cachep, slab); - - if (!shuffled && OBJFREELIST_SLAB(cachep)) { - slab->freelist = index_to_obj(cachep, slab, cachep->num - 1) + - obj_offset(cachep); - } - - for (i = 0; i < cachep->num; i++) { - objp = index_to_obj(cachep, slab, i); - objp = kasan_init_slab_obj(cachep, objp); - - /* constructor could break poison info */ - if (DEBUG == 0 && cachep->ctor) { - kasan_unpoison_object_data(cachep, objp); - cachep->ctor(objp); - kasan_poison_object_data(cachep, objp); - } - - if (!shuffled) - set_free_obj(slab, i, i); - } -} - -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab) -{ - void *objp; - - objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active)); - slab->active++; - - return objp; -} - -static void slab_put_obj(struct kmem_cache *cachep, - struct slab *slab, void *objp) -{ - unsigned int objnr = obj_to_index(cachep, slab, objp); -#if DEBUG - unsigned int i; - - /* Verify double free bug */ - for (i = slab->active; i < cachep->num; i++) { - if (get_free_obj(slab, i) == objnr) { - pr_err("slab: double free detected in cache '%s', objp %px\n", - cachep->name, objp); - BUG(); - } - } -#endif - slab->active--; - if (!slab->freelist) - slab->freelist = objp + obj_offset(cachep); - - set_free_obj(slab, slab->active, objnr); -} - -/* - * Grow (by 1) the number of slabs within a cache. This is called by - * kmem_cache_alloc() when there are no active objs left in a cache. - */ -static struct slab *cache_grow_begin(struct kmem_cache *cachep, - gfp_t flags, int nodeid) -{ - void *freelist; - size_t offset; - gfp_t local_flags; - int slab_node; - struct kmem_cache_node *n; - struct slab *slab; - - /* - * Be lazy and only check for valid flags here, keeping it out of the - * critical path in kmem_cache_alloc(). - */ - if (unlikely(flags & GFP_SLAB_BUG_MASK)) - flags = kmalloc_fix_flags(flags); - - WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); - local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - - check_irq_off(); - if (gfpflags_allow_blocking(local_flags)) - local_irq_enable(); - - /* - * Get mem for the objs. Attempt to allocate a physical page from - * 'nodeid'. - */ - slab = kmem_getpages(cachep, local_flags, nodeid); - if (!slab) - goto failed; - - slab_node = slab_nid(slab); - n = get_node(cachep, slab_node); - - /* Get colour for the slab, and cal the next value. */ - n->colour_next++; - if (n->colour_next >= cachep->colour) - n->colour_next = 0; - - offset = n->colour_next; - if (offset >= cachep->colour) - offset = 0; - - offset *= cachep->colour_off; - - /* - * Call kasan_poison_slab() before calling alloc_slabmgmt(), so - * page_address() in the latter returns a non-tagged pointer, - * as it should be for slab pages. - */ - kasan_poison_slab(slab); - - /* Get slab management. */ - freelist = alloc_slabmgmt(cachep, slab, offset, - local_flags & ~GFP_CONSTRAINT_MASK, slab_node); - if (OFF_SLAB(cachep) && !freelist) - goto opps1; - - slab->slab_cache = cachep; - slab->freelist = freelist; - - cache_init_objs(cachep, slab); - - if (gfpflags_allow_blocking(local_flags)) - local_irq_disable(); - - return slab; - -opps1: - kmem_freepages(cachep, slab); -failed: - if (gfpflags_allow_blocking(local_flags)) - local_irq_disable(); - return NULL; -} - -static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) -{ - struct kmem_cache_node *n; - void *list = NULL; - - check_irq_off(); - - if (!slab) - return; - - INIT_LIST_HEAD(&slab->slab_list); - n = get_node(cachep, slab_nid(slab)); - - raw_spin_lock(&n->list_lock); - n->total_slabs++; - if (!slab->active) { - list_add_tail(&slab->slab_list, &n->slabs_free); - n->free_slabs++; - } else - fixup_slab_list(cachep, n, slab, &list); - - STATS_INC_GROWN(cachep); - n->free_objects += cachep->num - slab->active; - raw_spin_unlock(&n->list_lock); - - fixup_objfreelist_debug(cachep, &list); -} - -#if DEBUG - -/* - * Perform extra freeing checks: - * - detect bad pointers. - * - POISON/RED_ZONE checking - */ -static void kfree_debugcheck(const void *objp) -{ - if (!virt_addr_valid(objp)) { - pr_err("kfree_debugcheck: out of range ptr %lxh\n", - (unsigned long)objp); - BUG(); - } -} - -static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) -{ - unsigned long long redzone1, redzone2; - - redzone1 = *dbg_redzone1(cache, obj); - redzone2 = *dbg_redzone2(cache, obj); - - /* - * Redzone is ok. - */ - if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) - return; - - if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) - slab_error(cache, "double free detected"); - else - slab_error(cache, "memory outside object was overwritten"); - - pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", - obj, redzone1, redzone2); -} - -static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - unsigned int objnr; - struct slab *slab; - - BUG_ON(virt_to_cache(objp) != cachep); - - objp -= obj_offset(cachep); - kfree_debugcheck(objp); - slab = virt_to_slab(objp); - - if (cachep->flags & SLAB_RED_ZONE) { - verify_redzone_free(cachep, objp); - *dbg_redzone1(cachep, objp) = RED_INACTIVE; - *dbg_redzone2(cachep, objp) = RED_INACTIVE; - } - if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = (void *)caller; - - objnr = obj_to_index(cachep, slab, objp); - - BUG_ON(objnr >= cachep->num); - BUG_ON(objp != index_to_obj(cachep, slab, objnr)); - - if (cachep->flags & SLAB_POISON) { - poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0); - } - return objp; -} - -#else -#define kfree_debugcheck(x) do { } while(0) -#define cache_free_debugcheck(x, objp, z) (objp) -#endif - -static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, - void **list) -{ -#if DEBUG - void *next = *list; - void *objp; - - while (next) { - objp = next - obj_offset(cachep); - next = *(void **)next; - poison_obj(cachep, objp, POISON_FREE); - } -#endif -} - -static inline void fixup_slab_list(struct kmem_cache *cachep, - struct kmem_cache_node *n, struct slab *slab, - void **list) -{ - /* move slabp to correct slabp list: */ - list_del(&slab->slab_list); - if (slab->active == cachep->num) { - list_add(&slab->slab_list, &n->slabs_full); - if (OBJFREELIST_SLAB(cachep)) { -#if DEBUG - /* Poisoning will be done without holding the lock */ - if (cachep->flags & SLAB_POISON) { - void **objp = slab->freelist; - - *objp = *list; - *list = objp; - } -#endif - slab->freelist = NULL; - } - } else - list_add(&slab->slab_list, &n->slabs_partial); -} - -/* Try to find non-pfmemalloc slab if needed */ -static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n, - struct slab *slab, bool pfmemalloc) -{ - if (!slab) - return NULL; - - if (pfmemalloc) - return slab; - - if (!slab_test_pfmemalloc(slab)) - return slab; - - /* No need to keep pfmemalloc slab if we have enough free objects */ - if (n->free_objects > n->free_limit) { - slab_clear_pfmemalloc(slab); - return slab; - } - - /* Move pfmemalloc slab to the end of list to speed up next search */ - list_del(&slab->slab_list); - if (!slab->active) { - list_add_tail(&slab->slab_list, &n->slabs_free); - n->free_slabs++; - } else - list_add_tail(&slab->slab_list, &n->slabs_partial); - - list_for_each_entry(slab, &n->slabs_partial, slab_list) { - if (!slab_test_pfmemalloc(slab)) - return slab; - } - - n->free_touched = 1; - list_for_each_entry(slab, &n->slabs_free, slab_list) { - if (!slab_test_pfmemalloc(slab)) { - n->free_slabs--; - return slab; - } - } - - return NULL; -} - -static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) -{ - struct slab *slab; - - assert_raw_spin_locked(&n->list_lock); - slab = list_first_entry_or_null(&n->slabs_partial, struct slab, - slab_list); - if (!slab) { - n->free_touched = 1; - slab = list_first_entry_or_null(&n->slabs_free, struct slab, - slab_list); - if (slab) - n->free_slabs--; - } - - if (sk_memalloc_socks()) - slab = get_valid_first_slab(n, slab, pfmemalloc); - - return slab; -} - -static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, - struct kmem_cache_node *n, gfp_t flags) -{ - struct slab *slab; - void *obj; - void *list = NULL; - - if (!gfp_pfmemalloc_allowed(flags)) - return NULL; - - raw_spin_lock(&n->list_lock); - slab = get_first_slab(n, true); - if (!slab) { - raw_spin_unlock(&n->list_lock); - return NULL; - } - - obj = slab_get_obj(cachep, slab); - n->free_objects--; - - fixup_slab_list(cachep, n, slab, &list); - - raw_spin_unlock(&n->list_lock); - fixup_objfreelist_debug(cachep, &list); - - return obj; -} - -/* - * Slab list should be fixed up by fixup_slab_list() for existing slab - * or cache_grow_end() for new slab - */ -static __always_inline int alloc_block(struct kmem_cache *cachep, - struct array_cache *ac, struct slab *slab, int batchcount) -{ - /* - * There must be at least one object available for - * allocation. - */ - BUG_ON(slab->active >= cachep->num); - - while (slab->active < cachep->num && batchcount--) { - STATS_INC_ALLOCED(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - ac->entry[ac->avail++] = slab_get_obj(cachep, slab); - } - - return batchcount; -} - -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) -{ - int batchcount; - struct kmem_cache_node *n; - struct array_cache *ac, *shared; - int node; - void *list = NULL; - struct slab *slab; - - check_irq_off(); - node = numa_mem_id(); - - ac = cpu_cache_get(cachep); - batchcount = ac->batchcount; - if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { - /* - * If there was little recent activity on this cache, then - * perform only a partial refill. Otherwise we could generate - * refill bouncing. - */ - batchcount = BATCHREFILL_LIMIT; - } - n = get_node(cachep, node); - - BUG_ON(ac->avail > 0 || !n); - shared = READ_ONCE(n->shared); - if (!n->free_objects && (!shared || !shared->avail)) - goto direct_grow; - - raw_spin_lock(&n->list_lock); - shared = READ_ONCE(n->shared); - - /* See if we can refill from the shared array */ - if (shared && transfer_objects(ac, shared, batchcount)) { - shared->touched = 1; - goto alloc_done; - } - - while (batchcount > 0) { - /* Get slab alloc is to come from. */ - slab = get_first_slab(n, false); - if (!slab) - goto must_grow; - - check_spinlock_acquired(cachep); - - batchcount = alloc_block(cachep, ac, slab, batchcount); - fixup_slab_list(cachep, n, slab, &list); - } - -must_grow: - n->free_objects -= ac->avail; -alloc_done: - raw_spin_unlock(&n->list_lock); - fixup_objfreelist_debug(cachep, &list); - -direct_grow: - if (unlikely(!ac->avail)) { - /* Check if we can use obj in pfmemalloc slab */ - if (sk_memalloc_socks()) { - void *obj = cache_alloc_pfmemalloc(cachep, n, flags); - - if (obj) - return obj; - } - - slab = cache_grow_begin(cachep, gfp_exact_node(flags), node); - - /* - * cache_grow_begin() can reenable interrupts, - * then ac could change. - */ - ac = cpu_cache_get(cachep); - if (!ac->avail && slab) - alloc_block(cachep, ac, slab, batchcount); - cache_grow_end(cachep, slab); - - if (!ac->avail) - return NULL; - } - ac->touched = 1; - - return ac->entry[--ac->avail]; -} - -#if DEBUG -static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, - gfp_t flags, void *objp, unsigned long caller) -{ - WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); - if (!objp || is_kfence_address(objp)) - return objp; - if (cachep->flags & SLAB_POISON) { - check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1); - poison_obj(cachep, objp, POISON_INUSE); - } - if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = (void *)caller; - - if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || - *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside object was overwritten"); - pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); - } - *dbg_redzone1(cachep, objp) = RED_ACTIVE; - *dbg_redzone2(cachep, objp) = RED_ACTIVE; - } - - objp += obj_offset(cachep); - if (cachep->ctor && cachep->flags & SLAB_POISON) - cachep->ctor(objp); - if ((unsigned long)objp & (arch_slab_minalign() - 1)) { - pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp, - arch_slab_minalign()); - } - return objp; -} -#else -#define cache_alloc_debugcheck_after(a, b, objp, d) (objp) -#endif - -static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) -{ - void *objp; - struct array_cache *ac; - - check_irq_off(); - - ac = cpu_cache_get(cachep); - if (likely(ac->avail)) { - ac->touched = 1; - objp = ac->entry[--ac->avail]; - - STATS_INC_ALLOCHIT(cachep); - goto out; - } - - STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); - /* - * the 'ac' may be updated by cache_alloc_refill(), - * and kmemleak_erase() requires its correct value. - */ - ac = cpu_cache_get(cachep); - -out: - /* - * To avoid a false negative, if an object that is in one of the - * per-CPU caches is leaked, we need to make sure kmemleak doesn't - * treat the array pointers as a reference to the object. - */ - if (objp) - kmemleak_erase(&ac->entry[ac->avail]); - return objp; -} - -#ifdef CONFIG_NUMA -static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); - -/* - * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set. - * - * If we are in_interrupt, then process context, including cpusets and - * mempolicy, may not apply and should not be used for allocation policy. - */ -static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) -{ - int nid_alloc, nid_here; - - if (in_interrupt() || (flags & __GFP_THISNODE)) - return NULL; - nid_alloc = nid_here = numa_mem_id(); - if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) - nid_alloc = cpuset_slab_spread_node(); - else if (current->mempolicy) - nid_alloc = mempolicy_slab_node(); - if (nid_alloc != nid_here) - return ____cache_alloc_node(cachep, flags, nid_alloc); - return NULL; -} - -/* - * Fallback function if there was no memory available and no objects on a - * certain node and fall back is permitted. First we scan all the - * available node for available objects. If that fails then we - * perform an allocation without specifying a node. This allows the page - * allocator to do its reclaim / fallback magic. We then insert the - * slab into the proper nodelist and then allocate from it. - */ -static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) -{ - struct zonelist *zonelist; - struct zoneref *z; - struct zone *zone; - enum zone_type highest_zoneidx = gfp_zone(flags); - void *obj = NULL; - struct slab *slab; - int nid; - unsigned int cpuset_mems_cookie; - - if (flags & __GFP_THISNODE) - return NULL; - -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(mempolicy_slab_node(), flags); - -retry: - /* - * Look through allowed nodes for objects available - * from existing per node queues. - */ - for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { - nid = zone_to_nid(zone); - - if (cpuset_zone_allowed(zone, flags) && - get_node(cache, nid) && - get_node(cache, nid)->free_objects) { - obj = ____cache_alloc_node(cache, - gfp_exact_node(flags), nid); - if (obj) - break; - } - } - - if (!obj) { - /* - * This allocation will be performed within the constraints - * of the current cpuset / memory policy requirements. - * We may trigger various forms of reclaim on the allowed - * set and go into memory reserves if necessary. - */ - slab = cache_grow_begin(cache, flags, numa_mem_id()); - cache_grow_end(cache, slab); - if (slab) { - nid = slab_nid(slab); - obj = ____cache_alloc_node(cache, - gfp_exact_node(flags), nid); - - /* - * Another processor may allocate the objects in - * the slab since we are not holding any locks. - */ - if (!obj) - goto retry; - } - } - - if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - return obj; -} - -/* - * An interface to enable slab creation on nodeid - */ -static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid) -{ - struct slab *slab; - struct kmem_cache_node *n; - void *obj = NULL; - void *list = NULL; - - VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); - n = get_node(cachep, nodeid); - BUG_ON(!n); - - check_irq_off(); - raw_spin_lock(&n->list_lock); - slab = get_first_slab(n, false); - if (!slab) - goto must_grow; - - check_spinlock_acquired_node(cachep, nodeid); - - STATS_INC_NODEALLOCS(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - BUG_ON(slab->active == cachep->num); - - obj = slab_get_obj(cachep, slab); - n->free_objects--; - - fixup_slab_list(cachep, n, slab, &list); - - raw_spin_unlock(&n->list_lock); - fixup_objfreelist_debug(cachep, &list); - return obj; - -must_grow: - raw_spin_unlock(&n->list_lock); - slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); - if (slab) { - /* This slab isn't counted yet so don't update free_objects */ - obj = slab_get_obj(cachep, slab); - } - cache_grow_end(cachep, slab); - - return obj ? obj : fallback_alloc(cachep, flags); -} - -static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) -{ - void *objp = NULL; - int slab_node = numa_mem_id(); - - if (nodeid == NUMA_NO_NODE) { - if (current->mempolicy || cpuset_do_slab_mem_spread()) { - objp = alternate_node_alloc(cachep, flags); - if (objp) - goto out; - } - /* - * Use the locally cached objects if possible. - * However ____cache_alloc does not allow fallback - * to other nodes. It may fail while we still have - * objects on other nodes available. - */ - objp = ____cache_alloc(cachep, flags); - nodeid = slab_node; - } else if (nodeid == slab_node) { - objp = ____cache_alloc(cachep, flags); - } else if (!get_node(cachep, nodeid)) { - /* Node not bootstrapped yet */ - objp = fallback_alloc(cachep, flags); - goto out; - } - - /* - * We may just have run out of memory on the local node. - * ____cache_alloc_node() knows how to locate memory on other nodes - */ - if (!objp) - objp = ____cache_alloc_node(cachep, flags, nodeid); -out: - return objp; -} -#else - -static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused) -{ - return ____cache_alloc(cachep, flags); -} - -#endif /* CONFIG_NUMA */ - -static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, - int nodeid, size_t orig_size, unsigned long caller) -{ - unsigned long save_flags; - void *objp; - struct obj_cgroup *objcg = NULL; - bool init = false; - - flags &= gfp_allowed_mask; - cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags); - if (unlikely(!cachep)) - return NULL; - - objp = kfence_alloc(cachep, orig_size, flags); - if (unlikely(objp)) - goto out; - - local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags, nodeid); - local_irq_restore(save_flags); - objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - prefetchw(objp); - init = slab_want_init_on_alloc(flags, cachep); - -out: - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init, - cachep->object_size); - return objp; -} - -static __always_inline void * -slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, - size_t orig_size, unsigned long caller) -{ - return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size, - caller); -} - -/* - * Caller needs to acquire correct kmem_cache_node's list_lock - * @list: List of detached free slabs should be freed by caller - */ -static void free_block(struct kmem_cache *cachep, void **objpp, - int nr_objects, int node, struct list_head *list) -{ - int i; - struct kmem_cache_node *n = get_node(cachep, node); - struct slab *slab; - - n->free_objects += nr_objects; - - for (i = 0; i < nr_objects; i++) { - void *objp; - struct slab *slab; - - objp = objpp[i]; - - slab = virt_to_slab(objp); - list_del(&slab->slab_list); - check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, slab, objp); - STATS_DEC_ACTIVE(cachep); - - /* fixup slab chains */ - if (slab->active == 0) { - list_add(&slab->slab_list, &n->slabs_free); - n->free_slabs++; - } else { - /* Unconditionally move a slab to the end of the - * partial list on free - maximum time for the - * other objects to be freed, too. - */ - list_add_tail(&slab->slab_list, &n->slabs_partial); - } - } - - while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { - n->free_objects -= cachep->num; - - slab = list_last_entry(&n->slabs_free, struct slab, slab_list); - list_move(&slab->slab_list, list); - n->free_slabs--; - n->total_slabs--; - } -} - -static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) -{ - int batchcount; - struct kmem_cache_node *n; - int node = numa_mem_id(); - LIST_HEAD(list); - - batchcount = ac->batchcount; - - check_irq_off(); - n = get_node(cachep, node); - raw_spin_lock(&n->list_lock); - if (n->shared) { - struct array_cache *shared_array = n->shared; - int max = shared_array->limit - shared_array->avail; - if (max) { - if (batchcount > max) - batchcount = max; - memcpy(&(shared_array->entry[shared_array->avail]), - ac->entry, sizeof(void *) * batchcount); - shared_array->avail += batchcount; - goto free_done; - } - } - - free_block(cachep, ac->entry, batchcount, node, &list); -free_done: -#if STATS - { - int i = 0; - struct slab *slab; - - list_for_each_entry(slab, &n->slabs_free, slab_list) { - BUG_ON(slab->active); - - i++; - } - STATS_SET_FREEABLE(cachep, i); - } -#endif - raw_spin_unlock(&n->list_lock); - ac->avail -= batchcount; - memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); - slabs_destroy(cachep, &list); -} - -/* - * Release an obj back to its cache. If the obj has a constructed state, it must - * be in this state _before_ it is released. Called with disabled ints. - */ -static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - bool init; - - memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1); - - if (is_kfence_address(objp)) { - kmemleak_free_recursive(objp, cachep->flags); - __kfence_free(objp); - return; - } - - /* - * As memory initialization might be integrated into KASAN, - * kasan_slab_free and initialization memset must be - * kept together to avoid discrepancies in behavior. - */ - init = slab_want_init_on_free(cachep); - if (init && !kasan_has_integrated_init()) - memset(objp, 0, cachep->object_size); - /* KASAN might put objp into memory quarantine, delaying its reuse. */ - if (kasan_slab_free(cachep, objp, init)) - return; - - /* Use KCSAN to help debug racy use-after-free. */ - if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU)) - __kcsan_check_access(objp, cachep->object_size, - KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); - - ___cache_free(cachep, objp, caller); -} - -void ___cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - struct array_cache *ac = cpu_cache_get(cachep); - - check_irq_off(); - kmemleak_free_recursive(objp, cachep->flags); - objp = cache_free_debugcheck(cachep, objp, caller); - - /* - * Skip calling cache_free_alien() when the platform is not numa. - * This will avoid cache misses that happen while accessing slabp (which - * is per page memory reference) to get nodeid. Instead use a global - * variable to skip the call, which is mostly likely to be present in - * the cache. - */ - if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) - return; - - if (ac->avail < ac->limit) { - STATS_INC_FREEHIT(cachep); - } else { - STATS_INC_FREEMISS(cachep); - cache_flusharray(cachep, ac); - } - - if (sk_memalloc_socks()) { - struct slab *slab = virt_to_slab(objp); - - if (unlikely(slab_test_pfmemalloc(slab))) { - cache_free_pfmemalloc(cachep, slab, objp); - return; - } - } - - __free_one(ac, objp); -} - -static __always_inline -void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, - gfp_t flags) -{ - void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); - - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE); - - return ret; -} - -void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) -{ - return __kmem_cache_alloc_lru(cachep, NULL, flags); -} -EXPORT_SYMBOL(kmem_cache_alloc); - -void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, - gfp_t flags) -{ - return __kmem_cache_alloc_lru(cachep, lru, flags); -} -EXPORT_SYMBOL(kmem_cache_alloc_lru); - -static __always_inline void -cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p, unsigned long caller) -{ - size_t i; - - for (i = 0; i < size; i++) - p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); -} - -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) -{ - struct obj_cgroup *objcg = NULL; - unsigned long irqflags; - size_t i; - - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); - if (!s) - return 0; - - local_irq_save(irqflags); - for (i = 0; i < size; i++) { - void *objp = kfence_alloc(s, s->object_size, flags) ?: - __do_cache_alloc(s, flags, NUMA_NO_NODE); - - if (unlikely(!objp)) - goto error; - p[i] = objp; - } - local_irq_restore(irqflags); - - cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); - - /* - * memcg and kmem_cache debug support and memory initialization. - * Done outside of the IRQ disabled section. - */ - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s), s->object_size); - /* FIXME: Trace call missing. Christoph would like a bulk variant */ - return size; -error: - local_irq_restore(irqflags); - cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); - slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); - kmem_cache_free_bulk(s, i, p); - return 0; -} -EXPORT_SYMBOL(kmem_cache_alloc_bulk); - -void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) -{ - void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid); - - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_node); - -void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid, size_t orig_size, - unsigned long caller) -{ - return slab_alloc_node(cachep, NULL, flags, nodeid, - orig_size, caller); -} - -#ifdef CONFIG_PRINTK -void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) -{ - struct kmem_cache *cachep; - unsigned int objnr; - void *objp; - - kpp->kp_ptr = object; - kpp->kp_slab = slab; - cachep = slab->slab_cache; - kpp->kp_slab_cache = cachep; - objp = object - obj_offset(cachep); - kpp->kp_data_offset = obj_offset(cachep); - slab = virt_to_slab(objp); - objnr = obj_to_index(cachep, slab, objp); - objp = index_to_obj(cachep, slab, objnr); - kpp->kp_objp = objp; - if (DEBUG && cachep->flags & SLAB_STORE_USER) - kpp->kp_ret = *dbg_userword(cachep, objp); -} -#endif - -static __always_inline -void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - unsigned long flags; - - local_irq_save(flags); - debug_check_no_locks_freed(objp, cachep->object_size); - if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, cachep->object_size); - __cache_free(cachep, objp, caller); - local_irq_restore(flags); -} - -void __kmem_cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) -{ - __do_kmem_cache_free(cachep, objp, caller); -} - -void kmem_cache_free(struct kmem_cache *cachep, void *objp) -{ - cachep = cache_from_obj(cachep, objp); - if (!cachep) - return; - - trace_kmem_cache_free(_RET_IP_, objp, cachep); - __do_kmem_cache_free(cachep, objp, _RET_IP_); -} -EXPORT_SYMBOL(kmem_cache_free); - -void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) -{ - unsigned long flags; - - local_irq_save(flags); - for (int i = 0; i < size; i++) { - void *objp = p[i]; - struct kmem_cache *s; - - if (!orig_s) { - struct folio *folio = virt_to_folio(objp); - - /* called via kfree_bulk */ - if (!folio_test_slab(folio)) { - local_irq_restore(flags); - free_large_kmalloc(folio, objp); - local_irq_save(flags); - continue; - } - s = folio_slab(folio)->slab_cache; - } else { - s = cache_from_obj(orig_s, objp); - } - - if (!s) - continue; - - debug_check_no_locks_freed(objp, s->object_size); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, s->object_size); - - __cache_free(s, objp, _RET_IP_); - } - local_irq_restore(flags); - - /* FIXME: add tracing */ -} -EXPORT_SYMBOL(kmem_cache_free_bulk); - -/* - * This initializes kmem_cache_node or resizes various caches for all nodes. - */ -static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp) -{ - int ret; - int node; - struct kmem_cache_node *n; - - for_each_online_node(node) { - ret = setup_kmem_cache_node(cachep, node, gfp, true); - if (ret) - goto fail; - - } - - return 0; - -fail: - if (!cachep->list.next) { - /* Cache is not active yet. Roll back what we did */ - node--; - while (node >= 0) { - n = get_node(cachep, node); - if (n) { - kfree(n->shared); - free_alien_cache(n->alien); - kfree(n); - cachep->node[node] = NULL; - } - node--; - } - } - return -ENOMEM; -} - -/* Always called with the slab_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, - int batchcount, int shared, gfp_t gfp) -{ - struct array_cache __percpu *cpu_cache, *prev; - int cpu; - - cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); - if (!cpu_cache) - return -ENOMEM; - - prev = cachep->cpu_cache; - cachep->cpu_cache = cpu_cache; - /* - * Without a previous cpu_cache there's no need to synchronize remote - * cpus, so skip the IPIs. - */ - if (prev) - kick_all_cpus_sync(); - - check_irq_on(); - cachep->batchcount = batchcount; - cachep->limit = limit; - cachep->shared = shared; - - if (!prev) - goto setup_node; - - for_each_online_cpu(cpu) { - LIST_HEAD(list); - int node; - struct kmem_cache_node *n; - struct array_cache *ac = per_cpu_ptr(prev, cpu); - - node = cpu_to_mem(cpu); - n = get_node(cachep, node); - raw_spin_lock_irq(&n->list_lock); - free_block(cachep, ac->entry, ac->avail, node, &list); - raw_spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); - } - free_percpu(prev); - -setup_node: - return setup_kmem_cache_nodes(cachep, gfp); -} - -/* Called with slab_mutex held always */ -static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) -{ - int err; - int limit = 0; - int shared = 0; - int batchcount = 0; - - err = cache_random_seq_create(cachep, cachep->num, gfp); - if (err) - goto end; - - /* - * The head array serves three purposes: - * - create a LIFO ordering, i.e. return objects that are cache-warm - * - reduce the number of spinlock operations. - * - reduce the number of linked list operations on the slab and - * bufctl chains: array operations are cheaper. - * The numbers are guessed, we should auto-tune as described by - * Bonwick. - */ - if (cachep->size > 131072) - limit = 1; - else if (cachep->size > PAGE_SIZE) - limit = 8; - else if (cachep->size > 1024) - limit = 24; - else if (cachep->size > 256) - limit = 54; - else - limit = 120; - - /* - * CPU bound tasks (e.g. network routing) can exhibit cpu bound - * allocation behaviour: Most allocs on one cpu, most free operations - * on another cpu. For these cases, an efficient object passing between - * cpus is necessary. This is provided by a shared array. The array - * replaces Bonwick's magazine layer. - * On uniprocessor, it's functionally equivalent (but less efficient) - * to a larger limit. Thus disabled by default. - */ - shared = 0; - if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) - shared = 8; - -#if DEBUG - /* - * With debugging enabled, large batchcount lead to excessively long - * periods with disabled local interrupts. Limit the batchcount - */ - if (limit > 32) - limit = 32; -#endif - batchcount = (limit + 1) / 2; - err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); -end: - if (err) - pr_err("enable_cpucache failed for %s, error %d\n", - cachep->name, -err); - return err; -} - -/* - * Drain an array if it contains any elements taking the node lock only if - * necessary. Note that the node listlock also protects the array_cache - * if drain_array() is used on the shared array. - */ -static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, int node) -{ - LIST_HEAD(list); - - /* ac from n->shared can be freed if we don't hold the slab_mutex. */ - check_mutex_acquired(); - - if (!ac || !ac->avail) - return; - - if (ac->touched) { - ac->touched = 0; - return; - } - - raw_spin_lock_irq(&n->list_lock); - drain_array_locked(cachep, ac, node, false, &list); - raw_spin_unlock_irq(&n->list_lock); - - slabs_destroy(cachep, &list); -} - -/** - * cache_reap - Reclaim memory from caches. - * @w: work descriptor - * - * Called from workqueue/eventd every few seconds. - * Purpose: - * - clear the per-cpu caches for this CPU. - * - return freeable pages to the main free memory pool. - * - * If we cannot acquire the cache chain mutex then just give up - we'll try - * again on the next iteration. - */ -static void cache_reap(struct work_struct *w) -{ - struct kmem_cache *searchp; - struct kmem_cache_node *n; - int node = numa_mem_id(); - struct delayed_work *work = to_delayed_work(w); - - if (!mutex_trylock(&slab_mutex)) - /* Give up. Setup the next iteration. */ - goto out; - - list_for_each_entry(searchp, &slab_caches, list) { - check_irq_on(); - - /* - * We only take the node lock if absolutely necessary and we - * have established with reasonable certainty that - * we can do some work if the lock was obtained. - */ - n = get_node(searchp, node); - - reap_alien(searchp, n); - - drain_array(searchp, n, cpu_cache_get(searchp), node); - - /* - * These are racy checks but it does not matter - * if we skip one check or scan twice. - */ - if (time_after(n->next_reap, jiffies)) - goto next; - - n->next_reap = jiffies + REAPTIMEOUT_NODE; - - drain_array(searchp, n, n->shared, node); - - if (n->free_touched) - n->free_touched = 0; - else { - int freed; - - freed = drain_freelist(searchp, n, (n->free_limit + - 5 * searchp->num - 1) / (5 * searchp->num)); - STATS_ADD_REAPED(searchp, freed); - } -next: - cond_resched(); - } - check_irq_on(); - mutex_unlock(&slab_mutex); - next_reap_node(); -out: - /* Set up the next iteration */ - schedule_delayed_work_on(smp_processor_id(), work, - round_jiffies_relative(REAPTIMEOUT_AC)); -} - -void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) -{ - unsigned long active_objs, num_objs, active_slabs; - unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0; - unsigned long free_slabs = 0; - int node; - struct kmem_cache_node *n; - - for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); - raw_spin_lock_irq(&n->list_lock); - - total_slabs += n->total_slabs; - free_slabs += n->free_slabs; - free_objs += n->free_objects; - - if (n->shared) - shared_avail += n->shared->avail; - - raw_spin_unlock_irq(&n->list_lock); - } - num_objs = total_slabs * cachep->num; - active_slabs = total_slabs - free_slabs; - active_objs = num_objs - free_objs; - - sinfo->active_objs = active_objs; - sinfo->num_objs = num_objs; - sinfo->active_slabs = active_slabs; - sinfo->num_slabs = total_slabs; - sinfo->shared_avail = shared_avail; - sinfo->limit = cachep->limit; - sinfo->batchcount = cachep->batchcount; - sinfo->shared = cachep->shared; - sinfo->objects_per_slab = cachep->num; - sinfo->cache_order = cachep->gfporder; -} - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) -{ -#if STATS - { /* node stats */ - unsigned long high = cachep->high_mark; - unsigned long allocs = cachep->num_allocations; - unsigned long grown = cachep->grown; - unsigned long reaped = cachep->reaped; - unsigned long errors = cachep->errors; - unsigned long max_freeable = cachep->max_freeable; - unsigned long node_allocs = cachep->node_allocs; - unsigned long node_frees = cachep->node_frees; - unsigned long overflows = cachep->node_overflow; - - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", - allocs, high, grown, - reaped, errors, max_freeable, node_allocs, - node_frees, overflows); - } - /* cpu stats */ - { - unsigned long allochit = atomic_read(&cachep->allochit); - unsigned long allocmiss = atomic_read(&cachep->allocmiss); - unsigned long freehit = atomic_read(&cachep->freehit); - unsigned long freemiss = atomic_read(&cachep->freemiss); - - seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", - allochit, allocmiss, freehit, freemiss); - } -#endif -} - -#define MAX_SLABINFO_WRITE 128 -/** - * slabinfo_write - Tuning for the slab allocator - * @file: unused - * @buffer: user buffer - * @count: data length - * @ppos: unused - * - * Return: %0 on success, negative error code otherwise. - */ -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; - int limit, batchcount, shared, res; - struct kmem_cache *cachep; - - if (count > MAX_SLABINFO_WRITE) - return -EINVAL; - if (copy_from_user(&kbuf, buffer, count)) - return -EFAULT; - kbuf[MAX_SLABINFO_WRITE] = '\0'; - - tmp = strchr(kbuf, ' '); - if (!tmp) - return -EINVAL; - *tmp = '\0'; - tmp++; - if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) - return -EINVAL; - - /* Find the cache in the chain of caches. */ - mutex_lock(&slab_mutex); - res = -EINVAL; - list_for_each_entry(cachep, &slab_caches, list) { - if (!strcmp(cachep->name, kbuf)) { - if (limit < 1 || batchcount < 1 || - batchcount > limit || shared < 0) { - res = 0; - } else { - res = do_tune_cpucache(cachep, limit, - batchcount, shared, - GFP_KERNEL); - } - break; - } - } - mutex_unlock(&slab_mutex); - if (res >= 0) - res = count; - return res; -} - -#ifdef CONFIG_HARDENED_USERCOPY -/* - * Rejects incorrectly sized objects and objects that are to be copied - * to/from userspace but do not fall entirely within the containing slab - * cache's usercopy region. - * - * Returns NULL if check passes, otherwise const char * to name of cache - * to indicate an error. - */ -void __check_heap_object(const void *ptr, unsigned long n, - const struct slab *slab, bool to_user) -{ - struct kmem_cache *cachep; - unsigned int objnr; - unsigned long offset; - - ptr = kasan_reset_tag(ptr); - - /* Find and validate object. */ - cachep = slab->slab_cache; - objnr = obj_to_index(cachep, slab, (void *)ptr); - BUG_ON(objnr >= cachep->num); - - /* Find offset within object. */ - if (is_kfence_address(ptr)) - offset = ptr - kfence_object_start(ptr); - else - offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep); - - /* Allow address range falling entirely within usercopy region. */ - if (offset >= cachep->useroffset && - offset - cachep->useroffset <= cachep->usersize && - n <= cachep->useroffset - offset + cachep->usersize) - return; - - usercopy_abort("SLAB object", cachep->name, to_user, offset, n); -} -#endif /* CONFIG_HARDENED_USERCOPY */ From 7ef08ae8277c66657127844179912214c67fb4bc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 09:54:15 +0200 Subject: [PATCH 19/34] mm/slab: move struct kmem_cache_cpu declaration to slub.c Nothing outside SLUB itself accesses the struct kmem_cache_cpu fields so it does not need to be declared in slub_def.h. This allows also to move enum stat_item. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slub_def.h | 54 ---------------------------------------- mm/slub.c | 54 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index deb90cf4bffb..a0229ea42977 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -12,60 +12,6 @@ #include #include -enum stat_item { - ALLOC_FASTPATH, /* Allocation from cpu slab */ - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ - FREE_FASTPATH, /* Free to cpu slab */ - FREE_SLOWPATH, /* Freeing not to cpu slab */ - FREE_FROZEN, /* Freeing to frozen slab */ - FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ - FREE_REMOVE_PARTIAL, /* Freeing removes last object */ - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ - FREE_SLAB, /* Slab freed to the page allocator */ - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ - DEACTIVATE_BYPASS, /* Implicit deactivation */ - ORDER_FALLBACK, /* Number of times fallback was necessary */ - CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ - CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ - NR_SLUB_STAT_ITEMS -}; - -#ifndef CONFIG_SLUB_TINY -/* - * When changing the layout, make sure freelist and tid are still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { - union { - struct { - void **freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ - }; - freelist_aba_t freelist_tid; - }; - struct slab *slab; /* The slab from which we are allocating */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated frozen slabs */ -#endif - local_lock_t lock; /* Protects the fields above */ -#ifdef CONFIG_SLUB_STATS - unsigned stat[NR_SLUB_STAT_ITEMS]; -#endif -}; -#endif /* CONFIG_SLUB_TINY */ - #ifdef CONFIG_SLUB_CPU_PARTIAL #define slub_percpu_partial(c) ((c)->partial) diff --git a/mm/slub.c b/mm/slub.c index 3e01731783df..979932d046fd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -330,6 +330,60 @@ static void debugfs_slab_add(struct kmem_cache *); static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif +enum stat_item { + ALLOC_FASTPATH, /* Allocation from cpu slab */ + ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_FASTPATH, /* Free to cpu slab */ + FREE_SLOWPATH, /* Freeing not to cpu slab */ + FREE_FROZEN, /* Freeing to frozen slab */ + FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ + FREE_REMOVE_PARTIAL, /* Freeing removes last object */ + ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ + ALLOC_SLAB, /* Cpu slab acquired from page allocator */ + ALLOC_REFILL, /* Refill cpu slab from slab freelist */ + ALLOC_NODE_MISMATCH, /* Switching cpu slab */ + FREE_SLAB, /* Slab freed to the page allocator */ + CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ + DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ + DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ + DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ + DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ + DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ + DEACTIVATE_BYPASS, /* Implicit deactivation */ + ORDER_FALLBACK, /* Number of times fallback was necessary */ + CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ + CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ + CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ + CPU_PARTIAL_FREE, /* Refill cpu partial on free */ + CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ + CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + NR_SLUB_STAT_ITEMS +}; + +#ifndef CONFIG_SLUB_TINY +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. + */ +struct kmem_cache_cpu { + union { + struct { + void **freelist; /* Pointer to next available object */ + unsigned long tid; /* Globally unique transaction id */ + }; + freelist_aba_t freelist_tid; + }; + struct slab *slab; /* The slab from which we are allocating */ +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct slab *partial; /* Partially allocated frozen slabs */ +#endif + local_lock_t lock; /* Protects the fields above */ +#ifdef CONFIG_SLUB_STATS + unsigned int stat[NR_SLUB_STAT_ITEMS]; +#endif +}; +#endif /* CONFIG_SLUB_TINY */ + static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS From 19975f83412fbb9b1458f3dfbf16ca043a57788a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 09:59:48 +0200 Subject: [PATCH 20/34] mm/slab: move the rest of slub_def.h to mm/slab.h mm/slab.h is the only place to include include/linux/slub_def.h which has allowed switching between SLAB and SLUB. Now we can simply move the contents over and remove slub_def.h. Use this opportunity to fix up some whitespace (alignment) issues. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slub_def.h | 150 --------------------------------------- mm/slab.h | 138 ++++++++++++++++++++++++++++++++++- 2 files changed, 137 insertions(+), 151 deletions(-) delete mode 100644 include/linux/slub_def.h diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h deleted file mode 100644 index a0229ea42977..000000000000 --- a/include/linux/slub_def.h +++ /dev/null @@ -1,150 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SLUB_DEF_H -#define _LINUX_SLUB_DEF_H - -/* - * SLUB : A Slab allocator without object queues. - * - * (C) 2007 SGI, Christoph Lameter - */ -#include -#include -#include -#include - -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_percpu_partial(c) ((c)->partial) - -#define slub_set_percpu_partial(c, p) \ -({ \ - slub_percpu_partial(c) = (p)->next; \ -}) - -#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) -#else -#define slub_percpu_partial(c) NULL - -#define slub_set_percpu_partial(c, p) - -#define slub_percpu_partial_read_once(c) NULL -#endif // CONFIG_SLUB_CPU_PARTIAL - -/* - * Word size structure that can be atomically updated or read and that - * contains both the order and the number of objects that a slab of the - * given order would contain. - */ -struct kmem_cache_order_objects { - unsigned int x; -}; - -/* - * Slab cache management. - */ -struct kmem_cache { -#ifndef CONFIG_SLUB_TINY - struct kmem_cache_cpu __percpu *cpu_slab; -#endif - /* Used for retrieving partial slabs, etc. */ - slab_flags_t flags; - unsigned long min_partial; - unsigned int size; /* The size of an object including metadata */ - unsigned int object_size;/* The size of an object without metadata */ - struct reciprocal_value reciprocal_size; - unsigned int offset; /* Free pointer offset */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - /* Number of per cpu partial objects to keep around */ - unsigned int cpu_partial; - /* Number of per cpu partial slabs to keep around */ - unsigned int cpu_partial_slabs; -#endif - struct kmem_cache_order_objects oo; - - /* Allocation and freeing of slabs */ - struct kmem_cache_order_objects min; - gfp_t allocflags; /* gfp flags to use on each alloc */ - int refcount; /* Refcount for slab cache destroy */ - void (*ctor)(void *); - unsigned int inuse; /* Offset to metadata */ - unsigned int align; /* Alignment */ - unsigned int red_left_pad; /* Left redzone padding size */ - const char *name; /* Name (only for display!) */ - struct list_head list; /* List of slab caches */ -#ifdef CONFIG_SYSFS - struct kobject kobj; /* For sysfs */ -#endif -#ifdef CONFIG_SLAB_FREELIST_HARDENED - unsigned long random; -#endif - -#ifdef CONFIG_NUMA - /* - * Defragmentation by allocating from a remote node. - */ - unsigned int remote_node_defrag_ratio; -#endif - -#ifdef CONFIG_SLAB_FREELIST_RANDOM - unsigned int *random_seq; -#endif - -#ifdef CONFIG_KASAN_GENERIC - struct kasan_cache kasan_info; -#endif - -#ifdef CONFIG_HARDENED_USERCOPY - unsigned int useroffset; /* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ -#endif - - struct kmem_cache_node *node[MAX_NUMNODES]; -}; - -#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) -#define SLAB_SUPPORTS_SYSFS -void sysfs_slab_unlink(struct kmem_cache *); -void sysfs_slab_release(struct kmem_cache *); -#else -static inline void sysfs_slab_unlink(struct kmem_cache *s) -{ -} -static inline void sysfs_slab_release(struct kmem_cache *s) -{ -} -#endif - -void *fixup_red_left(struct kmem_cache *s, void *p); - -static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, - void *x) { - void *object = x - (x - slab_address(slab)) % cache->size; - void *last_object = slab_address(slab) + - (slab->objects - 1) * cache->size; - void *result = (unlikely(object > last_object)) ? last_object : object; - - result = fixup_red_left(cache, result); - return result; -} - -/* Determine object index from a given position */ -static inline unsigned int __obj_to_index(const struct kmem_cache *cache, - void *addr, void *obj) -{ - return reciprocal_divide(kasan_reset_tag(obj) - addr, - cache->reciprocal_size); -} - -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) -{ - if (is_kfence_address(obj)) - return 0; - return __obj_to_index(cache, slab_address(slab), obj); -} - -static inline int objs_per_slab(const struct kmem_cache *cache, - const struct slab *slab) -{ - return slab->objects; -} -#endif /* _LINUX_SLUB_DEF_H */ diff --git a/mm/slab.h b/mm/slab.h index 014c36ea51fa..3a8d13c099fa 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -209,7 +209,143 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#include +#include +#include +#include +#include + +#ifdef CONFIG_SLUB_CPU_PARTIAL +#define slub_percpu_partial(c) ((c)->partial) + +#define slub_set_percpu_partial(c, p) \ +({ \ + slub_percpu_partial(c) = (p)->next; \ +}) + +#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) +#else +#define slub_percpu_partial(c) NULL + +#define slub_set_percpu_partial(c, p) + +#define slub_percpu_partial_read_once(c) NULL +#endif // CONFIG_SLUB_CPU_PARTIAL + +/* + * Word size structure that can be atomically updated or read and that + * contains both the order and the number of objects that a slab of the + * given order would contain. + */ +struct kmem_cache_order_objects { + unsigned int x; +}; + +/* + * Slab cache management. + */ +struct kmem_cache { +#ifndef CONFIG_SLUB_TINY + struct kmem_cache_cpu __percpu *cpu_slab; +#endif + /* Used for retrieving partial slabs, etc. */ + slab_flags_t flags; + unsigned long min_partial; + unsigned int size; /* Object size including metadata */ + unsigned int object_size; /* Object size without metadata */ + struct reciprocal_value reciprocal_size; + unsigned int offset; /* Free pointer offset */ +#ifdef CONFIG_SLUB_CPU_PARTIAL + /* Number of per cpu partial objects to keep around */ + unsigned int cpu_partial; + /* Number of per cpu partial slabs to keep around */ + unsigned int cpu_partial_slabs; +#endif + struct kmem_cache_order_objects oo; + + /* Allocation and freeing of slabs */ + struct kmem_cache_order_objects min; + gfp_t allocflags; /* gfp flags to use on each alloc */ + int refcount; /* Refcount for slab cache destroy */ + void (*ctor)(void *object); /* Object constructor */ + unsigned int inuse; /* Offset to metadata */ + unsigned int align; /* Alignment */ + unsigned int red_left_pad; /* Left redzone padding size */ + const char *name; /* Name (only for display!) */ + struct list_head list; /* List of slab caches */ +#ifdef CONFIG_SYSFS + struct kobject kobj; /* For sysfs */ +#endif +#ifdef CONFIG_SLAB_FREELIST_HARDENED + unsigned long random; +#endif + +#ifdef CONFIG_NUMA + /* + * Defragmentation by allocating from a remote node. + */ + unsigned int remote_node_defrag_ratio; +#endif + +#ifdef CONFIG_SLAB_FREELIST_RANDOM + unsigned int *random_seq; +#endif + +#ifdef CONFIG_KASAN_GENERIC + struct kasan_cache kasan_info; +#endif + +#ifdef CONFIG_HARDENED_USERCOPY + unsigned int useroffset; /* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ +#endif + + struct kmem_cache_node *node[MAX_NUMNODES]; +}; + +#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) +#define SLAB_SUPPORTS_SYSFS +void sysfs_slab_unlink(struct kmem_cache *s); +void sysfs_slab_release(struct kmem_cache *s); +#else +static inline void sysfs_slab_unlink(struct kmem_cache *s) { } +static inline void sysfs_slab_release(struct kmem_cache *s) { } +#endif + +void *fixup_red_left(struct kmem_cache *s, void *p); + +static inline void *nearest_obj(struct kmem_cache *cache, + const struct slab *slab, void *x) +{ + void *object = x - (x - slab_address(slab)) % cache->size; + void *last_object = slab_address(slab) + + (slab->objects - 1) * cache->size; + void *result = (unlikely(object > last_object)) ? last_object : object; + + result = fixup_red_left(cache, result); + return result; +} + +/* Determine object index from a given position */ +static inline unsigned int __obj_to_index(const struct kmem_cache *cache, + void *addr, void *obj) +{ + return reciprocal_divide(kasan_reset_tag(obj) - addr, + cache->reciprocal_size); +} + +static inline unsigned int obj_to_index(const struct kmem_cache *cache, + const struct slab *slab, void *obj) +{ + if (is_kfence_address(obj)) + return 0; + return __obj_to_index(cache, slab_address(slab), obj); +} + +static inline int objs_per_slab(const struct kmem_cache *cache, + const struct slab *slab) +{ + return slab->objects; +} #include #include From 89c2d061bfa7fe2b5bcb1393a7a79bb5db8d4140 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 11:15:54 +0200 Subject: [PATCH 21/34] mm/slab: consolidate includes in the internal mm/slab.h The #include's are scattered at several places of the file, but it does not seem this is needed to prevent any include loops (anymore?) so consolidate them at the top. Also move the misplaced kmem_cache_init() declaration away from the top. Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 3a8d13c099fa..1ac3a2f8d4c0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -1,10 +1,22 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef MM_SLAB_H #define MM_SLAB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + /* * Internal slab definitions */ -void __init kmem_cache_init(void); #ifdef CONFIG_64BIT # ifdef system_has_cmpxchg128 @@ -209,11 +221,6 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#include -#include -#include -#include - #ifdef CONFIG_SLUB_CPU_PARTIAL #define slub_percpu_partial(c) ((c)->partial) @@ -347,14 +354,6 @@ static inline int objs_per_slab(const struct kmem_cache *cache, return slab->objects; } -#include -#include -#include -#include -#include -#include -#include - /* * State of the slab allocator. * @@ -405,6 +404,7 @@ gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); +void __init kmem_cache_init(void); void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, From 6011be59910fb12b757f9d37793d21763268b4a1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 11:57:45 +0200 Subject: [PATCH 22/34] mm/slab: move pre/post-alloc hooks from slab.h to slub.c We don't share the hooks between two slab implementations anymore so they can be moved away from the header. As part of the move, also move should_failslab() from slab_common.c as the pre_alloc hook uses it. This means slab.h can stop including fault-inject.h and kmemleak.h. Fix up some files that were depending on the includes transitively. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/kasan/report.c | 1 + mm/memcontrol.c | 1 + mm/slab.h | 72 ----------------------------------------- mm/slab_common.c | 8 +---- mm/slub.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 79 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index e77facb62900..011f727bfaff 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 947fb50eba31..8a0603517065 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -64,6 +64,7 @@ #include #include #include +#include #include "internal.h" #include #include diff --git a/mm/slab.h b/mm/slab.h index 1ac3a2f8d4c0..65ebf86b3fe9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -9,8 +9,6 @@ #include #include #include -#include -#include #include #include @@ -796,76 +794,6 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t size, gfp_t flags) -{ - flags &= gfp_allowed_mask; - - might_alloc(flags); - - if (should_failslab(s, flags)) - return NULL; - - if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) - return NULL; - - return s; -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, gfp_t flags, - size_t size, void **p, bool init, - unsigned int orig_size) -{ - unsigned int zero_size = s->object_size; - bool kasan_init = init; - size_t i; - - flags &= gfp_allowed_mask; - - /* - * For kmalloc object, the allocated memory size(object_size) is likely - * larger than the requested size(orig_size). If redzone check is - * enabled for the extra space, don't zero it, as it will be redzoned - * soon. The redzone operation for this extra space could be seen as a - * replacement of current poisoning under certain debug option, and - * won't break other sanity checks. - */ - if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && - (s->flags & SLAB_KMALLOC)) - zero_size = orig_size; - - /* - * When slub_debug is enabled, avoid memory initialization integrated - * into KASAN and instead zero out the memory via the memset below with - * the proper size. Otherwise, KASAN might overwrite SLUB redzones and - * cause false-positive reports. This does not lead to a performance - * penalty on production builds, as slub_debug is not intended to be - * enabled there. - */ - if (__slub_debug_enabled()) - kasan_init = false; - - /* - * As memory initialization might be integrated into KASAN, - * kasan_slab_alloc and initialization memset must be - * kept together to avoid discrepancies in behavior. - * - * As p[i] might get tagged, memset and kmemleak hook come after KASAN. - */ - for (i = 0; i < size; i++) { - p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init); - if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) - memset(p[i], 0, zero_size); - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, flags); - kmsan_slab_alloc(s, p[i], flags); - } - - memcg_slab_post_alloc_hook(s, objcg, flags, size, p); -} /* * The slab lists for all objects. diff --git a/mm/slab_common.c b/mm/slab_common.c index 63b8411db7ce..bbc2e3f061f1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1470,10 +1471,3 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); -int should_failslab(struct kmem_cache *s, gfp_t gfpflags) -{ - if (__should_failslab(s, gfpflags)) - return -ENOMEM; - return 0; -} -ALLOW_ERROR_INJECTION(should_failslab, ERRNO); diff --git a/mm/slub.c b/mm/slub.c index 979932d046fd..9eb6508152c2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -3494,6 +3495,86 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, 0, sizeof(void *)); } +noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) +{ + if (__should_failslab(s, gfpflags)) + return -ENOMEM; + return 0; +} +ALLOW_ERROR_INJECTION(should_failslab, ERRNO); + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t size, gfp_t flags) +{ + flags &= gfp_allowed_mask; + + might_alloc(flags); + + if (should_failslab(s, flags)) + return NULL; + + if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) + return NULL; + + return s; +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, gfp_t flags, + size_t size, void **p, bool init, + unsigned int orig_size) +{ + unsigned int zero_size = s->object_size; + bool kasan_init = init; + size_t i; + + flags &= gfp_allowed_mask; + + /* + * For kmalloc object, the allocated memory size(object_size) is likely + * larger than the requested size(orig_size). If redzone check is + * enabled for the extra space, don't zero it, as it will be redzoned + * soon. The redzone operation for this extra space could be seen as a + * replacement of current poisoning under certain debug option, and + * won't break other sanity checks. + */ + if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && + (s->flags & SLAB_KMALLOC)) + zero_size = orig_size; + + /* + * When slub_debug is enabled, avoid memory initialization integrated + * into KASAN and instead zero out the memory via the memset below with + * the proper size. Otherwise, KASAN might overwrite SLUB redzones and + * cause false-positive reports. This does not lead to a performance + * penalty on production builds, as slub_debug is not intended to be + * enabled there. + */ + if (__slub_debug_enabled()) + kasan_init = false; + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_alloc and initialization memset must be + * kept together to avoid discrepancies in behavior. + * + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. + */ + for (i = 0; i < size; i++) { + p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init); + if (p[i] && init && (!kasan_init || + !kasan_has_integrated_init())) + memset(p[i], 0, zero_size); + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, flags); + kmsan_slab_alloc(s, p[i], flags); + } + + memcg_slab_post_alloc_hook(s, objcg, flags, size, p); +} + /* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call From 0bedcc66d2a43a50ab660273842f4737a293dd8a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 14:52:47 +0200 Subject: [PATCH 23/34] mm/slab: move memcg related functions from slab.h to slub.c We don't share those between SLAB and SLUB anymore, so most memcg related functions can be moved to slub.c proper. Reviewed-by: Kees Cook Acked-by: Michal Hocko Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 206 ------------------------------------------------------ mm/slub.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+), 206 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 65ebf86b3fe9..a81ef7c9282d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -486,12 +486,6 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); -static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) -{ - return (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; -} - #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); @@ -551,220 +545,20 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab); void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr); - -static inline void memcg_free_slab_cgroups(struct slab *slab) -{ - kfree(slab_objcgs(slab)); - slab->memcg_data = 0; -} - -static inline size_t obj_full_size(struct kmem_cache *s) -{ - /* - * For each accounted object there is an extra space which is used - * to store obj_cgroup membership. Charge it too. - */ - return s->size + sizeof(struct obj_cgroup *); -} - -/* - * Returns false if the allocation should fail. - */ -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) -{ - struct obj_cgroup *objcg; - - if (!memcg_kmem_online()) - return true; - - if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) - return true; - - /* - * The obtained objcg pointer is safe to use within the current scope, - * defined by current task or set_active_memcg() pair. - * obj_cgroup_get() is used to get a permanent reference. - */ - objcg = current_obj_cgroup(); - if (!objcg) - return true; - - if (lru) { - int ret; - struct mem_cgroup *memcg; - - memcg = get_mem_cgroup_from_objcg(objcg); - ret = memcg_list_lru_alloc(memcg, lru, flags); - css_put(&memcg->css); - - if (ret) - return false; - } - - if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) - return false; - - *objcgp = objcg; - return true; -} - -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, - void **p) -{ - struct slab *slab; - unsigned long off; - size_t i; - - if (!memcg_kmem_online() || !objcg) - return; - - for (i = 0; i < size; i++) { - if (likely(p[i])) { - slab = virt_to_slab(p[i]); - - if (!slab_objcgs(slab) && - memcg_alloc_slab_cgroups(slab, s, flags, - false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - continue; - } - - off = obj_to_index(s, slab, p[i]); - obj_cgroup_get(objcg); - slab_objcgs(slab)[off] = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); - } else { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - } - } -} - -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects) -{ - struct obj_cgroup **objcgs; - int i; - - if (!memcg_kmem_online()) - return; - - objcgs = slab_objcgs(slab); - if (!objcgs) - return; - - for (i = 0; i < objects; i++) { - struct obj_cgroup *objcg; - unsigned int off; - - off = obj_to_index(s, slab, p[i]); - objcg = objcgs[off]; - if (!objcg) - continue; - - objcgs[off] = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); - obj_cgroup_put(objcg); - } -} - #else /* CONFIG_MEMCG_KMEM */ static inline struct obj_cgroup **slab_objcgs(struct slab *slab) { return NULL; } -static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) -{ - return NULL; -} - static inline int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { return 0; } - -static inline void memcg_free_slab_cgroups(struct slab *slab) -{ -} - -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) -{ - return true; -} - -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, - void **p) -{ -} - -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects) -{ -} #endif /* CONFIG_MEMCG_KMEM */ -static inline struct kmem_cache *virt_to_cache(const void *obj) -{ - struct slab *slab; - - slab = virt_to_slab(obj); - if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", - __func__)) - return NULL; - return slab->slab_cache; -} - -static __always_inline void account_slab(struct slab *slab, int order, - struct kmem_cache *s, gfp_t gfp) -{ - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) - memcg_alloc_slab_cgroups(slab, s, gfp, true); - - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), - PAGE_SIZE << order); -} - -static __always_inline void unaccount_slab(struct slab *slab, int order, - struct kmem_cache *s) -{ - if (memcg_kmem_online()) - memcg_free_slab_cgroups(slab); - - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), - -(PAGE_SIZE << order)); -} - -static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) -{ - struct kmem_cache *cachep; - - if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && - !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) - return s; - - cachep = virt_to_cache(x); - if (WARN(cachep && cachep != s, - "%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name)) - print_tracking(cachep, x); - return cachep; -} - void free_large_kmalloc(struct folio *folio, void *object); size_t __ksize(const void *objp); diff --git a/mm/slub.c b/mm/slub.c index 9eb6508152c2..844e0beb84ee 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1814,6 +1814,165 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, #endif #endif /* CONFIG_SLUB_DEBUG */ +static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) +{ + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; +} + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_free_slab_cgroups(struct slab *slab) +{ + kfree(slab_objcgs(slab)); + slab->memcg_data = 0; +} + +static inline size_t obj_full_size(struct kmem_cache *s) +{ + /* + * For each accounted object there is an extra space which is used + * to store obj_cgroup membership. Charge it too. + */ + return s->size + sizeof(struct obj_cgroup *); +} + +/* + * Returns false if the allocation should fail. + */ +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + struct obj_cgroup *objcg; + + if (!memcg_kmem_online()) + return true; + + if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) + return true; + + /* + * The obtained objcg pointer is safe to use within the current scope, + * defined by current task or set_active_memcg() pair. + * obj_cgroup_get() is used to get a permanent reference. + */ + objcg = current_obj_cgroup(); + if (!objcg) + return true; + + if (lru) { + int ret; + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + ret = memcg_list_lru_alloc(memcg, lru, flags); + css_put(&memcg->css); + + if (ret) + return false; + } + + if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) + return false; + + *objcgp = objcg; + return true; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ + struct slab *slab; + unsigned long off; + size_t i; + + if (!memcg_kmem_online() || !objcg) + return; + + for (i = 0; i < size; i++) { + if (likely(p[i])) { + slab = virt_to_slab(p[i]); + + if (!slab_objcgs(slab) && + memcg_alloc_slab_cgroups(slab, s, flags, false)) { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + continue; + } + + off = obj_to_index(s, slab, p[i]); + obj_cgroup_get(objcg); + slab_objcgs(slab)[off] = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), + cache_vmstat_idx(s), obj_full_size(s)); + } else { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + } + } +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ + struct obj_cgroup **objcgs; + int i; + + if (!memcg_kmem_online()) + return; + + objcgs = slab_objcgs(slab); + if (!objcgs) + return; + + for (i = 0; i < objects; i++) { + struct obj_cgroup *objcg; + unsigned int off; + + off = obj_to_index(s, slab, p[i]); + objcg = objcgs[off]; + if (!objcg) + continue; + + objcgs[off] = NULL; + obj_cgroup_uncharge(objcg, obj_full_size(s)); + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), + -obj_full_size(s)); + obj_cgroup_put(objcg); + } +} +#else /* CONFIG_MEMCG_KMEM */ +static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) +{ + return NULL; +} + +static inline void memcg_free_slab_cgroups(struct slab *slab) +{ +} + +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + return true; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. @@ -2048,6 +2207,26 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ +static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) +{ + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) + memcg_alloc_slab_cgroups(slab, s, gfp, true); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + PAGE_SIZE << order); +} + +static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) +{ + if (memcg_kmem_online()) + memcg_free_slab_cgroups(slab); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + -(PAGE_SIZE << order)); +} + static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct slab *slab; @@ -3965,6 +4144,32 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct slab *slab; + + slab = virt_to_slab(obj); + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) + return NULL; + return slab->slab_cache; +} + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + if (WARN(cachep && cachep != s, + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name)) + print_tracking(cachep, x); + return cachep; +} + void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) { slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); From b52ef56e9b324b172053b03d8c775ef4708fbc23 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 14:57:49 +0200 Subject: [PATCH 24/34] mm/slab: move struct kmem_cache_node from slab.h to slub.c The declaration and associated helpers are not used anywhere else anymore. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 29 ----------------------------- mm/slub.c | 27 +++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index a81ef7c9282d..5ae6a978e9c2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -588,35 +588,6 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } - -/* - * The slab lists for all objects. - */ -struct kmem_cache_node { - spinlock_t list_lock; - unsigned long nr_partial; - struct list_head partial; -#ifdef CONFIG_SLUB_DEBUG - atomic_long_t nr_slabs; - atomic_long_t total_objects; - struct list_head full; -#endif -}; - -static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) -{ - return s->node[node]; -} - -/* - * Iterator over all nodes. The body will be executed for each node that has - * a kmem_cache_node structure allocated (which is true for all online nodes) - */ -#define for_each_kmem_cache_node(__s, __node, __n) \ - for (__node = 0; __node < nr_node_ids; __node++) \ - if ((__n = get_node(__s, __node))) - - #ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else diff --git a/mm/slub.c b/mm/slub.c index 844e0beb84ee..cc801f8258fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -396,6 +396,33 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) #endif } +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + /* * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily From b774d3e326d30fc8ef841101c399e44bdac2aa48 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 15:27:11 +0200 Subject: [PATCH 25/34] mm/slab: move kfree() from slab_common.c to slub.c This should result in better code. Currently kfree() makes a function call between compilation units to __kmem_cache_free() which does its own virt_to_slab(), throwing away the struct slab pointer we already had in kfree(). Now it can be reused. Additionally kfree() can now inline the whole SLUB freeing fastpath. Also move over free_large_kmalloc() as the only callsites are now in slub.c, and make it static. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 4 ---- mm/slab_common.c | 45 ------------------------------------------ mm/slub.c | 51 +++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 46 insertions(+), 54 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 5ae6a978e9c2..35a55c4a407d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -395,8 +395,6 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller); void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, size_t orig_size, unsigned long caller); -void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); - gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ @@ -559,8 +557,6 @@ static inline int memcg_alloc_slab_cgroups(struct slab *slab, } #endif /* CONFIG_MEMCG_KMEM */ -void free_large_kmalloc(struct folio *folio, void *object); - size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index bbc2e3f061f1..f4f275613d2a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -963,22 +963,6 @@ void __init create_kmalloc_caches(slab_flags_t flags) slab_state = UP; } -void free_large_kmalloc(struct folio *folio, void *object) -{ - unsigned int order = folio_order(folio); - - if (WARN_ON_ONCE(order == 0)) - pr_warn_once("object pointer: 0x%p\n", object); - - kmemleak_free(object); - kasan_kfree_large(object); - kmsan_kfree_large(object); - - mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(folio_page(folio, 0), order); -} - static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); static __always_inline void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) @@ -1023,35 +1007,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, } EXPORT_SYMBOL(__kmalloc_node_track_caller); -/** - * kfree - free previously allocated memory - * @object: pointer returned by kmalloc() or kmem_cache_alloc() - * - * If @object is NULL, no operation is performed. - */ -void kfree(const void *object) -{ - struct folio *folio; - struct slab *slab; - struct kmem_cache *s; - - trace_kfree(_RET_IP_, object); - - if (unlikely(ZERO_OR_NULL_PTR(object))) - return; - - folio = virt_to_folio(object); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, (void *)object); - return; - } - - slab = folio_slab(folio); - s = slab->slab_cache; - __kmem_cache_free(s, (void *)object, _RET_IP_); -} -EXPORT_SYMBOL(kfree); - /** * __ksize -- Report full size of underlying allocation * @object: pointer to the object diff --git a/mm/slub.c b/mm/slub.c index cc801f8258fe..2baa9e94d9df 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4197,11 +4197,6 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; } -void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) -{ - slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); -} - /** * kmem_cache_free - Deallocate an object * @s: The cache the allocation was from. @@ -4220,6 +4215,52 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +static void free_large_kmalloc(struct folio *folio, void *object) +{ + unsigned int order = folio_order(folio); + + if (WARN_ON_ONCE(order == 0)) + pr_warn_once("object pointer: 0x%p\n", object); + + kmemleak_free(object); + kasan_kfree_large(object); + kmsan_kfree_large(object); + + mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(folio_page(folio, 0), order); +} + +/** + * kfree - free previously allocated memory + * @object: pointer returned by kmalloc() or kmem_cache_alloc() + * + * If @object is NULL, no operation is performed. + */ +void kfree(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + trace_kfree(_RET_IP_, object); + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, (void *)object); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_free(s, slab, x, NULL, &x, 1, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + struct detached_freelist { struct slab *slab; void *tail; From 5a9d31d980cbc9cefcee18e186bd4c5d51f3cba2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 13 Nov 2023 12:02:02 +0100 Subject: [PATCH 26/34] mm/slab: move kmalloc_slab() to mm/slab.h In preparation for the next patch, move the kmalloc_slab() function to the header, as it will have callers from two files, and make it inline. To avoid unnecessary bloat, remove all size checks/warnings from kmalloc_slab() as they just duplicate those in callers, especially after recent changes to kmalloc_size_roundup(). We just need to adjust handling of zero size in __do_kmalloc_node(). Also we can stop handling NULL result from kmalloc_slab() there as that now cannot happen (unless called too early during boot). The size_index array becomes visible so rename it to a more specific kmalloc_size_index. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 28 ++++++++++++++++++++++++++-- mm/slab_common.c | 43 ++++++++----------------------------------- 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 35a55c4a407d..7d7cc7af614e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -389,8 +389,32 @@ extern const struct kmalloc_info_struct { void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(slab_flags_t); -/* Find the kmalloc slab corresponding for a certain size */ -struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller); +extern u8 kmalloc_size_index[24]; + +static inline unsigned int size_index_elem(unsigned int bytes) +{ + return (bytes - 1) / 8; +} + +/* + * Find the kmem_cache structure that serves a given size of + * allocation + * + * This assumes size is larger than zero and not larger than + * KMALLOC_MAX_CACHE_SIZE and the caller must check that. + */ +static inline struct kmem_cache * +kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) +{ + unsigned int index; + + if (size <= 192) + index = kmalloc_size_index[size_index_elem(size)]; + else + index = fls(size - 1); + + return kmalloc_caches[kmalloc_type(flags, caller)][index]; +} void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, size_t orig_size, diff --git a/mm/slab_common.c b/mm/slab_common.c index f4f275613d2a..31ade17a7ad9 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -665,7 +665,7 @@ EXPORT_SYMBOL(random_kmalloc_seed); * of two cache sizes there. The size of larger slabs can be determined using * fls. */ -static u8 size_index[24] __ro_after_init = { +u8 kmalloc_size_index[24] __ro_after_init = { 3, /* 8 */ 4, /* 16 */ 5, /* 24 */ @@ -692,33 +692,6 @@ static u8 size_index[24] __ro_after_init = { 2 /* 192 */ }; -static inline unsigned int size_index_elem(unsigned int bytes) -{ - return (bytes - 1) / 8; -} - -/* - * Find the kmem_cache structure that serves a given size of - * allocation - */ -struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) -{ - unsigned int index; - - if (size <= 192) { - if (!size) - return ZERO_SIZE_PTR; - - index = size_index[size_index_elem(size)]; - } else { - if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - index = fls(size - 1); - } - - return kmalloc_caches[kmalloc_type(flags, caller)][index]; -} - size_t kmalloc_size_roundup(size_t size) { if (size && size <= KMALLOC_MAX_CACHE_SIZE) { @@ -843,9 +816,9 @@ void __init setup_kmalloc_cache_index_table(void) for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { unsigned int elem = size_index_elem(i); - if (elem >= ARRAY_SIZE(size_index)) + if (elem >= ARRAY_SIZE(kmalloc_size_index)) break; - size_index[elem] = KMALLOC_SHIFT_LOW; + kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW; } if (KMALLOC_MIN_SIZE >= 64) { @@ -854,7 +827,7 @@ void __init setup_kmalloc_cache_index_table(void) * is 64 byte. */ for (i = 64 + 8; i <= 96; i += 8) - size_index[size_index_elem(i)] = 7; + kmalloc_size_index[size_index_elem(i)] = 7; } @@ -865,7 +838,7 @@ void __init setup_kmalloc_cache_index_table(void) * instead. */ for (i = 128 + 8; i <= 192; i += 8) - size_index[size_index_elem(i)] = 8; + kmalloc_size_index[size_index_elem(i)] = 8; } } @@ -977,10 +950,10 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller return ret; } - s = kmalloc_slab(size, flags, caller); + if (unlikely(!size)) + return ZERO_SIZE_PTR; - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; + s = kmalloc_slab(size, flags, caller); ret = __kmem_cache_alloc_node(s, flags, node, size, caller); ret = kasan_kmalloc(s, ret, size, flags); From 4862caa5cba027bf7de925e05e4d1a64c89d81d6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 16:57:59 +0200 Subject: [PATCH 27/34] mm/slab: move kmalloc() functions from slab_common.c to slub.c This will eliminate a call between compilation units through __kmem_cache_alloc_node() and allow better inlining of the allocation fast path. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 3 -- mm/slab_common.c | 119 -------------------------------------------- mm/slub.c | 126 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 118 insertions(+), 130 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 7d7cc7af614e..54deeb0428c6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -416,9 +416,6 @@ kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) return kmalloc_caches[kmalloc_type(flags, caller)][index]; } -void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t orig_size, - unsigned long caller); gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 31ade17a7ad9..238293b1dbe1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -936,50 +936,6 @@ void __init create_kmalloc_caches(slab_flags_t flags) slab_state = UP; } -static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); -static __always_inline -void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = __kmalloc_large_node(size, flags, node); - trace_kmalloc(caller, ret, size, - PAGE_SIZE << get_order(size), flags, node); - return ret; - } - - if (unlikely(!size)) - return ZERO_SIZE_PTR; - - s = kmalloc_slab(size, flags, caller); - - ret = __kmem_cache_alloc_node(s, flags, node, size, caller); - ret = kasan_kmalloc(s, ret, size, flags); - trace_kmalloc(caller, ret, size, s->size, flags, node); - return ret; -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __do_kmalloc_node(size, flags, node, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc_node); - -void *__kmalloc(size_t size, gfp_t flags) -{ - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc); - -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) -{ - return __do_kmalloc_node(size, flags, node, caller); -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); - /** * __ksize -- Report full size of underlying allocation * @object: pointer to the object @@ -1016,30 +972,6 @@ size_t __ksize(const void *object) return slab_ksize(folio_slab(folio)->slab_cache); } -void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) -{ - void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, - size, _RET_IP_); - - trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; -} -EXPORT_SYMBOL(kmalloc_trace); - -void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) -{ - void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); - - trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; -} -EXPORT_SYMBOL(kmalloc_node_trace); - gfp_t kmalloc_fix_flags(gfp_t flags) { gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; @@ -1052,57 +984,6 @@ gfp_t kmalloc_fix_flags(gfp_t flags) return flags; } -/* - * To avoid unnecessary overhead, we pass through large allocation requests - * directly to the page allocator. We use __GFP_COMP, because we will need to - * know the allocation order to free the pages properly in kfree. - */ - -static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - struct page *page; - void *ptr = NULL; - unsigned int order = get_order(size); - - if (unlikely(flags & GFP_SLAB_BUG_MASK)) - flags = kmalloc_fix_flags(flags); - - flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, order); - if (page) { - ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); - } - - ptr = kasan_kmalloc_large(ptr, size, flags); - /* As ptr might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ptr, size, 1, flags); - kmsan_kmalloc_large(ptr, size, flags); - - return ptr; -} - -void *kmalloc_large(size_t size, gfp_t flags) -{ - void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); - - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), - flags, NUMA_NO_NODE); - return ret; -} -EXPORT_SYMBOL(kmalloc_large); - -void *kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - void *ret = __kmalloc_large_node(size, flags, node); - - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), - flags, node); - return ret; -} -EXPORT_SYMBOL(kmalloc_large_node); - #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ static void freelist_randomize(unsigned int *list, diff --git a/mm/slub.c b/mm/slub.c index 2baa9e94d9df..d6bc15929d22 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3851,14 +3851,6 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru); -void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t orig_size, - unsigned long caller) -{ - return slab_alloc_node(s, NULL, gfpflags, node, - caller, orig_size); -} - /** * kmem_cache_alloc_node - Allocate an object on the specified node * @s: The cache to allocate from. @@ -3882,6 +3874,124 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) } EXPORT_SYMBOL(kmem_cache_alloc_node); +/* + * To avoid unnecessary overhead, we pass through large allocation requests + * directly to the page allocator. We use __GFP_COMP, because we will need to + * know the allocation order to free the pages properly in kfree. + */ +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + struct page *page; + void *ptr = NULL; + unsigned int order = get_order(size); + + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + + flags |= __GFP_COMP; + page = alloc_pages_node(node, flags, order); + if (page) { + ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); + } + + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc(ptr, size, 1, flags); + kmsan_kmalloc_large(ptr, size, flags); + + return ptr; +} + +void *kmalloc_large(size_t size, gfp_t flags) +{ + void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, NUMA_NO_NODE); + return ret; +} +EXPORT_SYMBOL(kmalloc_large); + +void *kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + void *ret = __kmalloc_large_node(size, flags, node); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, node); + return ret; +} +EXPORT_SYMBOL(kmalloc_large_node); + +static __always_inline +void *__do_kmalloc_node(size_t size, gfp_t flags, int node, + unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = __kmalloc_large_node(size, flags, node); + trace_kmalloc(caller, ret, size, + PAGE_SIZE << get_order(size), flags, node); + return ret; + } + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + s = kmalloc_slab(size, flags, caller); + + ret = slab_alloc_node(s, NULL, flags, node, caller, size); + ret = kasan_kmalloc(s, ret, size, flags); + trace_kmalloc(caller, ret, size, s->size, flags, node); + return ret; +} + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); + +void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, + _RET_IP_, size); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_trace); + +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) +{ + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_node_trace); + static noinline void free_to_partial_list( struct kmem_cache *s, struct slab *slab, void *head, void *tail, int bulk_cnt, From 49378a05ce7f01a203550eb7c2ef772f6d24565c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 26 Oct 2023 17:45:42 +0200 Subject: [PATCH 28/34] mm/slub: remove slab_alloc() and __kmem_cache_alloc_lru() wrappers slab_alloc() is a thin wrapper around slab_alloc_node() with only one caller. Replace with direct call of slab_alloc_node(). __kmem_cache_alloc_lru() itself is a thin wrapper with two callers, so replace it with direct calls of slab_alloc_node() and trace_kmem_cache_alloc(). This also makes sure _RET_IP_ has always the expected value and not depending on inlining decisions. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d6bc15929d22..5683f1d02e4f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3821,33 +3821,26 @@ out: return object; } -static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, - gfp_t gfpflags, unsigned long addr, size_t orig_size) +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); -} - -static __fastpath_inline -void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, - gfp_t gfpflags) -{ - void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, + s->object_size); trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); return ret; } - -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) -{ - return __kmem_cache_alloc_lru(s, NULL, gfpflags); -} EXPORT_SYMBOL(kmem_cache_alloc); void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { - return __kmem_cache_alloc_lru(s, lru, gfpflags); + void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_, + s->object_size); + + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_lru); From 3450a0e5a6fc4cdbd70853f12c0c332dd24c1349 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 13 Nov 2023 18:04:05 +0100 Subject: [PATCH 29/34] mm/slub: optimize alloc fastpath code layout With allocation fastpaths no longer divided between two .c files, we have better inlining, however checking the disassembly of kmem_cache_alloc() reveals we can do better to make the fastpaths smaller and move the less common situations out of line or to separate functions, to reduce instruction cache pressure. - split memcg pre/post alloc hooks to inlined checks that use likely() to assume there will be no objcg handling necessary, and non-inline functions doing the actual handling - add some more likely/unlikely() to pre/post alloc hooks to indicate which scenarios should be out of line - change gfp_allowed_mask handling in slab_post_alloc_hook() so the code can be optimized away when kasan/kmsan/kmemleak is configured out bloat-o-meter shows: add/remove: 4/2 grow/shrink: 1/8 up/down: 521/-2924 (-2403) Function old new delta __memcg_slab_post_alloc_hook - 461 +461 kmem_cache_alloc_bulk 775 791 +16 __pfx_should_failslab.constprop - 16 +16 __pfx___memcg_slab_post_alloc_hook - 16 +16 should_failslab.constprop - 12 +12 __pfx_memcg_slab_post_alloc_hook 16 - -16 kmem_cache_alloc_lru 1295 1023 -272 kmem_cache_alloc_node 1118 817 -301 kmem_cache_alloc 1076 772 -304 kmalloc_node_trace 1149 838 -311 kmalloc_trace 1102 789 -313 __kmalloc_node_track_caller 1393 1080 -313 __kmalloc_node 1397 1082 -315 __kmalloc 1374 1059 -315 memcg_slab_post_alloc_hook 464 - -464 Note that gcc still decided to inline __memcg_pre_alloc_hook(), but the code is out of line. Forcing noinline did not improve the results. As a result the fastpaths are shorter and overal code size is reduced. Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 89 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 5683f1d02e4f..77d259f3d592 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1866,25 +1866,17 @@ static inline size_t obj_full_size(struct kmem_cache *s) /* * Returns false if the allocation should fail. */ -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) +static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) { - struct obj_cgroup *objcg; - - if (!memcg_kmem_online()) - return true; - - if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) - return true; - /* * The obtained objcg pointer is safe to use within the current scope, * defined by current task or set_active_memcg() pair. * obj_cgroup_get() is used to get a permanent reference. */ - objcg = current_obj_cgroup(); + struct obj_cgroup *objcg = current_obj_cgroup(); if (!objcg) return true; @@ -1907,17 +1899,34 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, return true; } -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, - void **p) +/* + * Returns false if the allocation should fail. + */ +static __fastpath_inline +bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + struct obj_cgroup **objcgp, size_t objects, + gfp_t flags) +{ + if (!memcg_kmem_online()) + return true; + + if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) + return true; + + return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects, + flags)); +} + +static void __memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) { struct slab *slab; unsigned long off; size_t i; - if (!memcg_kmem_online() || !objcg) - return; + flags &= gfp_allowed_mask; for (i = 0; i < size; i++) { if (likely(p[i])) { @@ -1940,6 +1949,16 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, } } +static __fastpath_inline +void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, + gfp_t flags, size_t size, void **p) +{ + if (likely(!memcg_kmem_online() || !objcg)) + return; + + return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p); +} + static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { @@ -3709,34 +3728,34 @@ noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) } ALLOW_ERROR_INJECTION(should_failslab, ERRNO); -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t size, gfp_t flags) +static __fastpath_inline +struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t size, gfp_t flags) { flags &= gfp_allowed_mask; might_alloc(flags); - if (should_failslab(s, flags)) + if (unlikely(should_failslab(s, flags))) return NULL; - if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) + if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))) return NULL; return s; } -static inline void slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, gfp_t flags, - size_t size, void **p, bool init, - unsigned int orig_size) +static __fastpath_inline +void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, + gfp_t flags, size_t size, void **p, bool init, + unsigned int orig_size) { unsigned int zero_size = s->object_size; bool kasan_init = init; size_t i; - - flags &= gfp_allowed_mask; + gfp_t init_flags = flags & gfp_allowed_mask; /* * For kmalloc object, the allocated memory size(object_size) is likely @@ -3769,13 +3788,13 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, * As p[i] might get tagged, memset and kmemleak hook come after KASAN. */ for (i = 0; i < size; i++) { - p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init); + p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init); if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, flags); - kmsan_slab_alloc(s, p[i], flags); + s->flags, init_flags); + kmsan_slab_alloc(s, p[i], init_flags); } memcg_slab_post_alloc_hook(s, objcg, flags, size, p); @@ -3799,7 +3818,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list bool init = false; s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); - if (!s) + if (unlikely(!s)) return NULL; object = kfence_alloc(s, orig_size, gfpflags); From ecf9a253ce120082ce0a8aff806c4de4865cfcc5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 27 Oct 2023 12:34:18 +0200 Subject: [PATCH 30/34] mm/slub: optimize free fast path code layout Inspection of kmem_cache_free() disassembly showed we could make the fast path smaller by providing few more hints to the compiler, and splitting the memcg_slab_free_hook() into an inline part that only checks if there's work to do, and an out of line part doing the actual uncharge. bloat-o-meter results: add/remove: 2/0 grow/shrink: 0/3 up/down: 286/-554 (-268) Function old new delta __memcg_slab_free_hook - 270 +270 __pfx___memcg_slab_free_hook - 16 +16 kfree 828 665 -163 kmem_cache_free 1116 948 -168 kmem_cache_free_bulk.part 1701 1478 -223 Checking kmem_cache_free() disassembly now shows the non-fastpath cases are handled out of line, which should reduce instruction cache usage. Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 77d259f3d592..3f8b95757106 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1959,20 +1959,11 @@ void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p); } -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects) +static void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects, + struct obj_cgroup **objcgs) { - struct obj_cgroup **objcgs; - int i; - - if (!memcg_kmem_online()) - return; - - objcgs = slab_objcgs(slab); - if (!objcgs) - return; - - for (i = 0; i < objects; i++) { + for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; unsigned int off; @@ -1988,6 +1979,22 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, obj_cgroup_put(objcg); } } + +static __fastpath_inline +void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + struct obj_cgroup **objcgs; + + if (!memcg_kmem_online()) + return; + + objcgs = slab_objcgs(slab); + if (likely(!objcgs)) + return; + + __memcg_slab_free_hook(s, slab, p, objects, objcgs); +} #else /* CONFIG_MEMCG_KMEM */ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { @@ -2047,7 +2054,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, * The initialization memset's clear the object and the metadata, * but don't touch the SLAB redzone. */ - if (init) { + if (unlikely(init)) { int rsize; if (!kasan_has_integrated_init()) @@ -2083,7 +2090,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { + if (likely(!slab_free_hook(s, object, + slab_want_init_on_free(s)))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -4282,7 +4290,7 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. */ - if (slab_free_freelist_hook(s, &head, &tail, &cnt)) + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) do_slab_free(s, slab, head, tail, cnt, addr); } From 6f3dd2c31d7d703a814c59f60daf95c57fa6a4c2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 7 Aug 2023 20:50:44 +0200 Subject: [PATCH 31/34] mm/slub: fix bulk alloc and free stats The SLUB sysfs stats enabled CONFIG_SLUB_STATS have two deficiencies identified wrt bulk alloc/free operations: - Bulk allocations from cpu freelist are not counted. Add the ALLOC_FASTPATH counter there. - Bulk fastpath freeing will count a list of multiple objects with a single FREE_FASTPATH inc. Add a stat_add() variant to count them all. Reviewed-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 3f8b95757106..d7b0ca6012e0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -396,6 +396,14 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) #endif } +static inline +void stat_add(const struct kmem_cache *s, enum stat_item si, int v) +{ +#ifdef CONFIG_SLUB_STATS + raw_cpu_add(s->cpu_slab->stat[si], v); +#endif +} + /* * The slab lists for all objects. */ @@ -4268,7 +4276,7 @@ redo: local_unlock(&s->cpu_slab->lock); } - stat(s, FREE_FASTPATH); + stat_add(s, FREE_FASTPATH, cnt); } #else /* CONFIG_SLUB_TINY */ static void do_slab_free(struct kmem_cache *s, @@ -4545,6 +4553,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, c->freelist = get_freepointer(s, object); p[i] = object; maybe_wipe_obj_freeptr(s, p[i]); + stat(s, ALLOC_FASTPATH); } c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); From 520a688a2edfddba97968bf9e133b9a3d7c78059 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 2 Nov 2023 16:34:39 +0100 Subject: [PATCH 32/34] mm/slub: introduce __kmem_cache_free_bulk() without free hooks Currently, when __kmem_cache_alloc_bulk() fails, it frees back the objects that were allocated before the failure, using kmem_cache_free_bulk(). Because kmem_cache_free_bulk() calls the free hooks (KASAN etc.) and those expect objects that were processed by the post alloc hooks, slab_post_alloc_hook() is called before kmem_cache_free_bulk(). This is wasteful, although not a big concern in practice for the rare error path. But in order to efficiently handle percpu array batch refill and free in the near future, we will also need a variant of kmem_cache_free_bulk() that avoids the free hooks. So introduce it now and use it for the failure path. In case of failure we however still need to perform memcg uncharge so handle that in a new memcg_slab_alloc_error_hook(). Thanks to Chengming Zhou for noticing the missing uncharge. As a consequence, __kmem_cache_alloc_bulk() no longer needs the objcg parameter, remove it. Reviewed-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d7b0ca6012e0..0a9e4bd0dd68 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2003,6 +2003,14 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, __memcg_slab_free_hook(s, slab, p, objects, objcgs); } + +static inline +void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, + struct obj_cgroup *objcg) +{ + if (objcg) + obj_cgroup_uncharge(objcg, objects * obj_full_size(s)); +} #else /* CONFIG_MEMCG_KMEM */ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { @@ -2032,6 +2040,12 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } + +static inline +void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, + struct obj_cgroup *objcg) +{ +} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -4478,6 +4492,27 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, return same; } +/* + * Internal bulk free of objects that were not initialised by the post alloc + * hooks and thus should not be processed by the free hooks + */ +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + if (!size) + return; + + do { + struct detached_freelist df; + + size = build_detached_freelist(s, size, p, &df); + if (!df.slab) + continue; + + do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, + _RET_IP_); + } while (likely(size)); +} + /* Note that interrupts must be enabled when calling this function. */ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { @@ -4498,8 +4533,9 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) EXPORT_SYMBOL(kmem_cache_free_bulk); #ifndef CONFIG_SLUB_TINY -static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p, struct obj_cgroup *objcg) +static inline +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { struct kmem_cache_cpu *c; unsigned long irqflags; @@ -4563,14 +4599,13 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, error: slub_put_cpu_ptr(s->cpu_slab); - slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); - kmem_cache_free_bulk(s, i, p); + __kmem_cache_free_bulk(s, i, p); return 0; } #else /* CONFIG_SLUB_TINY */ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p, struct obj_cgroup *objcg) + size_t size, void **p) { int i; @@ -4593,8 +4628,7 @@ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, return i; error: - slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); - kmem_cache_free_bulk(s, i, p); + __kmem_cache_free_bulk(s, i, p); return 0; } #endif /* CONFIG_SLUB_TINY */ @@ -4614,15 +4648,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; - i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg); + i = __kmem_cache_alloc_bulk(s, flags, size, p); /* * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. */ - if (i != 0) + if (likely(i != 0)) { slab_post_alloc_hook(s, objcg, flags, size, p, slab_want_init_on_alloc(flags, s), s->object_size); + } else { + memcg_slab_alloc_error_hook(s, size, objcg); + } + return i; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); From 284f17ac13fe34ae9eecbe57bb91553374d9b855 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 3 Nov 2023 20:24:51 +0100 Subject: [PATCH 33/34] mm/slub: handle bulk and single object freeing separately Currently we have a single function slab_free() handling both single object freeing and bulk freeing with necessary hooks, the latter case requiring slab_free_freelist_hook(). It should be however better to distinguish the two use cases for the following reasons: - code simpler to follow for the single object case - better code generation - although inlining should eliminate the slab_free_freelist_hook() for single object freeing in case no debugging options are enabled, it seems it's not perfect. When e.g. KASAN is enabled, we're imposing additional unnecessary overhead for single object freeing. - preparation to add percpu array caches in near future Therefore, simplify slab_free() for the single object case by dropping unnecessary parameters and calling only slab_free_hook() instead of slab_free_freelist_hook(). Rename the bulk variant to slab_free_bulk() and adjust callers accordingly. While at it, flip (and document) slab_free_hook() return value so that it returns true when the freeing can proceed, which matches the logic of slab_free_freelist_hook() and is not confusingly the opposite. Additionally we can simplify a bit by changing the tail parameter of do_slab_free() when freeing a single object - instead of NULL we can set it equal to head. bloat-o-meter shows small code reduction with a .config that has KASAN etc disabled: add/remove: 0/0 grow/shrink: 0/4 up/down: 0/-118 (-118) Function old new delta kmem_cache_alloc_bulk 1203 1196 -7 kmem_cache_free 861 835 -26 __kmem_cache_free 741 704 -37 kmem_cache_free_bulk 911 863 -48 Reviewed-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 59 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0a9e4bd0dd68..af8c8fc9e799 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2051,9 +2051,12 @@ void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. + * + * Returns true if freeing of the object can proceed, false if its reuse + * was delayed by KASAN quarantine. */ -static __always_inline bool slab_free_hook(struct kmem_cache *s, - void *x, bool init) +static __always_inline +bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { kmemleak_free_recursive(x, s->flags); kmsan_slab_free(s, x); @@ -2086,7 +2089,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, s->size - s->inuse - rsize); } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return kasan_slab_free(s, x, init); + return !kasan_slab_free(s, x, init); } static inline bool slab_free_freelist_hook(struct kmem_cache *s, @@ -2096,7 +2099,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *object; void *next = *head; - void *old_tail = *tail ? *tail : *head; + void *old_tail = *tail; if (is_kfence_address(next)) { slab_free_hook(s, next, false); @@ -2112,8 +2115,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (likely(!slab_free_hook(s, object, - slab_want_init_on_free(s)))) { + if (likely(slab_free_hook(s, object, + slab_want_init_on_free(s)))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -2128,9 +2131,6 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, } } while (object != old_tail); - if (*head == *tail) - *tail = NULL; - return *head != NULL; } @@ -4241,7 +4241,6 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { - void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; void **freelist; @@ -4260,14 +4259,14 @@ redo: barrier(); if (unlikely(slab != c->slab)) { - __slab_free(s, slab, head, tail_obj, cnt, addr); + __slab_free(s, slab, head, tail, cnt, addr); return; } if (USE_LOCKLESS_FAST_PATH()) { freelist = READ_ONCE(c->freelist); - set_freepointer(s, tail_obj, freelist); + set_freepointer(s, tail, freelist); if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { note_cmpxchg_failure("slab_free", s, tid); @@ -4284,7 +4283,7 @@ redo: tid = c->tid; freelist = c->freelist; - set_freepointer(s, tail_obj, freelist); + set_freepointer(s, tail, freelist); c->freelist = head; c->tid = next_tid(tid); @@ -4297,15 +4296,27 @@ static void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { - void *tail_obj = tail ? : head; - - __slab_free(s, slab, head, tail_obj, cnt, addr); + __slab_free(s, slab, head, tail, cnt, addr); } #endif /* CONFIG_SLUB_TINY */ -static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, - void *head, void *tail, void **p, int cnt, - unsigned long addr) +static __fastpath_inline +void slab_free(struct kmem_cache *s, struct slab *slab, void *object, + unsigned long addr) +{ + bool init; + + memcg_slab_free_hook(s, slab, &object, 1); + + init = !is_kfence_address(object) && slab_want_init_on_free(s); + + if (likely(slab_free_hook(s, object, init))) + do_slab_free(s, slab, object, object, 1, addr); +} + +static __fastpath_inline +void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, + void *tail, void **p, int cnt, unsigned long addr) { memcg_slab_free_hook(s, slab, p, cnt); /* @@ -4319,7 +4330,7 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { - do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr); + do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); } #endif @@ -4363,7 +4374,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) if (!s) return; trace_kmem_cache_free(_RET_IP_, x, s); - slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); + slab_free(s, virt_to_slab(x), x, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); @@ -4409,7 +4420,7 @@ void kfree(const void *object) slab = folio_slab(folio); s = slab->slab_cache; - slab_free(s, slab, x, NULL, &x, 1, _RET_IP_); + slab_free(s, slab, x, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -4526,8 +4537,8 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!df.slab) continue; - slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt, - _RET_IP_); + slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size], + df.cnt, _RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); From 782f8906f8057efc7151b4b98b0a0280a71d005f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 14 Nov 2023 22:12:47 +0100 Subject: [PATCH 34/34] mm/slub: free KFENCE objects in slab_free_hook() When freeing an object that was allocated from KFENCE, we do that in the slowpath __slab_free(), relying on the fact that KFENCE "slab" cannot be the cpu slab, so the fastpath has to fallback to the slowpath. This optimization doesn't help much though, because is_kfence_address() is checked earlier anyway during the free hook processing or detached freelist building. Thus we can simplify the code by making the slab_free_hook() free the KFENCE object immediately, similarly to KASAN quarantine. In slab_free_hook() we can place kfence_free() above init processing, as callers have been making sure to set init to false for KFENCE objects. This simplifies slab_free(). This places it also above kasan_slab_free() which is ok as that skips KFENCE objects anyway. While at it also determine the init value in slab_free_freelist_hook() outside of the loop. This change will also make introducing per cpu array caches easier. Tested-by: Marco Elver Reviewed-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index af8c8fc9e799..ccd57636b739 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2053,7 +2053,7 @@ void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, * production configuration these hooks all should produce no code at all. * * Returns true if freeing of the object can proceed, false if its reuse - * was delayed by KASAN quarantine. + * was delayed by KASAN quarantine, or it was returned to KFENCE. */ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init) @@ -2071,6 +2071,9 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) __kcsan_check_access(x, s->object_size, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); + if (kfence_free(x)) + return false; + /* * As memory initialization might be integrated into KASAN, * kasan_slab_free and initialization memset's must be @@ -2100,23 +2103,25 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *object; void *next = *head; void *old_tail = *tail; + bool init; if (is_kfence_address(next)) { slab_free_hook(s, next, false); - return true; + return false; } /* Head and tail of the reconstructed freelist */ *head = NULL; *tail = NULL; + init = slab_want_init_on_free(s); + do { object = next; next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (likely(slab_free_hook(s, object, - slab_want_init_on_free(s)))) { + if (likely(slab_free_hook(s, object, init))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -4117,9 +4122,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, stat(s, FREE_SLOWPATH); - if (kfence_free(head)) - return; - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { free_to_partial_list(s, slab, head, tail, cnt, addr); return; @@ -4304,13 +4306,9 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) { - bool init; - memcg_slab_free_hook(s, slab, &object, 1); - init = !is_kfence_address(object) && slab_want_init_on_free(s); - - if (likely(slab_free_hook(s, object, init))) + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) do_slab_free(s, slab, object, object, 1, addr); }