From a0ae95040853aa05dc006f4b16f8c82c6f9dd9e4 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 7 Oct 2024 18:49:52 +0200 Subject: [PATCH 01/25] debugobjects: Delete a piece of redundant code The statically allocated objects are all located in obj_static_pool[], the whole memory of obj_static_pool[] will be reclaimed later. Therefore, there is no need to split the remaining statically nodes in list obj_pool into isolated ones, no one will use them anymore. Just write INIT_HLIST_HEAD(&obj_pool) is enough. Since hlist_move_list() directly discards the old list, even this can be omitted. Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240911083521.2257-2-thunder.leizhen@huawei.com Link: https://lore.kernel.org/all/20241007164913.009849239@linutronix.de --- lib/debugobjects.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 5ce473ad499b..df48acc5d4b3 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1325,10 +1325,10 @@ static int __init debug_objects_replace_static_objects(void) * active object references. */ - /* Remove the statically allocated objects from the pool */ - hlist_for_each_entry_safe(obj, tmp, &obj_pool, node) - hlist_del(&obj->node); - /* Move the allocated objects to the pool */ + /* + * Replace the statically allocated objects list with the allocated + * objects list. + */ hlist_move_list(&objects, &obj_pool); /* Replace the active object references */ From 813fd07858cfb410bc9574c05b7922185f65989b Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 7 Oct 2024 18:49:53 +0200 Subject: [PATCH 02/25] debugobjects: Collect newly allocated objects in a list to reduce lock contention Collect the newly allocated debug objects in a list outside the lock, so that the lock held time and the potential lock contention is reduced. Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240911083521.2257-3-thunder.leizhen@huawei.com Link: https://lore.kernel.org/all/20241007164913.073653668@linutronix.de --- lib/debugobjects.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index df48acc5d4b3..798ce5ac1ffe 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -161,23 +161,25 @@ static void fill_pool(void) return; while (READ_ONCE(obj_pool_free) < debug_objects_pool_min_level) { - struct debug_obj *new[ODEBUG_BATCH_SIZE]; + struct debug_obj *new, *last = NULL; + HLIST_HEAD(head); int cnt; for (cnt = 0; cnt < ODEBUG_BATCH_SIZE; cnt++) { - new[cnt] = kmem_cache_zalloc(obj_cache, gfp); - if (!new[cnt]) + new = kmem_cache_zalloc(obj_cache, gfp); + if (!new) break; + hlist_add_head(&new->node, &head); + if (!last) + last = new; } if (!cnt) return; raw_spin_lock_irqsave(&pool_lock, flags); - while (cnt) { - hlist_add_head(&new[--cnt]->node, &obj_pool); - debug_objects_allocated++; - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); - } + hlist_splice_init(&head, &last->node, &obj_pool); + debug_objects_allocated += cnt; + WRITE_ONCE(obj_pool_free, obj_pool_free + cnt); raw_spin_unlock_irqrestore(&pool_lock, flags); } } From 55fb412ef7d0c33226fcac4ebc68c60282e5f150 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:54 +0200 Subject: [PATCH 03/25] debugobjects: Dont destroy kmem cache in init() debug_objects_mem_init() is invoked from mm_core_init() before work queues are available. 
If debug_objects_mem_init() destroys the kmem cache in the error path it causes an Oops in __queue_work(): Oops: Oops: 0000 [#1] PREEMPT SMP PTI RIP: 0010:__queue_work+0x35/0x6a0 queue_work_on+0x66/0x70 flush_all_cpus_locked+0xdf/0x1a0 __kmem_cache_shutdown+0x2f/0x340 kmem_cache_destroy+0x4e/0x150 mm_core_init+0x9e/0x120 start_kernel+0x298/0x800 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0xc5/0xe0 common_startup_64+0x12c/0x138 Further the object cache pointer is used in various places to check for early boot operation. It is exposed before the replacments for the static boot time objects are allocated and the self test operates on it. This can be avoided by: 1) Running the self test with the static boot objects 2) Exposing it only after the replacement objects have been added to the pool. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241007164913.137021337@linutronix.de --- lib/debugobjects.c | 70 ++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 798ce5ac1ffe..1f6bf0fb0b20 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1211,7 +1211,7 @@ static __initconst const struct debug_obj_descr descr_type_test = { static __initdata struct self_test obj = { .static_init = 0 }; -static void __init debug_objects_selftest(void) +static bool __init debug_objects_selftest(void) { int fixups, oldfixups, warnings, oldwarnings; unsigned long flags; @@ -1280,9 +1280,10 @@ out: descr_test = NULL; local_irq_restore(flags); + return !!debug_objects_enabled; } #else -static inline void debug_objects_selftest(void) { } +static inline bool debug_objects_selftest(void) { return true; } #endif /* @@ -1302,18 +1303,21 @@ void __init debug_objects_early_init(void) } /* - * Convert the statically allocated objects to dynamic ones: + * Convert the statically allocated objects to dynamic ones. + * debug_objects_mem_init() is called early so only one CPU is up and + * interrupts are disabled, which means it is safe to replace the active + * object references. */ -static int __init debug_objects_replace_static_objects(void) +static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache) { struct debug_bucket *db = obj_hash; - struct hlist_node *tmp; struct debug_obj *obj, *new; + struct hlist_node *tmp; HLIST_HEAD(objects); int i, cnt = 0; for (i = 0; i < ODEBUG_POOL_SIZE; i++) { - obj = kmem_cache_zalloc(obj_cache, GFP_KERNEL); + obj = kmem_cache_zalloc(cache, GFP_KERNEL); if (!obj) goto free; hlist_add_head(&obj->node, &objects); @@ -1321,12 +1325,6 @@ static int __init debug_objects_replace_static_objects(void) debug_objects_allocated += i; - /* - * debug_objects_mem_init() is now called early that only one CPU is up - * and interrupts have been disabled, so it is safe to replace the - * active object references. - */ - /* * Replace the statically allocated objects list with the allocated * objects list. 
@@ -1347,15 +1345,14 @@ static int __init debug_objects_replace_static_objects(void) } } - pr_debug("%d of %d active objects replaced\n", - cnt, obj_pool_used); - return 0; + pr_debug("%d of %d active objects replaced\n", cnt, obj_pool_used); + return true; free: hlist_for_each_entry_safe(obj, tmp, &objects, node) { hlist_del(&obj->node); - kmem_cache_free(obj_cache, obj); + kmem_cache_free(cache, obj); } - return -ENOMEM; + return false; } /* @@ -1366,6 +1363,7 @@ free: */ void __init debug_objects_mem_init(void) { + struct kmem_cache *cache; int cpu, extras; if (!debug_objects_enabled) @@ -1380,29 +1378,33 @@ void __init debug_objects_mem_init(void) for_each_possible_cpu(cpu) INIT_HLIST_HEAD(&per_cpu(percpu_obj_pool.free_objs, cpu)); - obj_cache = kmem_cache_create("debug_objects_cache", - sizeof (struct debug_obj), 0, - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE, - NULL); - - if (!obj_cache || debug_objects_replace_static_objects()) { - debug_objects_enabled = 0; - kmem_cache_destroy(obj_cache); - pr_warn("out of memory.\n"); + if (!debug_objects_selftest()) return; - } else - debug_objects_selftest(); -#ifdef CONFIG_HOTPLUG_CPU - cpuhp_setup_state_nocalls(CPUHP_DEBUG_OBJ_DEAD, "object:offline", NULL, - object_cpu_offline); -#endif + cache = kmem_cache_create("debug_objects_cache", sizeof (struct debug_obj), 0, + SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE, NULL); + + if (!cache || !debug_objects_replace_static_objects(cache)) { + debug_objects_enabled = 0; + pr_warn("Out of memory.\n"); + return; + } /* - * Increase the thresholds for allocating and freeing objects - * according to the number of possible CPUs available in the system. + * Adjust the thresholds for allocating and freeing objects + * according to the number of possible CPUs available in the + * system. */ extras = num_possible_cpus() * ODEBUG_BATCH_SIZE; debug_objects_pool_size += extras; debug_objects_pool_min_level += extras; + + /* Everything worked. Expose the cache */ + obj_cache = cache; + +#ifdef CONFIG_HOTPLUG_CPU + cpuhp_setup_state_nocalls(CPUHP_DEBUG_OBJ_DEAD, "object:offline", NULL, + object_cpu_offline); +#endif + return; } From 3f397bf9553d9f142fbfaa19713e0350803fcc31 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:56 +0200 Subject: [PATCH 04/25] debugobjects: Remove pointless hlist initialization It's BSS zero initialized. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.200379308@linutronix.de --- lib/debugobjects.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 1f6bf0fb0b20..9867412d7946 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1364,20 +1364,11 @@ free: void __init debug_objects_mem_init(void) { struct kmem_cache *cache; - int cpu, extras; + int extras; if (!debug_objects_enabled) return; - /* - * Initialize the percpu object pools - * - * Initialization is not strictly necessary, but was done for - * completeness. - */ - for_each_possible_cpu(cpu) - INIT_HLIST_HEAD(&per_cpu(percpu_obj_pool.free_objs, cpu)); - if (!debug_objects_selftest()) return; From a2a702383e8baa22ee66ee60f1e036835a1ef42e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:57 +0200 Subject: [PATCH 05/25] debugobjects: Dont free objects directly on CPU hotplug Freeing the per CPU pool of the unplugged CPU directly is suboptimal as the objects can be reused in the real pool if there is room. Aside of that this gets the accounting wrong. 
Use the regular free path, which allows reuse and has the accounting correct. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.263960570@linutronix.de --- lib/debugobjects.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 9867412d7946..a3d4c54f0839 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -430,27 +430,28 @@ static void free_object(struct debug_obj *obj) } #ifdef CONFIG_HOTPLUG_CPU -static int object_cpu_offline(unsigned int cpu) +static void put_objects(struct hlist_head *list) { - struct debug_percpu_free *percpu_pool; struct hlist_node *tmp; struct debug_obj *obj; - unsigned long flags; - /* Remote access is safe as the CPU is dead already */ - percpu_pool = per_cpu_ptr(&percpu_obj_pool, cpu); - hlist_for_each_entry_safe(obj, tmp, &percpu_pool->free_objs, node) { + /* + * Using free_object() puts the objects into reuse or schedules + * them for freeing and it get's all the accounting correct. + */ + hlist_for_each_entry_safe(obj, tmp, list, node) { hlist_del(&obj->node); - kmem_cache_free(obj_cache, obj); + free_object(obj); } +} - raw_spin_lock_irqsave(&pool_lock, flags); - obj_pool_used -= percpu_pool->obj_free; - debug_objects_freed += percpu_pool->obj_free; - raw_spin_unlock_irqrestore(&pool_lock, flags); - - percpu_pool->obj_free = 0; +static int object_cpu_offline(unsigned int cpu) +{ + /* Remote access is safe as the CPU is dead already */ + struct debug_percpu_free *pcp = per_cpu_ptr(&percpu_obj_pool, cpu); + put_objects(&pcp->free_objs); + pcp->obj_free = 0; return 0; } #endif From 49968cf18154d6391e84c68520149232057ca62c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:58 +0200 Subject: [PATCH 06/25] debugobjects: Reuse put_objects() on OOM Reuse the helper function instead of having a open coded copy. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.326834268@linutronix.de --- lib/debugobjects.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index a3d4c54f0839..2c866d0bdd68 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -429,7 +429,6 @@ static void free_object(struct debug_obj *obj) } } -#ifdef CONFIG_HOTPLUG_CPU static void put_objects(struct hlist_head *list) { struct hlist_node *tmp; @@ -445,6 +444,7 @@ static void put_objects(struct hlist_head *list) } } +#ifdef CONFIG_HOTPLUG_CPU static int object_cpu_offline(unsigned int cpu) { /* Remote access is safe as the CPU is dead already */ @@ -456,31 +456,19 @@ static int object_cpu_offline(unsigned int cpu) } #endif -/* - * We run out of memory. That means we probably have tons of objects - * allocated. - */ +/* Out of memory. Free all objects from hash */ static void debug_objects_oom(void) { struct debug_bucket *db = obj_hash; - struct hlist_node *tmp; HLIST_HEAD(freelist); - struct debug_obj *obj; - unsigned long flags; - int i; pr_warn("Out of memory. 
ODEBUG disabled\n"); - for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { - raw_spin_lock_irqsave(&db->lock, flags); - hlist_move_list(&db->list, &freelist); - raw_spin_unlock_irqrestore(&db->lock, flags); + for (int i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { + scoped_guard(raw_spinlock_irqsave, &db->lock) + hlist_move_list(&db->list, &freelist); - /* Now free them */ - hlist_for_each_entry_safe(obj, tmp, &freelist, node) { - hlist_del(&obj->node); - free_object(obj); - } + put_objects(&freelist); } } From 241463f4fdcc845fa4a174e63a23940305cb6691 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:49:59 +0200 Subject: [PATCH 07/25] debugobjects: Remove pointless debug printk It has zero value. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.390511021@linutronix.de --- lib/debugobjects.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2c866d0bdd68..fc54115818a1 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1303,7 +1303,7 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache struct debug_obj *obj, *new; struct hlist_node *tmp; HLIST_HEAD(objects); - int i, cnt = 0; + int i; for (i = 0; i < ODEBUG_POOL_SIZE; i++) { obj = kmem_cache_zalloc(cache, GFP_KERNEL); @@ -1330,11 +1330,8 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache /* copy object data */ *new = *obj; hlist_add_head(&new->node, &db->list); - cnt++; } } - - pr_debug("%d of %d active objects replaced\n", cnt, obj_pool_used); return true; free: hlist_for_each_entry_safe(obj, tmp, &objects, node) { From 49a5cb827d3d625944d48518acec4e4b9d61e1da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:01 +0200 Subject: [PATCH 08/25] debugobjects: Provide and use free_object_list() Move the loop to free a list of objects into a helper function so it can be reused later. 
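For illustration, the pattern such a helper captures (walk the list with a saved next pointer so each node can be freed while iterating, and do the accounting in one place) looks roughly like the stand-alone user-space sketch below; the node type, list layout and counter are invented for the example and are not the kernel's hlist API:

#include <stdlib.h>

struct node {
	struct node *next;
	/* payload would live here */
};

static unsigned long objects_freed;

/* Free every node on the list; keep the accounting in one place. */
static void free_node_list(struct node **head)
{
	struct node *cur = *head;

	while (cur) {
		struct node *next = cur->next;	/* save before freeing */

		free(cur);
		objects_freed++;
		cur = next;
	}
	*head = NULL;
}

int main(void)
{
	struct node *head = NULL;

	for (int i = 0; i < 4; i++) {
		struct node *n = calloc(1, sizeof(*n));

		if (!n)
			break;
		n->next = head;
		head = n;
	}
	free_node_list(&head);
	return objects_freed ? 0 : 1;
}
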
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241007164913.453912357@linutronix.de --- lib/debugobjects.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fc54115818a1..6ccdfebcd807 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -125,6 +125,20 @@ static const char *obj_states[ODEBUG_STATE_MAX] = { [ODEBUG_STATE_NOTAVAILABLE] = "not available", }; +static void free_object_list(struct hlist_head *head) +{ + struct hlist_node *tmp; + struct debug_obj *obj; + int cnt = 0; + + hlist_for_each_entry_safe(obj, tmp, head, node) { + hlist_del(&obj->node); + kmem_cache_free(obj_cache, obj); + cnt++; + } + debug_objects_freed += cnt; +} + static void fill_pool(void) { gfp_t gfp = __GFP_HIGH | __GFP_NOWARN; @@ -286,7 +300,6 @@ init_obj: */ static void free_obj_work(struct work_struct *work) { - struct hlist_node *tmp; struct debug_obj *obj; unsigned long flags; HLIST_HEAD(tofree); @@ -323,15 +336,11 @@ free_objs: */ if (obj_nr_tofree) { hlist_move_list(&obj_to_free, &tofree); - debug_objects_freed += obj_nr_tofree; WRITE_ONCE(obj_nr_tofree, 0); } raw_spin_unlock_irqrestore(&pool_lock, flags); - hlist_for_each_entry_safe(obj, tmp, &tofree, node) { - hlist_del(&obj->node); - kmem_cache_free(obj_cache, obj); - } + free_object_list(&tofree); } static void __free_object(struct debug_obj *obj) @@ -1334,6 +1343,7 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache } return true; free: + /* Can't use free_object_list() as the cache is not populated yet */ hlist_for_each_entry_safe(obj, tmp, &objects, node) { hlist_del(&obj->node); kmem_cache_free(cache, obj); From 661cc28b523d4616a322c8f82f06ec7880192060 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:02 +0200 Subject: [PATCH 09/25] debugobjects: Make debug_objects_enabled bool Make it what it is. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.518175013@linutronix.de --- lib/debugobjects.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 6ccdfebcd807..0d69095394ee 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -82,7 +82,7 @@ static int __data_racy debug_objects_maxchain __read_mostly; static int __data_racy __maybe_unused debug_objects_maxchecked __read_mostly; static int __data_racy debug_objects_fixups __read_mostly; static int __data_racy debug_objects_warnings __read_mostly; -static int __data_racy debug_objects_enabled __read_mostly +static bool __data_racy debug_objects_enabled __read_mostly = CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT; static int debug_objects_pool_size __ro_after_init = ODEBUG_POOL_SIZE; @@ -103,17 +103,16 @@ static DECLARE_DELAYED_WORK(debug_obj_work, free_obj_work); static int __init enable_object_debug(char *str) { - debug_objects_enabled = 1; + debug_objects_enabled = true; return 0; } +early_param("debug_objects", enable_object_debug); static int __init disable_object_debug(char *str) { - debug_objects_enabled = 0; + debug_objects_enabled = false; return 0; } - -early_param("debug_objects", enable_object_debug); early_param("no_debug_objects", disable_object_debug); static const char *obj_states[ODEBUG_STATE_MAX] = { @@ -592,7 +591,7 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket } /* Out of memory. 
Do the cleanup outside of the locked region */ - debug_objects_enabled = 0; + debug_objects_enabled = false; return NULL; } @@ -1194,7 +1193,7 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings) out: raw_spin_unlock_irqrestore(&db->lock, flags); if (res) - debug_objects_enabled = 0; + debug_objects_enabled = false; return res; } @@ -1278,7 +1277,7 @@ out: descr_test = NULL; local_irq_restore(flags); - return !!debug_objects_enabled; + return debug_objects_enabled; } #else static inline bool debug_objects_selftest(void) { return true; } @@ -1372,7 +1371,7 @@ void __init debug_objects_mem_init(void) SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE, NULL); if (!cache || !debug_objects_replace_static_objects(cache)) { - debug_objects_enabled = 0; + debug_objects_enabled = false; pr_warn("Out of memory.\n"); return; } From d8c6cd3a5c8008f5d42c7763a93b43d7f3a40e94 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 7 Oct 2024 18:50:03 +0200 Subject: [PATCH 10/25] debugobjects: Reduce parallel pool fill attempts The contention on the global pool_lock can be massive when the global pool needs to be refilled and many CPUs try to handle this. Address this by: - splitting the refill from free list and allocation. Refill from free list has no constraints vs. the context on RT, so it can be tried outside of the RT specific preemptible() guard - Let only one CPU handle the free list - Let only one CPU do allocations unless the pool level is below half of the minimum fill level. Suggested-by: Thomas Gleixner Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240911083521.2257-4-thunder.leizhen@huawei.com- Link: https://lore.kernel.org/all/20241007164913.582118421@linutronix.de -- lib/debugobjects.c | 84 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 25 deletions(-) --- lib/debugobjects.c | 88 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 27 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 0d69095394ee..64a72d419136 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -138,14 +138,10 @@ static void free_object_list(struct hlist_head *head) debug_objects_freed += cnt; } -static void fill_pool(void) +static void fill_pool_from_freelist(void) { - gfp_t gfp = __GFP_HIGH | __GFP_NOWARN; + static unsigned long state; struct debug_obj *obj; - unsigned long flags; - - if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) - return; /* * Reuse objs from the global obj_to_free list; they will be @@ -154,32 +150,58 @@ static void fill_pool(void) * obj_nr_tofree is checked locklessly; the READ_ONCE() pairs with * the WRITE_ONCE() in pool_lock critical sections. */ - if (READ_ONCE(obj_nr_tofree)) { - raw_spin_lock_irqsave(&pool_lock, flags); - /* - * Recheck with the lock held as the worker thread might have - * won the race and freed the global free list already. 
- */ - while (obj_nr_tofree && (obj_pool_free < debug_objects_pool_min_level)) { - obj = hlist_entry(obj_to_free.first, typeof(*obj), node); - hlist_del(&obj->node); - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1); - hlist_add_head(&obj->node, &obj_pool); - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); - } - raw_spin_unlock_irqrestore(&pool_lock, flags); - } - - if (unlikely(!obj_cache)) + if (!READ_ONCE(obj_nr_tofree)) return; + /* + * Prevent the context from being scheduled or interrupted after + * setting the state flag; + */ + guard(irqsave)(); + + /* + * Avoid lock contention on &pool_lock and avoid making the cache + * line exclusive by testing the bit before attempting to set it. + */ + if (test_bit(0, &state) || test_and_set_bit(0, &state)) + return; + + guard(raw_spinlock)(&pool_lock); + /* + * Recheck with the lock held as the worker thread might have + * won the race and freed the global free list already. + */ + while (obj_nr_tofree && (obj_pool_free < debug_objects_pool_min_level)) { + obj = hlist_entry(obj_to_free.first, typeof(*obj), node); + hlist_del(&obj->node); + WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1); + hlist_add_head(&obj->node, &obj_pool); + WRITE_ONCE(obj_pool_free, obj_pool_free + 1); + } + clear_bit(0, &state); +} + +static void fill_pool(void) +{ + static atomic_t cpus_allocating; + + /* + * Avoid allocation and lock contention when: + * - One other CPU is already allocating + * - the global pool has not reached the critical level yet + */ + if (READ_ONCE(obj_pool_free) > (debug_objects_pool_min_level / 2) && + atomic_read(&cpus_allocating)) + return; + + atomic_inc(&cpus_allocating); while (READ_ONCE(obj_pool_free) < debug_objects_pool_min_level) { struct debug_obj *new, *last = NULL; HLIST_HEAD(head); int cnt; for (cnt = 0; cnt < ODEBUG_BATCH_SIZE; cnt++) { - new = kmem_cache_zalloc(obj_cache, gfp); + new = kmem_cache_zalloc(obj_cache, __GFP_HIGH | __GFP_NOWARN); if (!new) break; hlist_add_head(&new->node, &head); @@ -187,14 +209,14 @@ static void fill_pool(void) last = new; } if (!cnt) - return; + break; - raw_spin_lock_irqsave(&pool_lock, flags); + guard(raw_spinlock_irqsave)(&pool_lock); hlist_splice_init(&head, &last->node, &obj_pool); debug_objects_allocated += cnt; WRITE_ONCE(obj_pool_free, obj_pool_free + cnt); - raw_spin_unlock_irqrestore(&pool_lock, flags); } + atomic_dec(&cpus_allocating); } /* @@ -597,6 +619,18 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket static void debug_objects_fill_pool(void) { + if (unlikely(!obj_cache)) + return; + + if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) + return; + + /* Try reusing objects from obj_to_free_list */ + fill_pool_from_freelist(); + + if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) + return; + /* * On RT enabled kernels the pool refill must happen in preemptible * context -- for !RT kernels we rely on the fact that spinlock_t and From e18328ff705270d1e53889ea9d79dce86d1b8786 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:04 +0200 Subject: [PATCH 11/25] debugobjects: Move pools into a datastructure The contention on the global pool lock can be reduced by strict batch processing where batches of objects are moved from one list head to another instead of moving them object by object. This also reduces the cache footprint because it avoids the list walk and dirties at maximum three cache lines instead of potentially up to eighteen. 
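As a rough sketch of what strict batch processing buys: when a batch is a pre-linked chain whose first and last nodes are known, moving it from one list head to another is a fixed number of pointer updates with no walk over its members. The types below are invented for the illustration and are deliberately single-threaded; the kernel code uses hlist heads under pool_lock:

#include <stddef.h>

struct node { struct node *next; };

/* A batch is a pre-linked chain whose first and last nodes are known. */
struct batch {
	struct node *first;
	struct node *last;
	unsigned int cnt;
};

struct pool {
	struct node *head;
	unsigned int cnt;
};

/*
 * Splicing a whole batch onto a pool touches only the batch ends and the
 * pool head: a fixed number of stores, independent of the batch size.
 */
static void pool_push_batch(struct pool *pool, struct batch *b)
{
	if (!b->first)
		return;

	b->last->next = pool->head;
	pool->head = b->first;
	pool->cnt += b->cnt;

	b->first = b->last = NULL;
	b->cnt = 0;
}

int main(void)
{
	static struct node nodes[4];
	struct batch b = { .first = &nodes[0], .last = &nodes[3], .cnt = 4 };
	struct pool p = { 0 };

	for (int i = 0; i < 3; i++)
		nodes[i].next = &nodes[i + 1];

	pool_push_batch(&p, &b);
	return p.cnt == 4 ? 0 : 1;
}
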
To prepare for that, move the hlist head and related counters into a struct. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.646171170@linutronix.de --- lib/debugobjects.c | 140 +++++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 62 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 64a72d419136..fcba13db90ed 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -52,6 +52,11 @@ struct debug_percpu_free { int obj_free; }; +struct obj_pool { + struct hlist_head objects; + unsigned int cnt; +} ____cacheline_aligned; + static DEFINE_PER_CPU(struct debug_percpu_free, percpu_obj_pool); static struct debug_bucket obj_hash[ODEBUG_HASH_SIZE]; @@ -60,8 +65,8 @@ static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE] __initdata; static DEFINE_RAW_SPINLOCK(pool_lock); -static HLIST_HEAD(obj_pool); -static HLIST_HEAD(obj_to_free); +static struct obj_pool pool_global; +static struct obj_pool pool_to_free; /* * Because of the presence of percpu free pools, obj_pool_free will @@ -71,12 +76,9 @@ static HLIST_HEAD(obj_to_free); * can be off. */ static int __data_racy obj_pool_min_free = ODEBUG_POOL_SIZE; -static int __data_racy obj_pool_free = ODEBUG_POOL_SIZE; static int obj_pool_used; static int __data_racy obj_pool_max_used; static bool obj_freeing; -/* The number of objs on the global free list */ -static int obj_nr_tofree; static int __data_racy debug_objects_maxchain __read_mostly; static int __data_racy __maybe_unused debug_objects_maxchecked __read_mostly; @@ -124,6 +126,21 @@ static const char *obj_states[ODEBUG_STATE_MAX] = { [ODEBUG_STATE_NOTAVAILABLE] = "not available", }; +static __always_inline unsigned int pool_count(struct obj_pool *pool) +{ + return READ_ONCE(pool->cnt); +} + +static inline bool pool_global_should_refill(void) +{ + return READ_ONCE(pool_global.cnt) < debug_objects_pool_min_level; +} + +static inline bool pool_global_must_refill(void) +{ + return READ_ONCE(pool_global.cnt) < (debug_objects_pool_min_level / 2); +} + static void free_object_list(struct hlist_head *head) { struct hlist_node *tmp; @@ -146,11 +163,8 @@ static void fill_pool_from_freelist(void) /* * Reuse objs from the global obj_to_free list; they will be * reinitialized when allocating. - * - * obj_nr_tofree is checked locklessly; the READ_ONCE() pairs with - * the WRITE_ONCE() in pool_lock critical sections. */ - if (!READ_ONCE(obj_nr_tofree)) + if (!pool_count(&pool_to_free)) return; /* @@ -171,12 +185,12 @@ static void fill_pool_from_freelist(void) * Recheck with the lock held as the worker thread might have * won the race and freed the global free list already. 
*/ - while (obj_nr_tofree && (obj_pool_free < debug_objects_pool_min_level)) { - obj = hlist_entry(obj_to_free.first, typeof(*obj), node); + while (pool_to_free.cnt && (pool_global.cnt < debug_objects_pool_min_level)) { + obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); hlist_del(&obj->node); - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1); - hlist_add_head(&obj->node, &obj_pool); - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); + hlist_add_head(&obj->node, &pool_global.objects); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); } clear_bit(0, &state); } @@ -190,12 +204,11 @@ static void fill_pool(void) * - One other CPU is already allocating * - the global pool has not reached the critical level yet */ - if (READ_ONCE(obj_pool_free) > (debug_objects_pool_min_level / 2) && - atomic_read(&cpus_allocating)) + if (!pool_global_must_refill() && atomic_read(&cpus_allocating)) return; atomic_inc(&cpus_allocating); - while (READ_ONCE(obj_pool_free) < debug_objects_pool_min_level) { + while (pool_global_should_refill()) { struct debug_obj *new, *last = NULL; HLIST_HEAD(head); int cnt; @@ -212,9 +225,9 @@ static void fill_pool(void) break; guard(raw_spinlock_irqsave)(&pool_lock); - hlist_splice_init(&head, &last->node, &obj_pool); + hlist_splice_init(&head, &last->node, &pool_global.objects); debug_objects_allocated += cnt; - WRITE_ONCE(obj_pool_free, obj_pool_free + cnt); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + cnt); } atomic_dec(&cpus_allocating); } @@ -268,10 +281,10 @@ alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *d } raw_spin_lock(&pool_lock); - obj = __alloc_object(&obj_pool); + obj = __alloc_object(&pool_global.objects); if (obj) { obj_pool_used++; - WRITE_ONCE(obj_pool_free, obj_pool_free - 1); + WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); /* * Looking ahead, allocate one batch of debug objects and @@ -283,22 +296,21 @@ alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *d for (i = 0; i < ODEBUG_BATCH_SIZE; i++) { struct debug_obj *obj2; - obj2 = __alloc_object(&obj_pool); + obj2 = __alloc_object(&pool_global.objects); if (!obj2) break; - hlist_add_head(&obj2->node, - &percpu_pool->free_objs); + hlist_add_head(&obj2->node, &percpu_pool->free_objs); percpu_pool->obj_free++; obj_pool_used++; - WRITE_ONCE(obj_pool_free, obj_pool_free - 1); + WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); } } if (obj_pool_used > obj_pool_max_used) obj_pool_max_used = obj_pool_used; - if (obj_pool_free < obj_pool_min_free) - obj_pool_min_free = obj_pool_free; + if (pool_global.cnt < obj_pool_min_free) + obj_pool_min_free = pool_global.cnt; } raw_spin_unlock(&pool_lock); @@ -329,7 +341,7 @@ static void free_obj_work(struct work_struct *work) if (!raw_spin_trylock_irqsave(&pool_lock, flags)) return; - if (obj_pool_free >= debug_objects_pool_size) + if (pool_global.cnt >= debug_objects_pool_size) goto free_objs; /* @@ -339,12 +351,12 @@ static void free_obj_work(struct work_struct *work) * may be gearing up to use more and more objects, don't free any * of them until the next round. 
*/ - while (obj_nr_tofree && obj_pool_free < debug_objects_pool_size) { - obj = hlist_entry(obj_to_free.first, typeof(*obj), node); + while (pool_to_free.cnt && pool_global.cnt < debug_objects_pool_size) { + obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); hlist_del(&obj->node); - hlist_add_head(&obj->node, &obj_pool); - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1); + hlist_add_head(&obj->node, &pool_global.objects); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); } raw_spin_unlock_irqrestore(&pool_lock, flags); return; @@ -355,9 +367,9 @@ free_objs: * list. Move remaining free objs to a temporary list to free the * memory outside the pool_lock held region. */ - if (obj_nr_tofree) { - hlist_move_list(&obj_to_free, &tofree); - WRITE_ONCE(obj_nr_tofree, 0); + if (pool_to_free.cnt) { + hlist_move_list(&pool_to_free.objects, &tofree); + WRITE_ONCE(pool_to_free.cnt, 0); } raw_spin_unlock_irqrestore(&pool_lock, flags); @@ -400,45 +412,45 @@ static void __free_object(struct debug_obj *obj) free_to_obj_pool: raw_spin_lock(&pool_lock); - work = (obj_pool_free > debug_objects_pool_size) && obj_cache && - (obj_nr_tofree < ODEBUG_FREE_WORK_MAX); + work = (pool_global.cnt > debug_objects_pool_size) && obj_cache && + (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX); obj_pool_used--; if (work) { - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree + 1); - hlist_add_head(&obj->node, &obj_to_free); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + 1); + hlist_add_head(&obj->node, &pool_to_free.objects); if (lookahead_count) { - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree + lookahead_count); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + lookahead_count); obj_pool_used -= lookahead_count; while (lookahead_count) { hlist_add_head(&objs[--lookahead_count]->node, - &obj_to_free); + &pool_to_free.objects); } } - if ((obj_pool_free > debug_objects_pool_size) && - (obj_nr_tofree < ODEBUG_FREE_WORK_MAX)) { + if ((pool_global.cnt > debug_objects_pool_size) && + (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX)) { int i; /* * Free one more batch of objects from obj_pool. 
*/ for (i = 0; i < ODEBUG_BATCH_SIZE; i++) { - obj = __alloc_object(&obj_pool); - hlist_add_head(&obj->node, &obj_to_free); - WRITE_ONCE(obj_pool_free, obj_pool_free - 1); - WRITE_ONCE(obj_nr_tofree, obj_nr_tofree + 1); + obj = __alloc_object(&pool_global.objects); + hlist_add_head(&obj->node, &pool_to_free.objects); + WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); + WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + 1); } } } else { - WRITE_ONCE(obj_pool_free, obj_pool_free + 1); - hlist_add_head(&obj->node, &obj_pool); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); + hlist_add_head(&obj->node, &pool_global.objects); if (lookahead_count) { - WRITE_ONCE(obj_pool_free, obj_pool_free + lookahead_count); + WRITE_ONCE(pool_global.cnt, pool_global.cnt + lookahead_count); obj_pool_used -= lookahead_count; while (lookahead_count) { hlist_add_head(&objs[--lookahead_count]->node, - &obj_pool); + &pool_global.objects); } } } @@ -453,7 +465,7 @@ free_to_obj_pool: static void free_object(struct debug_obj *obj) { __free_object(obj); - if (!READ_ONCE(obj_freeing) && READ_ONCE(obj_nr_tofree)) { + if (!READ_ONCE(obj_freeing) && pool_count(&pool_to_free)) { WRITE_ONCE(obj_freeing, true); schedule_delayed_work(&debug_obj_work, ODEBUG_FREE_WORK_DELAY); } @@ -622,13 +634,13 @@ static void debug_objects_fill_pool(void) if (unlikely(!obj_cache)) return; - if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) + if (likely(!pool_global_should_refill())) return; /* Try reusing objects from obj_to_free_list */ fill_pool_from_freelist(); - if (likely(READ_ONCE(obj_pool_free) >= debug_objects_pool_min_level)) + if (likely(!pool_global_should_refill())) return; /* @@ -1040,7 +1052,7 @@ repeat: debug_objects_maxchecked = objs_checked; /* Schedule work to actually kmem_cache_free() objects */ - if (!READ_ONCE(obj_freeing) && READ_ONCE(obj_nr_tofree)) { + if (!READ_ONCE(obj_freeing) && pool_count(&pool_to_free)) { WRITE_ONCE(obj_freeing, true); schedule_delayed_work(&debug_obj_work, ODEBUG_FREE_WORK_DELAY); } @@ -1066,12 +1078,12 @@ static int debug_stats_show(struct seq_file *m, void *v) seq_printf(m, "max_checked :%d\n", debug_objects_maxchecked); seq_printf(m, "warnings :%d\n", debug_objects_warnings); seq_printf(m, "fixups :%d\n", debug_objects_fixups); - seq_printf(m, "pool_free :%d\n", READ_ONCE(obj_pool_free) + obj_percpu_free); + seq_printf(m, "pool_free :%d\n", pool_count(&pool_global) + obj_percpu_free); seq_printf(m, "pool_pcp_free :%d\n", obj_percpu_free); seq_printf(m, "pool_min_free :%d\n", obj_pool_min_free); seq_printf(m, "pool_used :%d\n", obj_pool_used - obj_percpu_free); seq_printf(m, "pool_max_used :%d\n", obj_pool_max_used); - seq_printf(m, "on_free_list :%d\n", READ_ONCE(obj_nr_tofree)); + seq_printf(m, "on_free_list :%d\n", pool_count(&pool_to_free)); seq_printf(m, "objs_allocated:%d\n", debug_objects_allocated); seq_printf(m, "objs_freed :%d\n", debug_objects_freed); return 0; @@ -1330,7 +1342,9 @@ void __init debug_objects_early_init(void) raw_spin_lock_init(&obj_hash[i].lock); for (i = 0; i < ODEBUG_POOL_SIZE; i++) - hlist_add_head(&obj_static_pool[i].node, &obj_pool); + hlist_add_head(&obj_static_pool[i].node, &pool_global.objects); + + pool_global.cnt = ODEBUG_POOL_SIZE; } /* @@ -1354,21 +1368,23 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache hlist_add_head(&obj->node, &objects); } - debug_objects_allocated += i; + debug_objects_allocated = ODEBUG_POOL_SIZE; + pool_global.cnt = ODEBUG_POOL_SIZE; /* * Replace the statically allocated 
objects list with the allocated * objects list. */ - hlist_move_list(&objects, &obj_pool); + hlist_move_list(&objects, &pool_global.objects); /* Replace the active object references */ for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { hlist_move_list(&db->list, &objects); hlist_for_each_entry(obj, &objects, node) { - new = hlist_entry(obj_pool.first, typeof(*obj), node); + new = hlist_entry(pool_global.objects.first, typeof(*obj), node); hlist_del(&new->node); + pool_global.cnt--; /* copy object data */ *new = *obj; hlist_add_head(&new->node, &db->list); From cb58d190843059d5dc50d6ac483647ba61001e8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:05 +0200 Subject: [PATCH 12/25] debugobjects: Use separate list head for boot pool There is no point to handle the statically allocated objects during early boot in the actual pool list. This phase does not require accounting, so all of the related complexity can be avoided. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.708939081@linutronix.de --- lib/debugobjects.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fcba13db90ed..0b29a25dc3f9 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -68,6 +68,8 @@ static DEFINE_RAW_SPINLOCK(pool_lock); static struct obj_pool pool_global; static struct obj_pool pool_to_free; +static HLIST_HEAD(pool_boot); + /* * Because of the presence of percpu free pools, obj_pool_free will * under-count those in the percpu free pools. Similarly, obj_pool_used @@ -278,6 +280,9 @@ alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *d percpu_pool->obj_free--; goto init_obj; } + } else { + obj = __alloc_object(&pool_boot); + goto init_obj; } raw_spin_lock(&pool_lock); @@ -381,12 +386,14 @@ static void __free_object(struct debug_obj *obj) struct debug_obj *objs[ODEBUG_BATCH_SIZE]; struct debug_percpu_free *percpu_pool; int lookahead_count = 0; - unsigned long flags; bool work; - local_irq_save(flags); - if (!obj_cache) - goto free_to_obj_pool; + guard(irqsave)(); + + if (unlikely(!obj_cache)) { + hlist_add_head(&obj->node, &pool_boot); + return; + } /* * Try to free it into the percpu pool first. @@ -395,7 +402,6 @@ static void __free_object(struct debug_obj *obj) if (percpu_pool->obj_free < ODEBUG_POOL_PERCPU_SIZE) { hlist_add_head(&obj->node, &percpu_pool->free_objs); percpu_pool->obj_free++; - local_irq_restore(flags); return; } @@ -410,7 +416,6 @@ static void __free_object(struct debug_obj *obj) percpu_pool->obj_free--; } -free_to_obj_pool: raw_spin_lock(&pool_lock); work = (pool_global.cnt > debug_objects_pool_size) && obj_cache && (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX); @@ -455,7 +460,6 @@ free_to_obj_pool: } } raw_spin_unlock(&pool_lock); - local_irq_restore(flags); } /* @@ -1341,10 +1345,9 @@ void __init debug_objects_early_init(void) for (i = 0; i < ODEBUG_HASH_SIZE; i++) raw_spin_lock_init(&obj_hash[i].lock); + /* Keep early boot simple and add everything to the boot list */ for (i = 0; i < ODEBUG_POOL_SIZE; i++) - hlist_add_head(&obj_static_pool[i].node, &pool_global.objects); - - pool_global.cnt = ODEBUG_POOL_SIZE; + hlist_add_head(&obj_static_pool[i].node, &pool_boot); } /* @@ -1372,10 +1375,11 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache pool_global.cnt = ODEBUG_POOL_SIZE; /* - * Replace the statically allocated objects list with the allocated - * objects list. 
+ * Move the allocated objects to the global pool and disconnect the + * boot pool. */ hlist_move_list(&objects, &pool_global.objects); + pool_boot.first = NULL; /* Replace the active object references */ for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { From 18b8afcb37d8a72479892e080e4d37890f2bf353 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:07 +0200 Subject: [PATCH 13/25] debugobjects: Rename and tidy up per CPU pools No point in having a separate data structure. Reuse struct obj_pool and tidy up the code. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.770595795@linutronix.de --- lib/debugobjects.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 0b29a25dc3f9..3d1d9733676e 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -43,21 +43,12 @@ struct debug_bucket { raw_spinlock_t lock; }; -/* - * Debug object percpu free list - * Access is protected by disabling irq - */ -struct debug_percpu_free { - struct hlist_head free_objs; - int obj_free; -}; - struct obj_pool { struct hlist_head objects; unsigned int cnt; } ____cacheline_aligned; -static DEFINE_PER_CPU(struct debug_percpu_free, percpu_obj_pool); +static DEFINE_PER_CPU(struct obj_pool, pool_pcpu); static struct debug_bucket obj_hash[ODEBUG_HASH_SIZE]; @@ -271,13 +262,13 @@ static struct debug_obj *__alloc_object(struct hlist_head *list) static struct debug_obj * alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *descr) { - struct debug_percpu_free *percpu_pool = this_cpu_ptr(&percpu_obj_pool); + struct obj_pool *percpu_pool = this_cpu_ptr(&pool_pcpu); struct debug_obj *obj; if (likely(obj_cache)) { - obj = __alloc_object(&percpu_pool->free_objs); + obj = __alloc_object(&percpu_pool->objects); if (obj) { - percpu_pool->obj_free--; + percpu_pool->cnt--; goto init_obj; } } else { @@ -304,8 +295,8 @@ alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *d obj2 = __alloc_object(&pool_global.objects); if (!obj2) break; - hlist_add_head(&obj2->node, &percpu_pool->free_objs); - percpu_pool->obj_free++; + hlist_add_head(&obj2->node, &percpu_pool->objects); + percpu_pool->cnt++; obj_pool_used++; WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); } @@ -384,7 +375,7 @@ free_objs: static void __free_object(struct debug_obj *obj) { struct debug_obj *objs[ODEBUG_BATCH_SIZE]; - struct debug_percpu_free *percpu_pool; + struct obj_pool *percpu_pool; int lookahead_count = 0; bool work; @@ -398,10 +389,10 @@ static void __free_object(struct debug_obj *obj) /* * Try to free it into the percpu pool first. */ - percpu_pool = this_cpu_ptr(&percpu_obj_pool); - if (percpu_pool->obj_free < ODEBUG_POOL_PERCPU_SIZE) { - hlist_add_head(&obj->node, &percpu_pool->free_objs); - percpu_pool->obj_free++; + percpu_pool = this_cpu_ptr(&pool_pcpu); + if (percpu_pool->cnt < ODEBUG_POOL_PERCPU_SIZE) { + hlist_add_head(&obj->node, &percpu_pool->objects); + percpu_pool->cnt++; return; } @@ -410,10 +401,10 @@ static void __free_object(struct debug_obj *obj) * of objects from the percpu pool and free them as well. 
*/ for (; lookahead_count < ODEBUG_BATCH_SIZE; lookahead_count++) { - objs[lookahead_count] = __alloc_object(&percpu_pool->free_objs); + objs[lookahead_count] = __alloc_object(&percpu_pool->objects); if (!objs[lookahead_count]) break; - percpu_pool->obj_free--; + percpu_pool->cnt--; } raw_spin_lock(&pool_lock); @@ -494,10 +485,10 @@ static void put_objects(struct hlist_head *list) static int object_cpu_offline(unsigned int cpu) { /* Remote access is safe as the CPU is dead already */ - struct debug_percpu_free *pcp = per_cpu_ptr(&percpu_obj_pool, cpu); + struct obj_pool *pcp = per_cpu_ptr(&pool_pcpu, cpu); - put_objects(&pcp->free_objs); - pcp->obj_free = 0; + put_objects(&pcp->objects); + pcp->cnt = 0; return 0; } #endif @@ -1076,7 +1067,7 @@ static int debug_stats_show(struct seq_file *m, void *v) int cpu, obj_percpu_free = 0; for_each_possible_cpu(cpu) - obj_percpu_free += per_cpu(percpu_obj_pool.obj_free, cpu); + obj_percpu_free += per_cpu(pool_pcpu.cnt, cpu); seq_printf(m, "max_chain :%d\n", debug_objects_maxchain); seq_printf(m, "max_checked :%d\n", debug_objects_maxchecked); From 96a9a0421c77301f9b551f3460ac04471a3c0612 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:08 +0200 Subject: [PATCH 14/25] debugobjects: Move min/max count into pool struct Having the accounting in the datastructure is better in terms of cache lines and allows more optimizations later on. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.831908427@linutronix.de --- lib/debugobjects.c | 55 ++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 3d1d9733676e..fbe8f2625082 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -46,9 +46,14 @@ struct debug_bucket { struct obj_pool { struct hlist_head objects; unsigned int cnt; + unsigned int min_cnt; + unsigned int max_cnt; } ____cacheline_aligned; -static DEFINE_PER_CPU(struct obj_pool, pool_pcpu); + +static DEFINE_PER_CPU_ALIGNED(struct obj_pool, pool_pcpu) = { + .max_cnt = ODEBUG_POOL_PERCPU_SIZE, +}; static struct debug_bucket obj_hash[ODEBUG_HASH_SIZE]; @@ -56,8 +61,14 @@ static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE] __initdata; static DEFINE_RAW_SPINLOCK(pool_lock); -static struct obj_pool pool_global; -static struct obj_pool pool_to_free; +static struct obj_pool pool_global = { + .min_cnt = ODEBUG_POOL_MIN_LEVEL, + .max_cnt = ODEBUG_POOL_SIZE, +}; + +static struct obj_pool pool_to_free = { + .max_cnt = UINT_MAX, +}; static HLIST_HEAD(pool_boot); @@ -79,13 +90,9 @@ static int __data_racy debug_objects_fixups __read_mostly; static int __data_racy debug_objects_warnings __read_mostly; static bool __data_racy debug_objects_enabled __read_mostly = CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT; -static int debug_objects_pool_size __ro_after_init - = ODEBUG_POOL_SIZE; -static int debug_objects_pool_min_level __ro_after_init - = ODEBUG_POOL_MIN_LEVEL; -static const struct debug_obj_descr *descr_test __read_mostly; -static struct kmem_cache *obj_cache __ro_after_init; +static const struct debug_obj_descr *descr_test __read_mostly; +static struct kmem_cache *obj_cache __ro_after_init; /* * Track numbers of kmem_cache_alloc()/free() calls done. 
@@ -124,14 +131,14 @@ static __always_inline unsigned int pool_count(struct obj_pool *pool) return READ_ONCE(pool->cnt); } -static inline bool pool_global_should_refill(void) +static __always_inline bool pool_should_refill(struct obj_pool *pool) { - return READ_ONCE(pool_global.cnt) < debug_objects_pool_min_level; + return pool_count(pool) < pool->min_cnt; } -static inline bool pool_global_must_refill(void) +static __always_inline bool pool_must_refill(struct obj_pool *pool) { - return READ_ONCE(pool_global.cnt) < (debug_objects_pool_min_level / 2); + return pool_count(pool) < pool->min_cnt / 2; } static void free_object_list(struct hlist_head *head) @@ -178,7 +185,7 @@ static void fill_pool_from_freelist(void) * Recheck with the lock held as the worker thread might have * won the race and freed the global free list already. */ - while (pool_to_free.cnt && (pool_global.cnt < debug_objects_pool_min_level)) { + while (pool_to_free.cnt && (pool_global.cnt < pool_global.min_cnt)) { obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); hlist_del(&obj->node); WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); @@ -197,11 +204,11 @@ static void fill_pool(void) * - One other CPU is already allocating * - the global pool has not reached the critical level yet */ - if (!pool_global_must_refill() && atomic_read(&cpus_allocating)) + if (!pool_must_refill(&pool_global) && atomic_read(&cpus_allocating)) return; atomic_inc(&cpus_allocating); - while (pool_global_should_refill()) { + while (pool_should_refill(&pool_global)) { struct debug_obj *new, *last = NULL; HLIST_HEAD(head); int cnt; @@ -337,7 +344,7 @@ static void free_obj_work(struct work_struct *work) if (!raw_spin_trylock_irqsave(&pool_lock, flags)) return; - if (pool_global.cnt >= debug_objects_pool_size) + if (pool_global.cnt >= pool_global.max_cnt) goto free_objs; /* @@ -347,7 +354,7 @@ static void free_obj_work(struct work_struct *work) * may be gearing up to use more and more objects, don't free any * of them until the next round. */ - while (pool_to_free.cnt && pool_global.cnt < debug_objects_pool_size) { + while (pool_to_free.cnt && pool_global.cnt < pool_global.max_cnt) { obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); hlist_del(&obj->node); hlist_add_head(&obj->node, &pool_global.objects); @@ -408,7 +415,7 @@ static void __free_object(struct debug_obj *obj) } raw_spin_lock(&pool_lock); - work = (pool_global.cnt > debug_objects_pool_size) && obj_cache && + work = (pool_global.cnt > pool_global.max_cnt) && obj_cache && (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX); obj_pool_used--; @@ -424,7 +431,7 @@ static void __free_object(struct debug_obj *obj) } } - if ((pool_global.cnt > debug_objects_pool_size) && + if ((pool_global.cnt > pool_global.max_cnt) && (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX)) { int i; @@ -629,13 +636,13 @@ static void debug_objects_fill_pool(void) if (unlikely(!obj_cache)) return; - if (likely(!pool_global_should_refill())) + if (likely(!pool_should_refill(&pool_global))) return; /* Try reusing objects from obj_to_free_list */ fill_pool_from_freelist(); - if (likely(!pool_global_should_refill())) + if (likely(!pool_should_refill(&pool_global))) return; /* @@ -1427,8 +1434,8 @@ void __init debug_objects_mem_init(void) * system. */ extras = num_possible_cpus() * ODEBUG_BATCH_SIZE; - debug_objects_pool_size += extras; - debug_objects_pool_min_level += extras; + pool_global.max_cnt += extras; + pool_global.min_cnt += extras; /* Everything worked. 
Expose the cache */ obj_cache = cache; From fb60c004f33e0fa2e87b9456b87f1b2709436b88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:09 +0200 Subject: [PATCH 15/25] debugobjects: Rework object allocation The current allocation scheme tries to allocate from the per CPU pool first. If that fails it allocates one object from the global pool and then refills the per CPU pool from the global pool. That is in the way of switching the pool management to batch mode as the global pool needs to be a strict stack of batches, which does not allow to allocate single objects. Rework the code to refill the per CPU pool first and then allocate the object from the refilled batch. Also try to allocate from the to free pool first to avoid freeing and reallocating objects. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.893554162@linutronix.de --- lib/debugobjects.c | 144 ++++++++++++++++++++++----------------------- 1 file changed, 69 insertions(+), 75 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fbe8f2625082..3b18d5dcbf6e 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -141,6 +141,64 @@ static __always_inline bool pool_must_refill(struct obj_pool *pool) return pool_count(pool) < pool->min_cnt / 2; } +static bool pool_move_batch(struct obj_pool *dst, struct obj_pool *src) +{ + if (dst->cnt + ODEBUG_BATCH_SIZE > dst->max_cnt || !src->cnt) + return false; + + for (int i = 0; i < ODEBUG_BATCH_SIZE && src->cnt; i++) { + struct hlist_node *node = src->objects.first; + + WRITE_ONCE(src->cnt, src->cnt - 1); + WRITE_ONCE(dst->cnt, dst->cnt + 1); + + hlist_del(node); + hlist_add_head(node, &dst->objects); + } + return true; +} + +static struct debug_obj *__alloc_object(struct hlist_head *list) +{ + struct debug_obj *obj; + + if (unlikely(!list->first)) + return NULL; + + obj = hlist_entry(list->first, typeof(*obj), node); + hlist_del(&obj->node); + return obj; +} + +static struct debug_obj *pcpu_alloc(void) +{ + struct obj_pool *pcp = this_cpu_ptr(&pool_pcpu); + + lockdep_assert_irqs_disabled(); + + for (;;) { + struct debug_obj *obj = __alloc_object(&pcp->objects); + + if (likely(obj)) { + pcp->cnt--; + return obj; + } + + guard(raw_spinlock)(&pool_lock); + if (!pool_move_batch(pcp, &pool_to_free)) { + if (!pool_move_batch(pcp, &pool_global)) + return NULL; + } + obj_pool_used += pcp->cnt; + + if (obj_pool_used > obj_pool_max_used) + obj_pool_max_used = obj_pool_used; + + if (pool_global.cnt < obj_pool_min_free) + obj_pool_min_free = pool_global.cnt; + } +} + static void free_object_list(struct hlist_head *head) { struct hlist_node *tmp; @@ -158,7 +216,6 @@ static void free_object_list(struct hlist_head *head) static void fill_pool_from_freelist(void) { static unsigned long state; - struct debug_obj *obj; /* * Reuse objs from the global obj_to_free list; they will be @@ -180,17 +237,11 @@ static void fill_pool_from_freelist(void) if (test_bit(0, &state) || test_and_set_bit(0, &state)) return; - guard(raw_spinlock)(&pool_lock); - /* - * Recheck with the lock held as the worker thread might have - * won the race and freed the global free list already. 
- */ - while (pool_to_free.cnt && (pool_global.cnt < pool_global.min_cnt)) { - obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); - hlist_del(&obj->node); - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); - hlist_add_head(&obj->node, &pool_global.objects); - WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); + /* Avoid taking the lock when there is no work to do */ + while (pool_should_refill(&pool_global) && pool_count(&pool_to_free)) { + guard(raw_spinlock)(&pool_lock); + /* Move a batch if possible */ + pool_move_batch(&pool_global, &pool_to_free); } clear_bit(0, &state); } @@ -251,74 +302,17 @@ static struct debug_obj *lookup_object(void *addr, struct debug_bucket *b) return NULL; } -/* - * Allocate a new object from the hlist - */ -static struct debug_obj *__alloc_object(struct hlist_head *list) +static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, + const struct debug_obj_descr *descr) { - struct debug_obj *obj = NULL; - - if (list->first) { - obj = hlist_entry(list->first, typeof(*obj), node); - hlist_del(&obj->node); - } - - return obj; -} - -static struct debug_obj * -alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *descr) -{ - struct obj_pool *percpu_pool = this_cpu_ptr(&pool_pcpu); struct debug_obj *obj; - if (likely(obj_cache)) { - obj = __alloc_object(&percpu_pool->objects); - if (obj) { - percpu_pool->cnt--; - goto init_obj; - } - } else { + if (likely(obj_cache)) + obj = pcpu_alloc(); + else obj = __alloc_object(&pool_boot); - goto init_obj; - } - raw_spin_lock(&pool_lock); - obj = __alloc_object(&pool_global.objects); - if (obj) { - obj_pool_used++; - WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); - - /* - * Looking ahead, allocate one batch of debug objects and - * put them into the percpu free pool. - */ - if (likely(obj_cache)) { - int i; - - for (i = 0; i < ODEBUG_BATCH_SIZE; i++) { - struct debug_obj *obj2; - - obj2 = __alloc_object(&pool_global.objects); - if (!obj2) - break; - hlist_add_head(&obj2->node, &percpu_pool->objects); - percpu_pool->cnt++; - obj_pool_used++; - WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); - } - } - - if (obj_pool_used > obj_pool_max_used) - obj_pool_max_used = obj_pool_used; - - if (pool_global.cnt < obj_pool_min_free) - obj_pool_min_free = pool_global.cnt; - } - raw_spin_unlock(&pool_lock); - -init_obj: - if (obj) { + if (likely(obj)) { obj->object = addr; obj->descr = descr; obj->state = ODEBUG_STATE_NONE; From a3b9e191f5fc11fa93176a4074a919d33d64c5fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:10 +0200 Subject: [PATCH 16/25] debugobjects: Rework object freeing __free_object() is uncomprehensibly complex. The same can be achieved by: 1) Adding the object to the per CPU pool 2) If that pool is full, move a batch of objects into the global pool or if the global pool is full into the to free pool This also prepares for batch processing. 
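A minimal, single-threaded sketch of that free path is shown below; the pool sizes are made up and the locking is omitted, whereas the real code runs with interrupts disabled and takes pool_lock around the batch move:

#include <stdbool.h>

#define BATCH_SIZE	16
#define PCPU_MAX	64

struct node { struct node *next; };

struct pool {
	struct node *head;
	unsigned int cnt;
	unsigned int max_cnt;
};

static struct pool pool_local   = { .max_cnt = PCPU_MAX };
static struct pool pool_global  = { .max_cnt = 1024 };
static struct pool pool_to_free = { .max_cnt = ~0u };

/* Move up to one batch from src to dst; fail if dst has no room for it. */
static bool pool_move_batch(struct pool *dst, struct pool *src)
{
	if (dst->cnt + BATCH_SIZE > dst->max_cnt || !src->cnt)
		return false;

	for (int i = 0; i < BATCH_SIZE && src->cnt; i++) {
		struct node *n = src->head;

		src->head = n->next;
		src->cnt--;
		n->next = dst->head;
		dst->head = n;
		dst->cnt++;
	}
	return true;
}

/* Free path: stash locally; once full, hand one batch onwards. */
static void free_obj(struct node *obj)
{
	obj->next = pool_local.head;
	pool_local.head = obj;
	pool_local.cnt++;

	if (pool_local.cnt < PCPU_MAX)
		return;

	if (!pool_move_batch(&pool_global, &pool_local))
		pool_move_batch(&pool_to_free, &pool_local);
}

int main(void)
{
	static struct node nodes[PCPU_MAX];

	for (int i = 0; i < PCPU_MAX; i++)
		free_obj(&nodes[i]);

	return pool_global.cnt == BATCH_SIZE ? 0 : 1;
}
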
Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164913.955542307@linutronix.de --- lib/debugobjects.c | 99 +++++++++++----------------------------------- 1 file changed, 24 insertions(+), 75 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 3b18d5dcbf6e..3700ddfcf76a 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -199,6 +199,27 @@ static struct debug_obj *pcpu_alloc(void) } } +static void pcpu_free(struct debug_obj *obj) +{ + struct obj_pool *pcp = this_cpu_ptr(&pool_pcpu); + + lockdep_assert_irqs_disabled(); + + hlist_add_head(&obj->node, &pcp->objects); + pcp->cnt++; + + /* Pool full ? */ + if (pcp->cnt < ODEBUG_POOL_PERCPU_SIZE) + return; + + /* Remove a batch from the per CPU pool */ + guard(raw_spinlock)(&pool_lock); + /* Try to fit the batch into the pool_global first */ + if (!pool_move_batch(&pool_global, pcp)) + pool_move_batch(&pool_to_free, pcp); + obj_pool_used -= ODEBUG_BATCH_SIZE; +} + static void free_object_list(struct hlist_head *head) { struct hlist_node *tmp; @@ -375,83 +396,11 @@ free_objs: static void __free_object(struct debug_obj *obj) { - struct debug_obj *objs[ODEBUG_BATCH_SIZE]; - struct obj_pool *percpu_pool; - int lookahead_count = 0; - bool work; - guard(irqsave)(); - - if (unlikely(!obj_cache)) { + if (likely(obj_cache)) + pcpu_free(obj); + else hlist_add_head(&obj->node, &pool_boot); - return; - } - - /* - * Try to free it into the percpu pool first. - */ - percpu_pool = this_cpu_ptr(&pool_pcpu); - if (percpu_pool->cnt < ODEBUG_POOL_PERCPU_SIZE) { - hlist_add_head(&obj->node, &percpu_pool->objects); - percpu_pool->cnt++; - return; - } - - /* - * As the percpu pool is full, look ahead and pull out a batch - * of objects from the percpu pool and free them as well. - */ - for (; lookahead_count < ODEBUG_BATCH_SIZE; lookahead_count++) { - objs[lookahead_count] = __alloc_object(&percpu_pool->objects); - if (!objs[lookahead_count]) - break; - percpu_pool->cnt--; - } - - raw_spin_lock(&pool_lock); - work = (pool_global.cnt > pool_global.max_cnt) && obj_cache && - (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX); - obj_pool_used--; - - if (work) { - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + 1); - hlist_add_head(&obj->node, &pool_to_free.objects); - if (lookahead_count) { - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + lookahead_count); - obj_pool_used -= lookahead_count; - while (lookahead_count) { - hlist_add_head(&objs[--lookahead_count]->node, - &pool_to_free.objects); - } - } - - if ((pool_global.cnt > pool_global.max_cnt) && - (pool_to_free.cnt < ODEBUG_FREE_WORK_MAX)) { - int i; - - /* - * Free one more batch of objects from obj_pool. 
- */ - for (i = 0; i < ODEBUG_BATCH_SIZE; i++) { - obj = __alloc_object(&pool_global.objects); - hlist_add_head(&obj->node, &pool_to_free.objects); - WRITE_ONCE(pool_global.cnt, pool_global.cnt - 1); - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt + 1); - } - } - } else { - WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); - hlist_add_head(&obj->node, &pool_global.objects); - if (lookahead_count) { - WRITE_ONCE(pool_global.cnt, pool_global.cnt + lookahead_count); - obj_pool_used -= lookahead_count; - while (lookahead_count) { - hlist_add_head(&objs[--lookahead_count]->node, - &pool_global.objects); - } - } - } - raw_spin_unlock(&pool_lock); } /* From 9ce99c6d7bfbca71f1e5fa34045ea48cb768f54a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:12 +0200 Subject: [PATCH 17/25] debugobjects: Rework free_object_work() Convert it to batch processing with intermediate helper functions. This reduces the final changes for batch processing. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.015906394@linutronix.de --- lib/debugobjects.c | 80 ++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 42 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 3700ddfcf76a..d5a853851482 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -35,7 +35,7 @@ * frequency of 10Hz and about 1024 objects for each freeing operation. * So it is freeing at most 10k debug objects per second. */ -#define ODEBUG_FREE_WORK_MAX 1024 +#define ODEBUG_FREE_WORK_MAX (1024 / ODEBUG_BATCH_SIZE) #define ODEBUG_FREE_WORK_DELAY DIV_ROUND_UP(HZ, 10) struct debug_bucket { @@ -158,6 +158,21 @@ static bool pool_move_batch(struct obj_pool *dst, struct obj_pool *src) return true; } +static bool pool_pop_batch(struct hlist_head *head, struct obj_pool *src) +{ + if (!src->cnt) + return false; + + for (int i = 0; src->cnt && i < ODEBUG_BATCH_SIZE; i++) { + struct hlist_node *node = src->objects.first; + + WRITE_ONCE(src->cnt, src->cnt - 1); + hlist_del(node); + hlist_add_head(node, head); + } + return true; +} + static struct debug_obj *__alloc_object(struct hlist_head *list) { struct debug_obj *obj; @@ -343,55 +358,36 @@ static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, return obj; } -/* - * workqueue function to free objects. - * - * To reduce contention on the global pool_lock, the actual freeing of - * debug objects will be delayed if the pool_lock is busy. - */ +/* workqueue function to free objects. */ static void free_obj_work(struct work_struct *work) { - struct debug_obj *obj; - unsigned long flags; - HLIST_HEAD(tofree); + bool free = true; WRITE_ONCE(obj_freeing, false); - if (!raw_spin_trylock_irqsave(&pool_lock, flags)) + + if (!pool_count(&pool_to_free)) return; - if (pool_global.cnt >= pool_global.max_cnt) - goto free_objs; + for (unsigned int cnt = 0; cnt < ODEBUG_FREE_WORK_MAX; cnt++) { + HLIST_HEAD(tofree); - /* - * The objs on the pool list might be allocated before the work is - * run, so recheck if pool list it full or not, if not fill pool - * list from the global free list. As it is likely that a workload - * may be gearing up to use more and more objects, don't free any - * of them until the next round. 
- */ - while (pool_to_free.cnt && pool_global.cnt < pool_global.max_cnt) { - obj = hlist_entry(pool_to_free.objects.first, typeof(*obj), node); - hlist_del(&obj->node); - hlist_add_head(&obj->node, &pool_global.objects); - WRITE_ONCE(pool_to_free.cnt, pool_to_free.cnt - 1); - WRITE_ONCE(pool_global.cnt, pool_global.cnt + 1); + /* Acquire and drop the lock for each batch */ + scoped_guard(raw_spinlock_irqsave, &pool_lock) { + if (!pool_to_free.cnt) + return; + + /* Refill the global pool if possible */ + if (pool_move_batch(&pool_global, &pool_to_free)) { + /* Don't free as there seems to be demand */ + free = false; + } else if (free) { + pool_pop_batch(&tofree, &pool_to_free); + } else { + return; + } + } + free_object_list(&tofree); } - raw_spin_unlock_irqrestore(&pool_lock, flags); - return; - -free_objs: - /* - * Pool list is already full and there are still objs on the free - * list. Move remaining free objs to a temporary list to free the - * memory outside the pool_lock held region. - */ - if (pool_to_free.cnt) { - hlist_move_list(&pool_to_free.objects, &tofree); - WRITE_ONCE(pool_to_free.cnt, 0); - } - raw_spin_unlock_irqrestore(&pool_lock, flags); - - free_object_list(&tofree); } static void __free_object(struct debug_obj *obj) From 14077b9e583bbafc9a02734beab99c37bff68644 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:13 +0200 Subject: [PATCH 18/25] debugobjects: Use static key for boot pool selection Get rid of the conditional in the hot path. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.077247071@linutronix.de --- lib/debugobjects.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index d5a853851482..65cce4cd9643 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -7,16 +7,16 @@ #define pr_fmt(fmt) "ODEBUG: " fmt +#include #include -#include +#include +#include +#include #include #include #include -#include #include -#include -#include -#include +#include #define ODEBUG_HASH_BITS 14 #define ODEBUG_HASH_SIZE (1 << ODEBUG_HASH_BITS) @@ -103,6 +103,8 @@ static int __data_racy debug_objects_freed; static void free_obj_work(struct work_struct *work); static DECLARE_DELAYED_WORK(debug_obj_work, free_obj_work); +static DEFINE_STATIC_KEY_FALSE(obj_cache_enabled); + static int __init enable_object_debug(char *str) { debug_objects_enabled = true; @@ -343,7 +345,7 @@ static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, { struct debug_obj *obj; - if (likely(obj_cache)) + if (static_branch_likely(&obj_cache_enabled)) obj = pcpu_alloc(); else obj = __alloc_object(&pool_boot); @@ -393,7 +395,7 @@ static void free_obj_work(struct work_struct *work) static void __free_object(struct debug_obj *obj) { guard(irqsave)(); - if (likely(obj_cache)) + if (static_branch_likely(&obj_cache_enabled)) pcpu_free(obj); else hlist_add_head(&obj->node, &pool_boot); @@ -572,7 +574,7 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket static void debug_objects_fill_pool(void) { - if (unlikely(!obj_cache)) + if (!static_branch_likely(&obj_cache_enabled)) return; if (likely(!pool_should_refill(&pool_global))) @@ -1378,6 +1380,7 @@ void __init debug_objects_mem_init(void) /* Everything worked. 
Expose the cache */ obj_cache = cache; + static_branch_enable(&obj_cache_enabled); #ifdef CONFIG_HOTPLUG_CPU cpuhp_setup_state_nocalls(CPUHP_DEBUG_OBJ_DEAD, "object:offline", NULL, From 74fe1ad4132234f04fcc75e16600449496a67b5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:14 +0200 Subject: [PATCH 19/25] debugobjects: Prepare for batching Move the debug_obj::object pointer into a union and add a pointer to the last node in a batch. That allows to implement batch processing efficiently by utilizing the stack property of hlist: When the first object of a batch is added to the list, then the batch pointer is set to the hlist node of the object itself. Any subsequent add retrieves the pointer to the last node from the first object in the list and uses that for storing the last node pointer in the newly added object. Add the pointer to the data structure and ensure that all relevant pool sizes are strictly batch sized. The actual batching implementation follows in subsequent changes. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.139204961@linutronix.de --- include/linux/debugobjects.h | 12 ++++++++---- lib/debugobjects.c | 10 +++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 32444686b6ff..8b95545e7924 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -23,13 +23,17 @@ struct debug_obj_descr; * @state: tracked object state * @astate: current active state * @object: pointer to the real object + * @batch_last: pointer to the last hlist node in a batch * @descr: pointer to an object type specific debug description structure */ struct debug_obj { - struct hlist_node node; - enum debug_obj_state state; - unsigned int astate; - void *object; + struct hlist_node node; + enum debug_obj_state state; + unsigned int astate; + union { + void *object; + struct hlist_node *batch_last; + }; const struct debug_obj_descr *descr; }; diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 65cce4cd9643..2fed7b863827 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -21,11 +21,15 @@ #define ODEBUG_HASH_BITS 14 #define ODEBUG_HASH_SIZE (1 << ODEBUG_HASH_BITS) -#define ODEBUG_POOL_SIZE 1024 -#define ODEBUG_POOL_MIN_LEVEL 256 -#define ODEBUG_POOL_PERCPU_SIZE 64 +/* Must be power of two */ #define ODEBUG_BATCH_SIZE 16 +/* Initial values. Must all be a multiple of batch size */ +#define ODEBUG_POOL_SIZE (64 * ODEBUG_BATCH_SIZE) +#define ODEBUG_POOL_MIN_LEVEL (ODEBUG_POOL_SIZE / 4) + +#define ODEBUG_POOL_PERCPU_SIZE (4 * ODEBUG_BATCH_SIZE) + #define ODEBUG_CHUNK_SHIFT PAGE_SHIFT #define ODEBUG_CHUNK_SIZE (1 << ODEBUG_CHUNK_SHIFT) #define ODEBUG_CHUNK_MASK (~(ODEBUG_CHUNK_SIZE - 1)) From aebbfe0779b271c099cc80c5e2995c2087b28dcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:15 +0200 Subject: [PATCH 20/25] debugobjects: Prepare kmem_cache allocations for batching Allocate a batch and then push it into the pool. Utilize the debug_obj::last_node pointer for keeping track of the batch boundary. 
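The batch boundary trick described above can be illustrated with a small standalone sketch. It uses a plain singly linked stack instead of an hlist, and BATCH_SIZE and the node layout are illustrative rather than the kernel structures; the point is only how the first node of each batch carries a pointer to the batch's last node so a whole batch can be detached in O(1).

  #include <stddef.h>
  #include <stdio.h>

  #define BATCH_SIZE 4

  struct node {
  	struct node *next;
  	struct node *batch_last;	/* last node of the batch this head starts */
  	int id;
  };

  /* Push @n; @cnt is the number of nodes already on the stack. The first
   * node of a batch points to itself, later nodes inherit the pointer. */
  static void push(struct node **head, struct node *n, unsigned int cnt)
  {
  	if (!(cnt % BATCH_SIZE))
  		n->batch_last = n;		/* batch boundary: new batch starts here */
  	else
  		n->batch_last = (*head)->batch_last;

  	n->next = *head;
  	*head = n;
  }

  /* Detach the top-most batch (complete in this example) in O(1) */
  static struct node *pop_batch(struct node **head)
  {
  	struct node *batch = *head;

  	if (!batch)
  		return NULL;
  	*head = batch->batch_last->next;	/* skip the whole batch */
  	batch->batch_last->next = NULL;		/* terminate the detached batch */
  	return batch;
  }

  int main(void)
  {
  	struct node nodes[8], *head = NULL, *batch;

  	for (unsigned int i = 0; i < 8; i++) {
  		nodes[i].id = i;
  		push(&head, &nodes[i], i);
  	}

  	batch = pop_batch(&head);		/* detaches nodes 7..4 */
  	for (struct node *n = batch; n; n = n->next)
  		printf("%d ", n->id);
  	printf("\n");
  	return 0;
  }

Because pushes always go to the head, the list behaves as a strict stack, so the first object added to a batch is guaranteed to end up as that batch's last node, and pop_batch() detaches the most recently completed batch without walking it.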
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241007164914.198647184@linutronix.de --- lib/debugobjects.c | 80 ++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2fed7b863827..cdd5d238eea7 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -164,6 +164,22 @@ static bool pool_move_batch(struct obj_pool *dst, struct obj_pool *src) return true; } +static bool pool_push_batch(struct obj_pool *dst, struct hlist_head *head) +{ + struct hlist_node *last; + struct debug_obj *obj; + + if (dst->cnt >= dst->max_cnt) + return false; + + obj = hlist_entry(head->first, typeof(*obj), node); + last = obj->batch_last; + + hlist_splice_init(head, last, &dst->objects); + WRITE_ONCE(dst->cnt, dst->cnt + ODEBUG_BATCH_SIZE); + return true; +} + static bool pool_pop_batch(struct hlist_head *head, struct obj_pool *src) { if (!src->cnt) @@ -288,6 +304,28 @@ static void fill_pool_from_freelist(void) clear_bit(0, &state); } +static bool kmem_alloc_batch(struct hlist_head *head, struct kmem_cache *cache, gfp_t gfp) +{ + struct hlist_node *last = NULL; + struct debug_obj *obj; + + for (int cnt = 0; cnt < ODEBUG_BATCH_SIZE; cnt++) { + obj = kmem_cache_zalloc(cache, gfp); + if (!obj) { + free_object_list(head); + return false; + } + debug_objects_allocated++; + + if (!last) + last = &obj->node; + obj->batch_last = last; + + hlist_add_head(&obj->node, head); + } + return true; +} + static void fill_pool(void) { static atomic_t cpus_allocating; @@ -302,25 +340,14 @@ static void fill_pool(void) atomic_inc(&cpus_allocating); while (pool_should_refill(&pool_global)) { - struct debug_obj *new, *last = NULL; HLIST_HEAD(head); - int cnt; - for (cnt = 0; cnt < ODEBUG_BATCH_SIZE; cnt++) { - new = kmem_cache_zalloc(obj_cache, __GFP_HIGH | __GFP_NOWARN); - if (!new) - break; - hlist_add_head(&new->node, &head); - if (!last) - last = new; - } - if (!cnt) + if (!kmem_alloc_batch(&head, obj_cache, __GFP_HIGH | __GFP_NOWARN)) break; guard(raw_spinlock_irqsave)(&pool_lock); - hlist_splice_init(&head, &last->node, &pool_global.objects); - debug_objects_allocated += cnt; - WRITE_ONCE(pool_global.cnt, pool_global.cnt + cnt); + if (!pool_push_batch(&pool_global, &head)) + pool_push_batch(&pool_to_free, &head); } atomic_dec(&cpus_allocating); } @@ -1302,26 +1329,18 @@ void __init debug_objects_early_init(void) static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache) { struct debug_bucket *db = obj_hash; - struct debug_obj *obj, *new; struct hlist_node *tmp; + struct debug_obj *obj; HLIST_HEAD(objects); int i; - for (i = 0; i < ODEBUG_POOL_SIZE; i++) { - obj = kmem_cache_zalloc(cache, GFP_KERNEL); - if (!obj) + for (i = 0; i < ODEBUG_POOL_SIZE; i += ODEBUG_BATCH_SIZE) { + if (!kmem_alloc_batch(&objects, cache, GFP_KERNEL)) goto free; - hlist_add_head(&obj->node, &objects); + pool_push_batch(&pool_global, &objects); } - debug_objects_allocated = ODEBUG_POOL_SIZE; - pool_global.cnt = ODEBUG_POOL_SIZE; - - /* - * Move the allocated objects to the global pool and disconnect the - * boot pool. - */ - hlist_move_list(&objects, &pool_global.objects); + /* Disconnect the boot pool. 
*/ pool_boot.first = NULL; /* Replace the active object references */ @@ -1329,9 +1348,8 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache hlist_move_list(&db->list, &objects); hlist_for_each_entry(obj, &objects, node) { - new = hlist_entry(pool_global.objects.first, typeof(*obj), node); - hlist_del(&new->node); - pool_global.cnt--; + struct debug_obj *new = pcpu_alloc(); + /* copy object data */ *new = *obj; hlist_add_head(&new->node, &db->list); @@ -1340,7 +1358,7 @@ static bool __init debug_objects_replace_static_objects(struct kmem_cache *cache return true; free: /* Can't use free_object_list() as the cache is not populated yet */ - hlist_for_each_entry_safe(obj, tmp, &objects, node) { + hlist_for_each_entry_safe(obj, tmp, &pool_global.objects, node) { hlist_del(&obj->node); kmem_cache_free(cache, obj); } From f57ebb92ba3e09a7e1082f147d6e1456d702d4b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:17 +0200 Subject: [PATCH 21/25] debugobjects: Implement batch processing Adding and removing single objects in a loop is bad in terms of lock contention and cache line accesses. To implement batching, record the last object in a batch in the object itself. This is trivialy possible as hlists are strictly stacks. At a batch boundary, when the first object is added to the list the object stores a pointer to itself in debug_obj::batch_last. When the next object is added to the list then the batch_last pointer is retrieved from the first object in the list and stored in the to be added one. That means for batch processing the first object always has a pointer to the last object in a batch, which allows to move batches in a cache line efficient way and reduces the lock held time. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.258995000@linutronix.de --- lib/debugobjects.c | 61 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index cdd5d238eea7..4e80c31f4349 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -149,18 +149,31 @@ static __always_inline bool pool_must_refill(struct obj_pool *pool) static bool pool_move_batch(struct obj_pool *dst, struct obj_pool *src) { - if (dst->cnt + ODEBUG_BATCH_SIZE > dst->max_cnt || !src->cnt) + struct hlist_node *last, *next_batch, *first_batch; + struct debug_obj *obj; + + if (dst->cnt >= dst->max_cnt || !src->cnt) return false; - for (int i = 0; i < ODEBUG_BATCH_SIZE && src->cnt; i++) { - struct hlist_node *node = src->objects.first; + first_batch = src->objects.first; + obj = hlist_entry(first_batch, typeof(*obj), node); + last = obj->batch_last; + next_batch = last->next; - WRITE_ONCE(src->cnt, src->cnt - 1); - WRITE_ONCE(dst->cnt, dst->cnt + 1); + /* Move the next batch to the front of the source pool */ + src->objects.first = next_batch; + if (next_batch) + next_batch->pprev = &src->objects.first; - hlist_del(node); - hlist_add_head(node, &dst->objects); - } + /* Add the extracted batch to the destination pool */ + last->next = dst->objects.first; + if (last->next) + last->next->pprev = &last->next; + first_batch->pprev = &dst->objects.first; + dst->objects.first = first_batch; + + WRITE_ONCE(src->cnt, src->cnt - ODEBUG_BATCH_SIZE); + WRITE_ONCE(dst->cnt, dst->cnt + ODEBUG_BATCH_SIZE); return true; } @@ -182,16 +195,27 @@ static bool pool_push_batch(struct obj_pool *dst, struct hlist_head *head) static bool pool_pop_batch(struct 
hlist_head *head, struct obj_pool *src) { + struct hlist_node *last, *next; + struct debug_obj *obj; + if (!src->cnt) return false; - for (int i = 0; src->cnt && i < ODEBUG_BATCH_SIZE; i++) { - struct hlist_node *node = src->objects.first; + /* Move the complete list to the head */ + hlist_move_list(&src->objects, head); - WRITE_ONCE(src->cnt, src->cnt - 1); - hlist_del(node); - hlist_add_head(node, head); - } + obj = hlist_entry(head->first, typeof(*obj), node); + last = obj->batch_last; + next = last->next; + /* Disconnect the batch from the list */ + last->next = NULL; + + /* Move the node after last back to the source pool. */ + src->objects.first = next; + if (next) + next->pprev = &src->objects.first; + + WRITE_ONCE(src->cnt, src->cnt - ODEBUG_BATCH_SIZE); return true; } @@ -226,7 +250,7 @@ static struct debug_obj *pcpu_alloc(void) if (!pool_move_batch(pcp, &pool_global)) return NULL; } - obj_pool_used += pcp->cnt; + obj_pool_used += ODEBUG_BATCH_SIZE; if (obj_pool_used > obj_pool_max_used) obj_pool_max_used = obj_pool_used; @@ -239,9 +263,16 @@ static struct debug_obj *pcpu_alloc(void) static void pcpu_free(struct debug_obj *obj) { struct obj_pool *pcp = this_cpu_ptr(&pool_pcpu); + struct debug_obj *first; lockdep_assert_irqs_disabled(); + if (!(pcp->cnt % ODEBUG_BATCH_SIZE)) { + obj->batch_last = &obj->node; + } else { + first = hlist_entry(pcp->objects.first, typeof(*first), node); + obj->batch_last = first->batch_last; + } hlist_add_head(&obj->node, &pcp->objects); pcp->cnt++; From 2638345d22529cdc964d20a617bfd32d87f27e0f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:18 +0200 Subject: [PATCH 22/25] debugobjects: Move pool statistics into global_pool struct Keep it along with the pool as that's a hot cache line anyway and it makes the code more comprehensible. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.318776207@linutronix.de --- lib/debugobjects.c | 85 +++++++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 4e80c31f4349..cf704e2bc301 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -47,11 +47,18 @@ struct debug_bucket { raw_spinlock_t lock; }; +struct pool_stats { + unsigned int cur_used; + unsigned int max_used; + unsigned int min_fill; +}; + struct obj_pool { struct hlist_head objects; unsigned int cnt; unsigned int min_cnt; unsigned int max_cnt; + struct pool_stats stats; } ____cacheline_aligned; @@ -66,8 +73,11 @@ static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE] __initdata; static DEFINE_RAW_SPINLOCK(pool_lock); static struct obj_pool pool_global = { - .min_cnt = ODEBUG_POOL_MIN_LEVEL, - .max_cnt = ODEBUG_POOL_SIZE, + .min_cnt = ODEBUG_POOL_MIN_LEVEL, + .max_cnt = ODEBUG_POOL_SIZE, + .stats = { + .min_fill = ODEBUG_POOL_SIZE, + }, }; static struct obj_pool pool_to_free = { @@ -76,16 +86,6 @@ static struct obj_pool pool_to_free = { static HLIST_HEAD(pool_boot); -/* - * Because of the presence of percpu free pools, obj_pool_free will - * under-count those in the percpu free pools. Similarly, obj_pool_used - * will over-count those in the percpu free pools. Adjustments will be - * made at debug_stats_show(). Both obj_pool_min_free and obj_pool_max_used - * can be off. 
- */ -static int __data_racy obj_pool_min_free = ODEBUG_POOL_SIZE; -static int obj_pool_used; -static int __data_racy obj_pool_max_used; static bool obj_freeing; static int __data_racy debug_objects_maxchain __read_mostly; @@ -231,6 +231,19 @@ static struct debug_obj *__alloc_object(struct hlist_head *list) return obj; } +static void pcpu_refill_stats(void) +{ + struct pool_stats *stats = &pool_global.stats; + + WRITE_ONCE(stats->cur_used, stats->cur_used + ODEBUG_BATCH_SIZE); + + if (stats->cur_used > stats->max_used) + stats->max_used = stats->cur_used; + + if (pool_global.cnt < stats->min_fill) + stats->min_fill = pool_global.cnt; +} + static struct debug_obj *pcpu_alloc(void) { struct obj_pool *pcp = this_cpu_ptr(&pool_pcpu); @@ -250,13 +263,7 @@ static struct debug_obj *pcpu_alloc(void) if (!pool_move_batch(pcp, &pool_global)) return NULL; } - obj_pool_used += ODEBUG_BATCH_SIZE; - - if (obj_pool_used > obj_pool_max_used) - obj_pool_max_used = obj_pool_used; - - if (pool_global.cnt < obj_pool_min_free) - obj_pool_min_free = pool_global.cnt; + pcpu_refill_stats(); } } @@ -285,7 +292,7 @@ static void pcpu_free(struct debug_obj *obj) /* Try to fit the batch into the pool_global first */ if (!pool_move_batch(&pool_global, pcp)) pool_move_batch(&pool_to_free, pcp); - obj_pool_used -= ODEBUG_BATCH_SIZE; + WRITE_ONCE(pool_global.stats.cur_used, pool_global.stats.cur_used - ODEBUG_BATCH_SIZE); } static void free_object_list(struct hlist_head *head) @@ -1074,23 +1081,33 @@ void debug_check_no_obj_freed(const void *address, unsigned long size) static int debug_stats_show(struct seq_file *m, void *v) { - int cpu, obj_percpu_free = 0; + unsigned int cpu, pool_used, pcp_free = 0; + /* + * pool_global.stats.cur_used is the number of batches currently + * handed out to per CPU pools. Convert it to number of objects + * and subtract the number of free objects in the per CPU pools. + * As this is lockless the number is an estimate. 
+ */ for_each_possible_cpu(cpu) - obj_percpu_free += per_cpu(pool_pcpu.cnt, cpu); + pcp_free += per_cpu(pool_pcpu.cnt, cpu); - seq_printf(m, "max_chain :%d\n", debug_objects_maxchain); - seq_printf(m, "max_checked :%d\n", debug_objects_maxchecked); - seq_printf(m, "warnings :%d\n", debug_objects_warnings); - seq_printf(m, "fixups :%d\n", debug_objects_fixups); - seq_printf(m, "pool_free :%d\n", pool_count(&pool_global) + obj_percpu_free); - seq_printf(m, "pool_pcp_free :%d\n", obj_percpu_free); - seq_printf(m, "pool_min_free :%d\n", obj_pool_min_free); - seq_printf(m, "pool_used :%d\n", obj_pool_used - obj_percpu_free); - seq_printf(m, "pool_max_used :%d\n", obj_pool_max_used); - seq_printf(m, "on_free_list :%d\n", pool_count(&pool_to_free)); - seq_printf(m, "objs_allocated:%d\n", debug_objects_allocated); - seq_printf(m, "objs_freed :%d\n", debug_objects_freed); + pool_used = data_race(pool_global.stats.cur_used); + pcp_free = min(pool_used, pcp_free); + pool_used -= pcp_free; + + seq_printf(m, "max_chain : %d\n", debug_objects_maxchain); + seq_printf(m, "max_checked : %d\n", debug_objects_maxchecked); + seq_printf(m, "warnings : %d\n", debug_objects_warnings); + seq_printf(m, "fixups : %d\n", debug_objects_fixups); + seq_printf(m, "pool_free : %u\n", pool_count(&pool_global) + pcp_free); + seq_printf(m, "pool_pcp_free : %u\n", pcp_free); + seq_printf(m, "pool_min_free : %u\n", data_race(pool_global.stats.min_fill)); + seq_printf(m, "pool_used : %u\n", pool_used); + seq_printf(m, "pool_max_used : %u\n", data_race(pool_global.stats.max_used)); + seq_printf(m, "on_free_list : %u\n", pool_count(&pool_to_free)); + seq_printf(m, "objs_allocated: %d\n", debug_objects_allocated); + seq_printf(m, "objs_freed : %d\n", debug_objects_freed); return 0; } DEFINE_SHOW_ATTRIBUTE(debug_stats); From a201a96b9682e5b42ed93108c4aeb6135c909661 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:19 +0200 Subject: [PATCH 23/25] debugobjects: Double the per CPU slots In situations where objects are rapidly allocated from the pool and handed back, the size of the per CPU pool turns out to be too small. Double the size of the per CPU pool. This reduces the kmem cache allocation and free operations during a kernel compile: alloc free Baseline: 380k 330k Double size: 295k 245k Especially the reduction of allocations is important because that happens in the hot path when objects are initialized. The maximum increase in per CPU pool memory consumption is about 2.5K per online CPU, which is acceptable. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.378676302@linutronix.de --- lib/debugobjects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index cf704e2bc301..fc9397de5534 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -28,7 +28,7 @@ #define ODEBUG_POOL_SIZE (64 * ODEBUG_BATCH_SIZE) #define ODEBUG_POOL_MIN_LEVEL (ODEBUG_POOL_SIZE / 4) -#define ODEBUG_POOL_PERCPU_SIZE (4 * ODEBUG_BATCH_SIZE) +#define ODEBUG_POOL_PERCPU_SIZE (8 * ODEBUG_BATCH_SIZE) #define ODEBUG_CHUNK_SHIFT PAGE_SHIFT #define ODEBUG_CHUNK_SIZE (1 << ODEBUG_CHUNK_SHIFT) From 13f9ca723900ae3ae8e0a1e76ba86e7786e60645 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:20 +0200 Subject: [PATCH 24/25] debugobjects: Refill per CPU pool more agressively Right now the per CPU pools are only refilled when they become empty. 
That's suboptimal especially when there are still non-freed objects in the to free list. Check whether an allocation from the per CPU pool emptied a batch and try to allocate from the free pool if that still has objects available. kmem_cache_alloc() kmem_cache_free() Baseline: 295k 245k Refill: 225k 173k Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.439053085@linutronix.de --- lib/debugobjects.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fc9397de5534..cc32844ca18d 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -255,6 +255,24 @@ static struct debug_obj *pcpu_alloc(void) if (likely(obj)) { pcp->cnt--; + /* + * If this emptied a batch try to refill from the + * free pool. Don't do that if this was the top-most + * batch as pcpu_free() expects the per CPU pool + * to be less than ODEBUG_POOL_PERCPU_SIZE. + */ + if (unlikely(pcp->cnt < (ODEBUG_POOL_PERCPU_SIZE - ODEBUG_BATCH_SIZE) && + !(pcp->cnt % ODEBUG_BATCH_SIZE))) { + /* + * Don't try to allocate from the regular pool here + * to not exhaust it prematurely. + */ + if (pool_count(&pool_to_free)) { + guard(raw_spinlock)(&pool_lock); + pool_move_batch(pcp, &pool_to_free); + pcpu_refill_stats(); + } + } return obj; } From ff8d523cc4520a5ce86cde0fd57c304e2b4f61b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 Oct 2024 20:45:57 +0200 Subject: [PATCH 25/25] debugobjects: Track object usage to avoid premature freeing of objects The freelist is freed at a constant rate independent of the actual usage requirements. That's bad in scenarios where usage comes in bursts. The end of a burst puts the objects on the free list and freeing proceeds even when the next burst which requires objects started again. Keep track of the usage with a exponentially wheighted moving average and take that into account in the worker function which frees objects from the free list. 
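The averaging itself can be modelled with a conventional fixed-point exponentially weighted moving average; the sketch below is standalone and its FSHIFT/EXP values are illustrative, not the exact constants or the calc_load() helper used by the patch.

  #include <stdio.h>

  #define FSHIFT	11
  #define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed point */
  #define EXP	1884			/* decay factor ~0.92 per sample, fixed point */

  /* avg' = avg * decay + sample * (1 - decay), computed in fixed point */
  static unsigned long ewma(unsigned long avg, unsigned long sample)
  {
  	return (avg * EXP + sample * (FIXED_1 - EXP)) / FIXED_1;
  }

  int main(void)
  {
  	/* A usage burst followed by idle periods: the average decays only
  	 * gradually, so a freeing worker driven by it holds back for a while
  	 * after the burst instead of immediately releasing all objects. */
  	unsigned long samples[] = { 800, 800, 800, 0, 0, 0, 0, 0 };
  	unsigned long avg = 0;

  	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
  		avg = ewma(avg, samples[i] * FIXED_1);
  		printf("sample=%4lu avg=%4lu\n", samples[i], avg / FIXED_1);
  	}
  	return 0;
  }

The slowly decaying average after the burst is the signal the worker function can use to keep objects on the free list while demand is likely to return.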
This further reduces the kmem_cache allocation/free rate for a full kernel compile: kmem_cache_alloc() kmem_cache_free() Baseline: 225k 173k Usage: 170k 117k Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/87bjznhme2.ffs@tglx --- lib/debugobjects.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index cc32844ca18d..7f50c4480a4e 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,7 @@ static struct obj_pool pool_to_free = { static HLIST_HEAD(pool_boot); +static unsigned long avg_usage; static bool obj_freeing; static int __data_racy debug_objects_maxchain __read_mostly; @@ -427,11 +429,31 @@ static struct debug_obj *lookup_object(void *addr, struct debug_bucket *b) return NULL; } +static void calc_usage(void) +{ + static DEFINE_RAW_SPINLOCK(avg_lock); + static unsigned long avg_period; + unsigned long cur, now = jiffies; + + if (!time_after_eq(now, READ_ONCE(avg_period))) + return; + + if (!raw_spin_trylock(&avg_lock)) + return; + + WRITE_ONCE(avg_period, now + msecs_to_jiffies(10)); + cur = READ_ONCE(pool_global.stats.cur_used) * ODEBUG_FREE_WORK_MAX; + WRITE_ONCE(avg_usage, calc_load(avg_usage, EXP_5, cur)); + raw_spin_unlock(&avg_lock); +} + static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *descr) { struct debug_obj *obj; + calc_usage(); + if (static_branch_likely(&obj_cache_enabled)) obj = pcpu_alloc(); else @@ -450,14 +472,26 @@ static struct debug_obj *alloc_object(void *addr, struct debug_bucket *b, /* workqueue function to free objects. */ static void free_obj_work(struct work_struct *work) { - bool free = true; + static unsigned long last_use_avg; + unsigned long cur_used, last_used, delta; + unsigned int max_free = 0; WRITE_ONCE(obj_freeing, false); + /* Rate limit freeing based on current use average */ + cur_used = READ_ONCE(avg_usage); + last_used = last_use_avg; + last_use_avg = cur_used; + if (!pool_count(&pool_to_free)) return; - for (unsigned int cnt = 0; cnt < ODEBUG_FREE_WORK_MAX; cnt++) { + if (cur_used <= last_used) { + delta = (last_used - cur_used) / ODEBUG_FREE_WORK_MAX; + max_free = min(delta, ODEBUG_FREE_WORK_MAX); + } + + for (int cnt = 0; cnt < ODEBUG_FREE_WORK_MAX; cnt++) { HLIST_HEAD(tofree); /* Acquire and drop the lock for each batch */ @@ -468,9 +502,10 @@ static void free_obj_work(struct work_struct *work) /* Refill the global pool if possible */ if (pool_move_batch(&pool_global, &pool_to_free)) { /* Don't free as there seems to be demand */ - free = false; - } else if (free) { + max_free = 0; + } else if (max_free) { pool_pop_batch(&tofree, &pool_to_free); + max_free--; } else { return; } @@ -1110,7 +1145,7 @@ static int debug_stats_show(struct seq_file *m, void *v) for_each_possible_cpu(cpu) pcp_free += per_cpu(pool_pcpu.cnt, cpu); - pool_used = data_race(pool_global.stats.cur_used); + pool_used = READ_ONCE(pool_global.stats.cur_used); pcp_free = min(pool_used, pcp_free); pool_used -= pcp_free;