mm: make counting of list_lru_one::nr_items lockless
During the reclaiming slab of a memcg, shrink_slab iterates over all registered shrinkers in the system, and tries to count and consume objects related to the cgroup. In case of memory pressure, this behaves bad: I observe high system time and time spent in list_lru_count_one() for many processes on RHEL7 kernel. This patch makes list_lru_node::memcg_lrus rcu protected, that allows to skip taking spinlock in list_lru_count_one(). Shakeel Butt with the patch observes significant perf graph change. He says: ======================================================================== Setup: running a fork-bomb in a memcg of 200MiB on a 8GiB and 4 vcpu VM and recording the trace with 'perf record -g -a'. The trace without the patch: + 34.19% fb.sh [kernel.kallsyms] [k] queued_spin_lock_slowpath + 30.77% fb.sh [kernel.kallsyms] [k] _raw_spin_lock + 3.53% fb.sh [kernel.kallsyms] [k] list_lru_count_one + 2.26% fb.sh [kernel.kallsyms] [k] super_cache_count + 1.68% fb.sh [kernel.kallsyms] [k] shrink_slab + 0.59% fb.sh [kernel.kallsyms] [k] down_read_trylock + 0.48% fb.sh [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore + 0.38% fb.sh [kernel.kallsyms] [k] shrink_node_memcg + 0.32% fb.sh [kernel.kallsyms] [k] queue_work_on + 0.26% fb.sh [kernel.kallsyms] [k] count_shadow_nodes With the patch: + 0.16% swapper [kernel.kallsyms] [k] default_idle + 0.13% oom_reaper [kernel.kallsyms] [k] mutex_spin_on_owner + 0.05% perf [kernel.kallsyms] [k] copy_user_generic_string + 0.05% init.real [kernel.kallsyms] [k] wait_consider_task + 0.05% kworker/0:0 [kernel.kallsyms] [k] finish_task_switch + 0.04% kworker/2:1 [kernel.kallsyms] [k] finish_task_switch + 0.04% kworker/3:1 [kernel.kallsyms] [k] finish_task_switch + 0.04% kworker/1:0 [kernel.kallsyms] [k] finish_task_switch + 0.03% binary [kernel.kallsyms] [k] copy_page ======================================================================== Thanks Shakeel for the testing. [ktkhai@virtuozzo.com: v2] Link: http://lkml.kernel.org/r/151203869520.3915.2587549826865799173.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/150583358557.26700.8490036563698102569.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com> Tested-by: Shakeel Butt <shakeelb@google.com> Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									f5c754d63d
								
							
						
					
					
						commit
						0c7c1bed7e
					
				| @ -32,6 +32,7 @@ struct list_lru_one { | ||||
| }; | ||||
| 
 | ||||
| struct list_lru_memcg { | ||||
| 	struct rcu_head		rcu; | ||||
| 	/* array of per cgroup lists, indexed by memcg_cache_id */ | ||||
| 	struct list_lru_one	*lru[0]; | ||||
| }; | ||||
| @ -43,7 +44,7 @@ struct list_lru_node { | ||||
| 	struct list_lru_one	lru; | ||||
| #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) | ||||
| 	/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ | ||||
| 	struct list_lru_memcg	*memcg_lrus; | ||||
| 	struct list_lru_memcg	__rcu *memcg_lrus; | ||||
| #endif | ||||
| 	long nr_items; | ||||
| } ____cacheline_aligned_in_smp; | ||||
|  | ||||
| @ -52,14 +52,15 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru) | ||||
| static inline struct list_lru_one * | ||||
| list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) | ||||
| { | ||||
| 	struct list_lru_memcg *memcg_lrus; | ||||
| 	/*
 | ||||
| 	 * The lock protects the array of per cgroup lists from relocation | ||||
| 	 * (see memcg_update_list_lru_node). | ||||
| 	 * Either lock or RCU protects the array of per cgroup lists | ||||
| 	 * from relocation (see memcg_update_list_lru_node). | ||||
| 	 */ | ||||
| 	lockdep_assert_held(&nlru->lock); | ||||
| 	if (nlru->memcg_lrus && idx >= 0) | ||||
| 		return nlru->memcg_lrus->lru[idx]; | ||||
| 
 | ||||
| 	memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, | ||||
| 					   lockdep_is_held(&nlru->lock)); | ||||
| 	if (memcg_lrus && idx >= 0) | ||||
| 		return memcg_lrus->lru[idx]; | ||||
| 	return &nlru->lru; | ||||
| } | ||||
| 
 | ||||
| @ -168,10 +169,10 @@ static unsigned long __list_lru_count_one(struct list_lru *lru, | ||||
| 	struct list_lru_one *l; | ||||
| 	unsigned long count; | ||||
| 
 | ||||
| 	spin_lock(&nlru->lock); | ||||
| 	rcu_read_lock(); | ||||
| 	l = list_lru_from_memcg_idx(nlru, memcg_idx); | ||||
| 	count = l->nr_items; | ||||
| 	spin_unlock(&nlru->lock); | ||||
| 	rcu_read_unlock(); | ||||
| 
 | ||||
| 	return count; | ||||
| } | ||||
| @ -324,24 +325,41 @@ fail: | ||||
| 
 | ||||
| static int memcg_init_list_lru_node(struct list_lru_node *nlru) | ||||
| { | ||||
| 	struct list_lru_memcg *memcg_lrus; | ||||
| 	int size = memcg_nr_cache_ids; | ||||
| 
 | ||||
| 	nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL); | ||||
| 	if (!nlru->memcg_lrus) | ||||
| 	memcg_lrus = kvmalloc(sizeof(*memcg_lrus) + | ||||
| 			      size * sizeof(void *), GFP_KERNEL); | ||||
| 	if (!memcg_lrus) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { | ||||
| 		kvfree(nlru->memcg_lrus); | ||||
| 	if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) { | ||||
| 		kvfree(memcg_lrus); | ||||
| 		return -ENOMEM; | ||||
| 	} | ||||
| 	RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) | ||||
| { | ||||
| 	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); | ||||
| 	kvfree(nlru->memcg_lrus); | ||||
| 	struct list_lru_memcg *memcg_lrus; | ||||
| 	/*
 | ||||
| 	 * This is called when shrinker has already been unregistered, | ||||
| 	 * and nobody can use it. So, there is no need to use kvfree_rcu(). | ||||
| 	 */ | ||||
| 	memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); | ||||
| 	__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); | ||||
| 	kvfree(memcg_lrus); | ||||
| } | ||||
| 
 | ||||
| static void kvfree_rcu(struct rcu_head *head) | ||||
| { | ||||
| 	struct list_lru_memcg *mlru; | ||||
| 
 | ||||
| 	mlru = container_of(head, struct list_lru_memcg, rcu); | ||||
| 	kvfree(mlru); | ||||
| } | ||||
| 
 | ||||
| static int memcg_update_list_lru_node(struct list_lru_node *nlru, | ||||
| @ -351,8 +369,9 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, | ||||
| 
 | ||||
| 	BUG_ON(old_size > new_size); | ||||
| 
 | ||||
| 	old = nlru->memcg_lrus; | ||||
| 	new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL); | ||||
| 	old = rcu_dereference_protected(nlru->memcg_lrus, | ||||
| 					lockdep_is_held(&list_lrus_mutex)); | ||||
| 	new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); | ||||
| 	if (!new) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| @ -361,29 +380,33 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, | ||||
| 		return -ENOMEM; | ||||
| 	} | ||||
| 
 | ||||
| 	memcpy(new, old, old_size * sizeof(void *)); | ||||
| 	memcpy(&new->lru, &old->lru, old_size * sizeof(void *)); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The lock guarantees that we won't race with a reader | ||||
| 	 * (see list_lru_from_memcg_idx). | ||||
| 	 * The locking below allows readers that hold nlru->lock avoid taking | ||||
| 	 * rcu_read_lock (see list_lru_from_memcg_idx). | ||||
| 	 * | ||||
| 	 * Since list_lru_{add,del} may be called under an IRQ-safe lock, | ||||
| 	 * we have to use IRQ-safe primitives here to avoid deadlock. | ||||
| 	 */ | ||||
| 	spin_lock_irq(&nlru->lock); | ||||
| 	nlru->memcg_lrus = new; | ||||
| 	rcu_assign_pointer(nlru->memcg_lrus, new); | ||||
| 	spin_unlock_irq(&nlru->lock); | ||||
| 
 | ||||
| 	kvfree(old); | ||||
| 	call_rcu(&old->rcu, kvfree_rcu); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, | ||||
| 					      int old_size, int new_size) | ||||
| { | ||||
| 	struct list_lru_memcg *memcg_lrus; | ||||
| 
 | ||||
| 	memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, | ||||
| 					       lockdep_is_held(&list_lrus_mutex)); | ||||
| 	/* do not bother shrinking the array back to the old size, because we
 | ||||
| 	 * cannot handle allocation failures here */ | ||||
| 	__memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); | ||||
| 	__memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size); | ||||
| } | ||||
| 
 | ||||
| static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user