From 11fb479ff5d9872ddff02dd533c16d60372c86b2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 5 Dec 2020 22:14:38 -0800 Subject: [PATCH 01/12] zlib: export S390 symbols for zlib modules Fix build errors when ZLIB_INFLATE=m and ZLIB_DEFLATE=m and ZLIB_DFLTCC=y by exporting the 2 needed symbols in dfltcc_inflate.c. Fixes these build errors: ERROR: modpost: "dfltcc_inflate" [lib/zlib_inflate/zlib_inflate.ko] undefined! ERROR: modpost: "dfltcc_can_inflate" [lib/zlib_inflate/zlib_inflate.ko] undefined! Fixes: 126196100063 ("lib/zlib: add s390 hardware support for kernel zlib_inflate") Reported-by: kernel test robot Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Acked-by: Ilya Leoshkevich Cc: Mikhail Zaslonko Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Link: https://lkml.kernel.org/r/20201123191712.4882-1-rdunlap@infradead.org Signed-off-by: Linus Torvalds --- lib/zlib_dfltcc/dfltcc_inflate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/zlib_dfltcc/dfltcc_inflate.c b/lib/zlib_dfltcc/dfltcc_inflate.c index aa9ef23474df..db107016d29b 100644 --- a/lib/zlib_dfltcc/dfltcc_inflate.c +++ b/lib/zlib_dfltcc/dfltcc_inflate.c @@ -4,6 +4,7 @@ #include "dfltcc_util.h" #include "dfltcc.h" #include +#include #include /* @@ -29,6 +30,7 @@ int dfltcc_can_inflate( return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0); } +EXPORT_SYMBOL(dfltcc_can_inflate); static int dfltcc_was_inflate_used( z_streamp strm @@ -147,3 +149,4 @@ dfltcc_inflate_action dfltcc_inflate( return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ? DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE; } +EXPORT_SYMBOL(dfltcc_inflate); From 2bf509d96d84c3336d08375e8af34d1b85ee71c8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Dec 2020 22:14:42 -0800 Subject: [PATCH 02/12] coredump: fix core_pattern parse error 'format_corename()' will splite 'core_pattern' on spaces when it is in pipe mode, and take helper_argv[0] as the path to usermode executable. It works fine in most cases. However, if there is a space between '|' and '/file/path', such as '| /usr/lib/systemd/systemd-coredump %P %u %g', then helper_argv[0] will be parsed as '', and users will get a 'Core dump to | disabled'. It is not friendly to users, as the pattern above was valid previously. Fix this by ignoring the spaces between '|' and '/file/path'. Fixes: 315c69261dd3 ("coredump: split pipe command whitespace before expanding template") Signed-off-by: Menglong Dong Signed-off-by: Andrew Morton Cc: Paul Wise Cc: Jakub Wilk [https://bugs.debian.org/924398] Cc: Neil Horman Cc: Link: https://lkml.kernel.org/r/5fb62870.1c69fb81.8ef5d.af76@mx.google.com Signed-off-by: Linus Torvalds --- fs/coredump.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/coredump.c b/fs/coredump.c index 0cd9056d79cc..c6acfc694f65 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -229,7 +229,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, */ if (ispipe) { if (isspace(*pat_ptr)) { - was_space = true; + if (cn->used != 0) + was_space = true; pat_ptr++; continue; } else if (was_space) { From becaba65f62f88e553ec92ed98370e9d2b18e629 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 5 Dec 2020 22:14:45 -0800 Subject: [PATCH 03/12] mm: memcg/slab: fix obj_cgroup_charge() return value handling Commit 10befea91b61 ("mm: memcg/slab: use a single set of kmem_caches for all allocations") introduced a regression into the handling of the obj_cgroup_charge() return value. If a non-zero value is returned (indicating of exceeding one of memory.max limits), the allocation should fail, instead of falling back to non-accounted mode. To make the code more readable, move memcg_slab_pre_alloc_hook() and memcg_slab_post_alloc_hook() calling conditions into bodies of these hooks. Fixes: 10befea91b61 ("mm: memcg/slab: use a single set of kmem_caches for all allocations") Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Link: https://lkml.kernel.org/r/20201127161828.GD840171@carbon.dhcp.thefacebook.com Signed-off-by: Linus Torvalds --- mm/slab.h | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 6d7c6a5056ba..f9977d6613d6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -274,22 +274,32 @@ static inline size_t obj_full_size(struct kmem_cache *s) return s->size + sizeof(struct obj_cgroup *); } -static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s, - size_t objects, - gfp_t flags) +/* + * Returns false if the allocation should fail. + */ +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) { struct obj_cgroup *objcg; + if (!memcg_kmem_enabled()) + return true; + + if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) + return true; + objcg = get_obj_cgroup_from_current(); if (!objcg) - return NULL; + return true; if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) { obj_cgroup_put(objcg); - return NULL; + return false; } - return objcg; + *objcgp = objcg; + return true; } static inline void mod_objcg_state(struct obj_cgroup *objcg, @@ -315,7 +325,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, unsigned long off; size_t i; - if (!objcg) + if (!memcg_kmem_enabled() || !objcg) return; flags &= ~__GFP_ACCOUNT; @@ -400,11 +410,11 @@ static inline void memcg_free_page_obj_cgroups(struct page *page) { } -static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s, - size_t objects, - gfp_t flags) +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) { - return NULL; + return true; } static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, @@ -508,9 +518,8 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, if (should_failslab(s, flags)) return NULL; - if (memcg_kmem_enabled() && - ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT))) - *objcgp = memcg_slab_pre_alloc_hook(s, size, flags); + if (!memcg_slab_pre_alloc_hook(s, objcgp, size, flags)) + return NULL; return s; } @@ -529,8 +538,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, s->flags, flags); } - if (memcg_kmem_enabled()) - memcg_slab_post_alloc_hook(s, objcg, flags, size, p); + memcg_slab_post_alloc_hook(s, objcg, flags, size, p); } #ifndef CONFIG_SLOB From 8199be001a470209f5c938570cc199abb012fe53 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 5 Dec 2020 22:14:48 -0800 Subject: [PATCH 04/12] mm: list_lru: set shrinker map bit when child nr_items is not zero When investigating a slab cache bloat problem, significant amount of negative dentry cache was seen, but confusingly they neither got shrunk by reclaimer (the host has very tight memory) nor be shrunk by dropping cache. The vmcore shows there are over 14M negative dentry objects on lru, but tracing result shows they were even not scanned at all. Further investigation shows the memcg's vfs shrinker_map bit is not set. So the reclaimer or dropping cache just skip calling vfs shrinker. So we have to reboot the hosts to get the memory back. I didn't manage to come up with a reproducer in test environment, and the problem can't be reproduced after rebooting. But it seems there is race between shrinker map bit clear and reparenting by code inspection. The hypothesis is elaborated as below. The memcg hierarchy on our production environment looks like: root / \ system user The main workloads are running under user slice's children, and it creates and removes memcg frequently. So reparenting happens very often under user slice, but no task is under user slice directly. So with the frequent reparenting and tight memory pressure, the below hypothetical race condition may happen: CPU A CPU B reparent dst->nr_items == 0 shrinker: total_objects == 0 add src->nr_items to dst set_bit return SHRINK_EMPTY clear_bit child memcg offline replace child's kmemcg_id with parent's (in memcg_offline_kmem()) list_lru_del() between shrinker runs see parent's kmemcg_id dec dst->nr_items reparent again dst->nr_items may go negative due to concurrent list_lru_del() The second run of shrinker: read nr_items without any synchronization, so it may see intermediate negative nr_items then total_objects may return 0 coincidently keep the bit cleared dst->nr_items != 0 skip set_bit add scr->nr_item to dst After this point dst->nr_item may never go zero, so reparenting will not set shrinker_map bit anymore. And since there is no task under user slice directly, so no new object will be added to its lru to set the shrinker map bit either. That bit is kept cleared forever. How does list_lru_del() race with reparenting? It is because reparenting replaces children's kmemcg_id to parent's without protecting from nlru->lock, so list_lru_del() may see parent's kmemcg_id but actually deleting items from child's lru, but dec'ing parent's nr_items, so the parent's nr_items may go negative as commit 2788cf0c401c ("memcg: reparent list_lrus and free kmemcg_id on css offline") says. Since it is impossible that dst->nr_items goes negative and src->nr_items goes zero at the same time, so it seems we could set the shrinker map bit iff src->nr_items != 0. We could synchronize list_lru_count_one() and reparenting with nlru->lock, but it seems checking src->nr_items in reparenting is the simplest and avoids lock contention. Fixes: fae91d6d8be5 ("mm/list_lru.c: set bit in memcg shrinker bitmap on first list_lru item appearance") Suggested-by: Roman Gushchin Signed-off-by: Yang Shi Signed-off-by: Andrew Morton Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Kirill Tkhai Cc: Vladimir Davydov Cc: [4.19] Link: https://lkml.kernel.org/r/20201202171749.264354-1-shy828301@gmail.com Signed-off-by: Linus Torvalds --- mm/list_lru.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 5aa6e44bc2ae..fe230081690b 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -534,7 +534,6 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, struct list_lru_node *nlru = &lru->node[nid]; int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; - bool set; /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, @@ -546,11 +545,12 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, dst = list_lru_from_memcg_idx(nlru, dst_idx); list_splice_init(&src->list, &dst->list); - set = (!dst->nr_items && src->nr_items); - dst->nr_items += src->nr_items; - if (set) + + if (src->nr_items) { + dst->nr_items += src->nr_items; memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); - src->nr_items = 0; + src->nr_items = 0; + } spin_unlock_irq(&nlru->lock); } From e91d8d78237de8d7120c320b3645b7100848f24d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 5 Dec 2020 22:14:51 -0800 Subject: [PATCH 05/12] mm/zsmalloc.c: drop ZSMALLOC_PGTABLE_MAPPING While I was doing zram testing, I found sometimes decompression failed since the compression buffer was corrupted. With investigation, I found below commit calls cond_resched unconditionally so it could make a problem in atomic context if the task is reschedule. BUG: sleeping function called from invalid context at mm/vmalloc.c:108 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 946, name: memhog 3 locks held by memhog/946: #0: ffff9d01d4b193e8 (&mm->mmap_lock#2){++++}-{4:4}, at: __mm_populate+0x103/0x160 #1: ffffffffa3d53de0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0xa98/0x1160 #2: ffff9d01d56b8110 (&zspage->lock){.+.+}-{3:3}, at: zs_map_object+0x8e/0x1f0 CPU: 0 PID: 946 Comm: memhog Not tainted 5.9.3-00011-gc5bfc0287345-dirty #316 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 Call Trace: unmap_kernel_range_noflush+0x2eb/0x350 unmap_kernel_range+0x14/0x30 zs_unmap_object+0xd5/0xe0 zram_bvec_rw.isra.0+0x38c/0x8e0 zram_rw_page+0x90/0x101 bdev_write_page+0x92/0xe0 __swap_writepage+0x94/0x4a0 pageout+0xe3/0x3a0 shrink_page_list+0xb94/0xd60 shrink_inactive_list+0x158/0x460 We can fix this by removing the ZSMALLOC_PGTABLE_MAPPING feature (which contains the offending calling code) from zsmalloc. Even though this option showed some amount improvement(e.g., 30%) in some arm32 platforms, it has been headache to maintain since it have abused APIs[1](e.g., unmap_kernel_range in atomic context). Since we are approaching to deprecate 32bit machines and already made the config option available for only builtin build since v5.8, lastly it has been not default option in zsmalloc, it's time to drop the option for better maintenance. [1] http://lore.kernel.org/linux-mm/20201105170249.387069-1-minchan@kernel.org Fixes: e47110e90584 ("mm/vunmap: add cond_resched() in vunmap_pmd_range") Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Sergey Senozhatsky Cc: Tony Lindgren Cc: Christoph Hellwig Cc: Harish Sriram Cc: Uladzislau Rezki Cc: Link: https://lkml.kernel.org/r/20201117202916.GA3856507@google.com Signed-off-by: Linus Torvalds --- arch/arm/configs/omap2plus_defconfig | 1 - include/linux/zsmalloc.h | 1 - mm/Kconfig | 13 ------- mm/zsmalloc.c | 54 ---------------------------- 4 files changed, 69 deletions(-) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 34793aabdb65..58df9fd79a76 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -81,7 +81,6 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y CONFIG_CMA=y CONFIG_ZSMALLOC=m -CONFIG_ZSMALLOC_PGTABLE_MAPPING=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 0fdbf653b173..4807ca4d52e0 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -20,7 +20,6 @@ * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected. */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ diff --git a/mm/Kconfig b/mm/Kconfig index d42423f884a7..390165ffbb0f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -707,19 +707,6 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. -config ZSMALLOC_PGTABLE_MAPPING - bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC=y - help - By default, zsmalloc uses a copy-based object mapping method to - access allocations that span two pages. However, if a particular - architecture (ex, ARM) performs VM mapping faster than copying, - then you should select this. This causes zsmalloc to use page table - mapping rather than copying for object mapping. - - You can check speed with zsmalloc benchmark: - https://github.com/spartacus06/zsmapbench - config ZSMALLOC_STAT bool "Export zsmalloc statistics" depends on ZSMALLOC diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 918c7b019b3d..cdfaaadea8ff 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -293,11 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING - struct vm_struct *vm; /* vm area for mapping object that span pages */ -#else char *vm_buf; /* copy buffer for objects that span pages */ -#endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; @@ -1113,54 +1109,6 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm) - return 0; - area->vm = get_vm_area(PAGE_SIZE * 2, 0); - if (!area->vm) - return -ENOMEM; - - /* - * Populate ptes in advance to avoid pte allocation with GFP_KERNEL - * in non-preemtible context of zs_map_object. - */ - return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr, - PAGE_SIZE * 2, NULL, NULL); -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - if (area->vm) - free_vm_area(area->vm); - area->vm = NULL; -} - -static inline void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm->addr; - - BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); - area->vm_addr = area->vm->addr; - return area->vm_addr + off; -} - -static inline void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm_addr; - - unmap_kernel_range(addr, PAGE_SIZE * 2); -} - -#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ - static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1241,8 +1189,6 @@ out: pagefault_enable(); } -#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ - static int zs_cpu_prepare(unsigned int cpu) { struct mapping_area *area; From b11a76b37a5aa7b07c3e3eeeaae20b25475bddd3 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sat, 5 Dec 2020 22:14:55 -0800 Subject: [PATCH 06/12] mm/swapfile: do not sleep with a spin lock held We can't call kvfree() with a spin lock held, so defer it. Fixes a might_sleep() runtime warning. Fixes: 873d7bcfd066 ("mm/swapfile.c: use kvzalloc for swap_info_struct allocation") Signed-off-by: Qian Cai Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Link: https://lkml.kernel.org/r/20201202151549.10350-1-qcai@redhat.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index c4a613688a17..d58361109066 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2867,6 +2867,7 @@ late_initcall(max_swapfiles_check); static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; + struct swap_info_struct *defer = NULL; unsigned int type; int i; @@ -2895,7 +2896,7 @@ static struct swap_info_struct *alloc_swap_info(void) smp_wmb(); WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); } else { - kvfree(p); + defer = p; p = swap_info[type]; /* * Do not memset this entry: a racing procfs swap_next() @@ -2908,6 +2909,7 @@ static struct swap_info_struct *alloc_swap_info(void) plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); + kvfree(defer); spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); From 4e60340c5ca560278c938726235bc0daa5fc8c7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 5 Dec 2020 22:14:58 -0800 Subject: [PATCH 07/12] =?UTF-8?q?mailmap:=20add=20two=20more=20addresses?= =?UTF-8?q?=20of=20Uwe=20Kleine-K=C3=B6nig?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes attribution for the commits (among others) - d4097456cd1d ("video/framebuffer: move the probe func into .devinit.text in Blackfin LCD driver") - 0312e024d6cd ("mfd: mc13xxx: Add support for mc34708") Signed-off-by: Uwe Kleine-König Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20201127213358.3440830-1-u.kleine-koenig@pengutronix.de Signed-off-by: Linus Torvalds --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index d9fb83d67055..225546cc8028 100644 --- a/.mailmap +++ b/.mailmap @@ -322,6 +322,8 @@ TripleX Chung Tsuneo Yoshioka Tycho Andersen Uwe Kleine-König +Uwe Kleine-König +Uwe Kleine-König Uwe Kleine-König Uwe Kleine-König Valdis Kletnieks From d8cbe8bfa7df3c680ddfd5e1eee3a5c86d8dc764 Mon Sep 17 00:00:00 2001 From: Xingxing Su Date: Sat, 5 Dec 2020 22:15:02 -0800 Subject: [PATCH 08/12] tools/testing/selftests/vm: fix build error Only x86 and PowerPC implement the pkey-xxx.h, and an error was reported when compiling protection_keys.c. Add a Arch judgment to compile "protection_keys" in the Makefile. If other arch implement this, add the arch name to the Makefile. eg: ifneq (,$(findstring $(ARCH),powerpc mips ... )) Following build errors: pkey-helpers.h:93:2: error: #error Architecture not supported #error Architecture not supported pkey-helpers.h:96:20: error: `PKEY_DISABLE_ACCESS' undeclared #define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) ^ protection_keys.c:218:45: error: `PKEY_DISABLE_WRITE' undeclared pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); ^ Signed-off-by: Xingxing Su Signed-off-by: Andrew Morton Cc: Shuah Khan Cc: Sandipan Das Cc: John Hubbard Cc: Dave Hansen Cc: "Kirill A. Shutemov" Cc: Brian Geffon Cc: Mina Almasry Link: https://lkml.kernel.org/r/1606826876-30656-1-git-send-email-suxingxing@loongson.cn Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 30873b19d04b..691893afc15d 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -60,9 +60,13 @@ ifeq ($(CAN_BUILD_X86_64),1) TEST_GEN_FILES += $(BINARIES_64) endif else + +ifneq (,$(findstring $(ARCH),powerpc)) TEST_GEN_FILES += protection_keys endif +endif + ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range From 573a259336f8c57739bdaf035aa7abbae7d9a713 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Sat, 5 Dec 2020 22:15:05 -0800 Subject: [PATCH 09/12] userfaultfd: selftests: fix SIGSEGV if huge mmap fails The error handling in hugetlb_allocate_area() was incorrect for the hugetlb_shared test case. Previously the behavior was: - mmap a hugetlb area - If this fails, set the pointer to NULL, and carry on - mmap an alias of the same hugetlb fd - If this fails, munmap the original area If the original mmap failed, it's likely the second one did too. If both failed, we'd blindly try to munmap a NULL pointer, causing a SIGSEGV. Instead, "goto fail" so we return before trying to mmap the alias. This issue can be hit "in real life" by forgetting to set /proc/sys/vm/nr_hugepages (leaving it at 0), and then trying to run the hugetlb_shared test. Another small improvement is, when the original mmap fails, don't just print "it failed": perror(), so we can see *why*. :) Signed-off-by: Axel Rasmussen Signed-off-by: Andrew Morton Cc: Shuah Khan Cc: Peter Xu Cc: Joe Perches Cc: Mike Rapoport Cc: Andrea Arcangeli Cc: David Alan Gilbert Link: https://lkml.kernel.org/r/20201204203443.2714693-1-axelrasmussen@google.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 25 +++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 9b0912a01777..c4425597769a 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -206,19 +206,19 @@ static int hugetlb_release_pages(char *rel_area) return ret; } - static void hugetlb_allocate_area(void **alloc_area) { void *area_alias = NULL; char **alloc_area_alias; + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, (map_shared ? MAP_SHARED : MAP_PRIVATE) | MAP_HUGETLB, huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) { - fprintf(stderr, "mmap of hugetlbfs file failed\n"); - *alloc_area = NULL; + perror("mmap of hugetlbfs file failed"); + goto fail; } if (map_shared) { @@ -227,14 +227,11 @@ static void hugetlb_allocate_area(void **alloc_area) huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) { - if (munmap(*alloc_area, nr_pages * page_size) < 0) { - perror("hugetlb munmap"); - exit(1); - } - *alloc_area = NULL; - return; + perror("mmap of hugetlb file alias failed"); + goto fail_munmap; } } + if (*alloc_area == area_src) { huge_fd_off0 = *alloc_area; alloc_area_alias = &area_src_alias; @@ -243,6 +240,16 @@ static void hugetlb_allocate_area(void **alloc_area) } if (area_alias) *alloc_area_alias = area_alias; + + return; + +fail_munmap: + if (munmap(*alloc_area, nr_pages * page_size) < 0) { + perror("hugetlb munmap"); + exit(1); + } +fail: + *alloc_area = NULL; } static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) From 3351b16af4946fff0d46481d155fb91adb28b1f9 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 5 Dec 2020 22:15:09 -0800 Subject: [PATCH 10/12] mm/filemap: add static for function __add_to_page_cache_locked mm/filemap.c:830:14: warning: no previous prototype for `__add_to_page_cache_locked' [-Wmissing-prototypes] Signed-off-by: Alex Shi Signed-off-by: Andrew Morton Cc: Souptick Joarder Link: https://lkml.kernel.org/r/1604661895-5495-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 0b2067b3c328..331f4261d723 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -827,7 +827,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -noinline int __add_to_page_cache_locked(struct page *page, +static noinline int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp, void **shadowp) From 7a5bde37983d37783161681ff7c6122dfd081791 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 5 Dec 2020 22:15:12 -0800 Subject: [PATCH 11/12] hugetlb_cgroup: fix offline of hugetlb cgroup with reservations Adrian Moreno was ruuning a kubernetes 1.19 + containerd/docker workload using hugetlbfs. In this environment the issue is reproduced by: - Start a simple pod that uses the recently added HugePages medium feature (pod yaml attached) - Start a DPDK app. It doesn't need to run successfully (as in transfer packets) nor interact with real hardware. It seems just initializing the EAL layer (which handles hugepage reservation and locking) is enough to trigger the issue - Delete the Pod (or let it "Complete"). This would result in a kworker thread going into a tight loop (top output): 1425 root 20 0 0 0 0 R 99.7 0.0 5:22.45 kworker/28:7+cgroup_destroy 'perf top -g' reports: - 63.28% 0.01% [kernel] [k] worker_thread - 49.97% worker_thread - 52.64% process_one_work - 62.08% css_killed_work_fn - hugetlb_cgroup_css_offline 41.52% _raw_spin_lock - 2.82% _cond_resched rcu_all_qs 2.66% PageHuge - 0.57% schedule - 0.57% __schedule We are spinning in the do-while loop in hugetlb_cgroup_css_offline. Worse yet, we are holding the master cgroup lock (cgroup_mutex) while infinitely spinning. Little else can be done on the system as the cgroup_mutex can not be acquired. Do note that the issue can be reproduced by simply offlining a hugetlb cgroup containing pages with reservation counts. The loop in hugetlb_cgroup_css_offline is moving page counts from the cgroup being offlined to the parent cgroup. This is done for each hstate, and is repeated until hugetlb_cgroup_have_usage returns false. The routine moving counts (hugetlb_cgroup_move_parent) is only moving 'usage' counts. The routine hugetlb_cgroup_have_usage is checking for both 'usage' and 'reservation' counts. Discussion about what to do with reservation counts when reparenting was discussed here: https://lore.kernel.org/linux-kselftest/CAHS8izMFAYTgxym-Hzb_JmkTK1N_S9tGN71uS6MFV+R7swYu5A@mail.gmail.com/ The decision was made to leave a zombie cgroup for with reservation counts. Unfortunately, the code checking reservation counts was incorrectly added to hugetlb_cgroup_have_usage. To fix the issue, simply remove the check for reservation counts. While fixing this issue, a related bug in hugetlb_cgroup_css_offline was noticed. The hstate index is not reinitialized each time through the do-while loop. Fix this as well. Fixes: 1adc4d419aa2 ("hugetlb_cgroup: add interface for charge/uncharge hugetlb reservations") Reported-by: Adrian Moreno Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Tested-by: Adrian Moreno Reviewed-by: Shakeel Butt Cc: Mina Almasry Cc: David Rientjes Cc: Greg Thelen Cc: Sandipan Das Cc: Shuah Khan Cc: Link: https://lkml.kernel.org/r/20201203220242.158165-1-mike.kravetz@oracle.com Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 1f87aec9ab5c..9182848dda3e 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -82,11 +82,8 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) for (idx = 0; idx < hugetlb_max_hstate; idx++) { if (page_counter_read( - hugetlb_cgroup_counter_from_cgroup(h_cg, idx)) || - page_counter_read(hugetlb_cgroup_counter_from_cgroup_rsvd( - h_cg, idx))) { + hugetlb_cgroup_counter_from_cgroup(h_cg, idx))) return true; - } } return false; } @@ -202,9 +199,10 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct page *page; - int idx = 0; + int idx; do { + idx = 0; for_each_hstate(h) { spin_lock(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) From 309d08d9b3a3659ab3f239d27d4e38b670b08fc9 Mon Sep 17 00:00:00 2001 From: Liu Zixian Date: Sat, 5 Dec 2020 22:15:15 -0800 Subject: [PATCH 12/12] mm/mmap.c: fix mmap return value when vma is merged after call_mmap() On success, mmap should return the begin address of newly mapped area, but patch "mm: mmap: merge vma after call_mmap() if possible" set vm_start of newly merged vma to return value addr. Users of mmap will get wrong address if vma is merged after call_mmap(). We fix this by moving the assignment to addr before merging vma. We have a driver which changes vm_flags, and this bug is found by our testcases. Fixes: d70cec898324 ("mm: mmap: merge vma after call_mmap() if possible") Signed-off-by: Liu Zixian Signed-off-by: Andrew Morton Reviewed-by: Jason Gunthorpe Reviewed-by: David Hildenbrand Cc: Miaohe Lin Cc: Hongxiang Lou Cc: Hu Shiyuan Cc: Matthew Wilcox Link: https://lkml.kernel.org/r/20201203085350.22624-1-liuzixian4@huawei.com Signed-off-by: Linus Torvalds --- mm/mmap.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index d91ecb00d38c..5c8b4485860d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1808,6 +1808,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (error) goto unmap_and_free_vma; + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + * Bug: If addr is changed, prev, rb_link, rb_parent should + * be updated for vma_link() + */ + WARN_ON_ONCE(addr != vma->vm_start); + + addr = vma->vm_start; + /* If vm_flags changed after call_mmap(), we should try merge vma again * as we may succeed this time. */ @@ -1822,25 +1833,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, fput(vma->vm_file); vm_area_free(vma); vma = merge; - /* Update vm_flags and possible addr to pick up the change. We don't - * warn here if addr changed as the vma is not linked by vma_link(). - */ - addr = vma->vm_start; + /* Update vm_flags to pick up the change. */ vm_flags = vma->vm_flags; goto unmap_writable; } } - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - * Bug: If addr is changed, prev, rb_link, rb_parent should - * be updated for vma_link() - */ - WARN_ON_ONCE(addr != vma->vm_start); - - addr = vma->vm_start; vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma);