From fa76da461bb0be13c8339d984dcf179151167c8f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:02:16 -0700 Subject: [PATCH 001/132] mm: /proc/pid/smaps_rollup: fix NULL pointer deref in smaps_pte_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Leonardo reports an apparent regression in 4.19-rc7: BUG: unable to handle kernel NULL pointer dereference at 00000000000000f0 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 6032 Comm: python Not tainted 4.19.0-041900rc7-lowlatency #201810071631 Hardware name: LENOVO 80UG/Toronto 4A2, BIOS 0XCN45WW 08/09/2018 RIP: 0010:smaps_pte_range+0x32d/0x540 Code: 80 00 00 00 00 74 a9 48 89 de 41 f6 40 52 40 0f 85 04 02 00 00 49 2b 30 48 c1 ee 0c 49 03 b0 98 00 00 00 49 8b 80 a0 00 00 00 <48> 8b b8 f0 00 00 00 e8 b7 ef ec ff 48 85 c0 0f 84 71 ff ff ff a8 RSP: 0018:ffffb0cbc484fb88 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000560ddb9e9000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000560ddb9e9 RDI: 0000000000000001 RBP: ffffb0cbc484fbc0 R08: ffff94a5a227a578 R09: ffff94a5a227a578 R10: 0000000000000000 R11: 0000560ddbbe7000 R12: ffffe903098ba728 R13: ffffb0cbc484fc78 R14: ffffb0cbc484fcf8 R15: ffff94a5a2e9cf48 FS: 00007f6dfb683740(0000) GS:ffff94a5aaf80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000f0 CR3: 000000011c118001 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __walk_page_range+0x3c2/0x6f0 walk_page_vma+0x42/0x60 smap_gather_stats+0x79/0xe0 ? gather_pte_stats+0x320/0x320 ? gather_hugetlb_stats+0x70/0x70 show_smaps_rollup+0xcd/0x1c0 seq_read+0x157/0x400 __vfs_read+0x3a/0x180 ? security_file_permission+0x93/0xc0 ? security_file_permission+0x93/0xc0 vfs_read+0x8f/0x140 ksys_read+0x55/0xc0 __x64_sys_read+0x1a/0x20 do_syscall_64+0x5a/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Decoded code matched to local compilation+disassembly points to smaps_pte_entry(): } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { page = find_get_entry(vma->vm_file->f_mapping, linear_page_index(vma, addr)); Here, vma->vm_file is NULL. mss->check_shmem_swap should be false in that case, however for smaps_rollup, smap_gather_stats() can set the flag true for one vma and leave it true for subsequent vma's where it should be false. To fix, reset the check_shmem_swap flag to false. There's also related bug which sets mss->swap to shmem_swapped, which in the context of smaps_rollup overwrites any value accumulated from previous vma's. Fix that as well. Note that the report suggests a regression between 4.17.19 and 4.19-rc7, which makes the 4.19 series ending with commit 258f669e7e88 ("mm: /proc/pid/smaps_rollup: convert to single value seq_file") suspicious. But the mss was reused for rollup since 493b0e9d945f ("mm: add /proc/pid/smaps_rollup") so let's play it safe with the stable backport. 
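The fix boils down to a general rollup rule: per-vma state must be reset on every iteration, and totals must be accumulated rather than overwritten. A minimal user-space sketch of that pattern follows; the names are made up and only loosely mirror mem_size_stats, they do not reproduce the kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical rollup accumulator, loosely mirroring mem_size_stats. */
struct rollup {
	bool check_swap;	/* per-area flag; must not leak into the next area */
	long swap;		/* running total across all areas */
};

/* Gather one area's numbers into the shared rollup. */
static void gather_one(struct rollup *r, long area_swap, bool needs_check)
{
	r->check_swap = false;		/* reset whatever the previous area left behind */

	if (needs_check)
		r->check_swap = true;
	else
		r->swap += area_swap;	/* accumulate; '=' would discard earlier areas */
}

int main(void)
{
	struct rollup r = { false, 0 };

	gather_one(&r, 4, false);
	gather_one(&r, 0, true);	/* sets the flag for this area only */
	gather_one(&r, 8, false);	/* flag must be false again by now */

	printf("swap=%ld check=%d\n", r.swap, r.check_swap);	/* swap=12 check=0 */
	return 0;
}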
Link: http://lkml.kernel.org/r/555fbd1f-4ac9-0b58-dcd4-5dc4380ff7ca@suse.cz Link: https://bugzilla.kernel.org/show_bug.cgi?id=201377 Fixes: 493b0e9d945f ("mm: add /proc/pid/smaps_rollup") Signed-off-by: Vlastimil Babka Reported-by: Leonardo Soares Müller Tested-by: Leonardo Soares Müller Cc: Greg Kroah-Hartman Cc: Daniel Colascione Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5ea1d64cb0b4..a027473561c6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -713,6 +713,8 @@ static void smap_gather_stats(struct vm_area_struct *vma, smaps_walk.private = mss; #ifdef CONFIG_SHMEM + /* In case of smaps_rollup, reset the value from previous vma */ + mss->check_shmem_swap = false; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -728,7 +730,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE)) { - mss->swap = shmem_swapped; + mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; smaps_walk.pte_hole = smaps_pte_hole; From ae62c16e105a869524afcf8a07ee85c5ae5d0479 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Oct 2018 15:02:19 -0700 Subject: [PATCH 002/132] userfaultfd: disable irqs when taking the waitqueue lock userfaultfd contains home-grown locking of the waitqueue lock, and does not disable interrupts. This relies on the fact that no one else takes it from interrupt context and violates an invariant of the normal waitqueue locking scheme. With aio poll it is easy to trigger other locks that disable interrupts (or are called from interrupt context).
Link: http://lkml.kernel.org/r/20181018154101.18750-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Andrea Arcangeli Reviewed-by: Andrew Morton Cc: [4.19.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index bfa0ec69f924..356d2b8568c1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1026,7 +1026,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ - spin_lock(&ctx->fd_wqh.lock); + spin_lock_irq(&ctx->fd_wqh.lock); __add_wait_queue(&ctx->fd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -1112,13 +1112,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, ret = -EAGAIN; break; } - spin_unlock(&ctx->fd_wqh.lock); + spin_unlock_irq(&ctx->fd_wqh.lock); schedule(); - spin_lock(&ctx->fd_wqh.lock); + spin_lock_irq(&ctx->fd_wqh.lock); } __remove_wait_queue(&ctx->fd_wqh, &wait); __set_current_state(TASK_RUNNING); - spin_unlock(&ctx->fd_wqh.lock); + spin_unlock_irq(&ctx->fd_wqh.lock); if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(ctx, fork_nctx, msg); From 4d54954a197175c0dcb3c82af0c0740d0c5f827a Mon Sep 17 00:00:00 2001 From: Sebastien Boisvert Date: Fri, 26 Oct 2018 15:02:23 -0700 Subject: [PATCH 003/132] include/linux/pfn_t.h: force '~' to be parsed as an unary operator Tracing the event "fs_dax:dax_pmd_insert_mapping" with perf produces this warning: [fs_dax:dax_pmd_insert_mapping] unknown op '~' It is printed in process_op (tools/lib/traceevent/event-parse.c) because '~' is parsed as a binary operator. perf reads the format of fs_dax:dax_pmd_insert_mapping ("print fmt") from /sys/kernel/debug/tracing/events/fs_dax/dax_pmd_insert_mapping/format . The format contains: ~(((u64) ~(~(((1UL) << 12)-1))) ^ \ interpreted as a binary operator by process_op(). This part is generated in the declaration of the event class dax_pmd_insert_mapping_class in include/trace/events/fs_dax.h : __print_flags_u64(__entry->pfn_val & PFN_FLAGS_MASK, "|", PFN_FLAGS_TRACE), This patch adds a pair of parentheses in the declaration of PFN_FLAGS_MASK to make sure that '~' is parsed as a unary operator by perf. The part of the format that was problematic is now: ~(((u64) (~(~(((1UL) << 12)-1)))) Now, all the '~' are parsed as unary operators. 
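The extra parentheses do not change the value of PFN_FLAGS_MASK; they only change the text that lands in the trace format string, which is what perf has to parse. A small stand-alone sketch makes both points visible by printing the computed value and the stringified expansion of each variant; the PAGE_MASK/PAGE_SHIFT stand-ins below assume a 64-bit build with 4K pages and are not taken from the patch.

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the kernel constants on a 64-bit, 4K-page configuration. */
#define PAGE_SHIFT		12
#define PAGE_MASK		(~((1UL << PAGE_SHIFT) - 1))
#define BITS_PER_LONG_LONG	64

/* Old and new spellings of PFN_FLAGS_MASK. */
#define OLD_MASK (((uint64_t) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
#define NEW_MASK (((uint64_t) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT))

#define STR(x)	#x
#define XSTR(x)	STR(x)		/* expand, then stringify */

int main(void)
{
	/* Same value either way... */
	printf("old value: %#llx\nnew value: %#llx\n",
	       (unsigned long long)OLD_MASK, (unsigned long long)NEW_MASK);

	/*
	 * ...but the expanded text, which is what ends up in the trace
	 * format string and what perf must parse, differs.
	 */
	printf("old text:  %s\nnew text:  %s\n", XSTR(OLD_MASK), XSTR(NEW_MASK));
	return 0;
}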
Link: http://lkml.kernel.org/r/20181021145939.8760-1-sebhtml@videotron.qc.ca Signed-off-by: Sebastien Boisvert Acked-by: Dan Williams Cc: "Steven Rostedt (VMware)" Cc: Arnaldo Carvalho de Melo Cc: "Tzvetomir Stoyanov (VMware)" Cc: Namhyung Kim Cc: Ross Zwisler Cc: Elenie Godzaridis Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn_t.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 21713dc14ce2..673546ba7342 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -10,7 +10,7 @@ * PFN_DEV - pfn is not covered by system memmap by default * PFN_MAP - pfn has a dynamic page mapping established by a device driver */ -#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) +#define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) #define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2)) #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) From 74f213ea25b99fddcf34cbe07dabdb01136bcd86 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 26 Oct 2018 15:02:27 -0700 Subject: [PATCH 004/132] include/linux/linkage.h: align weak symbols WEAK() is supposed to be used instead of ENTRY() to define weak symbols, but unlike ENTRY() it doesn't have an ALIGN directive. There seems to be no actual reason not to have one, so let's add ALIGN to WEAK() too. Link: http://lkml.kernel.org/r/20180920135631.23833-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Cc: Will Deacon, Catalin Marinas Cc: Kyeongdon Kim Cc: Ard Biesheuvel Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/linkage.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/linkage.h b/include/linux/linkage.h index d7618c41f74c..7c47b1a471d4 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -90,6 +90,7 @@ #ifndef WEAK #define WEAK(name) \ .weak name ASM_NL \ + ALIGN ASM_NL \ name: #endif From 19a2ca0fb560fd7be7b5293c6b652c6d6078dcde Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 26 Oct 2018 15:02:30 -0700 Subject: [PATCH 005/132] arm64: lib: use C string functions with KASAN enabled ARM64 has asm implementations of memchr(), memcmp(), str[r]chr(), str[n]cmp(), str[n]len(). KASAN doesn't see memory accesses in asm code, thus it can potentially miss many bugs. Ifdef out __HAVE_ARCH_* defines of these functions when KASAN is enabled, so the generic implementations from lib/string.c will be used. We can't just remove the asm functions because efistub uses them. And we can't have two non-weak functions either, so declare the asm functions as weak.
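The override mechanism relied on here is plain ELF weak-symbol resolution: when a weak and a strong definition of the same symbol are linked together, the linker picks the strong one, which is how the instrumentable C implementations from lib/string.c win over the weak asm ones. A minimal sketch of that behavior, with hypothetical file and function names (three small translation units shown in one listing):

/* weak_impl.c - stands in for the arch asm routine, now marked weak */
#include <stdio.h>

__attribute__((weak)) void which_strlen(void)
{
	puts("weak (asm-style) implementation");
}

/* strong_impl.c - stands in for the generic, instrumentable C routine */
#include <stdio.h>

void which_strlen(void)
{
	puts("strong (generic C) implementation");	/* the linker picks this one */
}

/* main.c */
void which_strlen(void);

int main(void)
{
	which_strlen();		/* prints the strong variant's message */
	return 0;
}

/* Build and run: cc -c weak_impl.c strong_impl.c main.c && cc *.o -o demo && ./demo */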
Link: http://lkml.kernel.org/r/20180920135631.23833-2-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reported-by: Kyeongdon Kim Cc: Alexander Potapenko Cc: Ard Biesheuvel Cc: Dmitry Vyukov Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/string.h | 14 ++++++++------ arch/arm64/kernel/arm64ksyms.c | 7 +++++-- arch/arm64/lib/memchr.S | 2 +- arch/arm64/lib/memcmp.S | 2 +- arch/arm64/lib/strchr.S | 2 +- arch/arm64/lib/strcmp.S | 2 +- arch/arm64/lib/strlen.S | 2 +- arch/arm64/lib/strncmp.S | 2 +- arch/arm64/lib/strnlen.S | 2 +- arch/arm64/lib/strrchr.S | 2 +- 10 files changed, 21 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33a5bd5..03a6c256b7ec 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -16,6 +16,7 @@ #ifndef __ASM_STRING_H #define __ASM_STRING_H +#ifndef CONFIG_KASAN #define __HAVE_ARCH_STRRCHR extern char *strrchr(const char *, int c); @@ -34,6 +35,13 @@ extern __kernel_size_t strlen(const char *); #define __HAVE_ARCH_STRNLEN extern __kernel_size_t strnlen(const char *, __kernel_size_t); +#define __HAVE_ARCH_MEMCMP +extern int memcmp(const void *, const void *, size_t); + +#define __HAVE_ARCH_MEMCHR +extern void *memchr(const void *, int, __kernel_size_t); +#endif + #define __HAVE_ARCH_MEMCPY extern void *memcpy(void *, const void *, __kernel_size_t); extern void *__memcpy(void *, const void *, __kernel_size_t); @@ -42,16 +50,10 @@ extern void *__memcpy(void *, const void *, __kernel_size_t); extern void *memmove(void *, const void *, __kernel_size_t); extern void *__memmove(void *, const void *, __kernel_size_t); -#define __HAVE_ARCH_MEMCHR -extern void *memchr(const void *, int, __kernel_size_t); - #define __HAVE_ARCH_MEMSET extern void *memset(void *, int, __kernel_size_t); extern void *__memset(void *, int, __kernel_size_t); -#define __HAVE_ARCH_MEMCMP -extern int memcmp(const void *, const void *, size_t); - #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE void memcpy_flushcache(void *dst, const void *src, size_t cnt); diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20b70b2..72f63a59b008 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -44,20 +44,23 @@ EXPORT_SYMBOL(__arch_copy_in_user); EXPORT_SYMBOL(memstart_addr); /* string / mem functions */ +#ifndef CONFIG_KASAN EXPORT_SYMBOL(strchr); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(memcmp); +EXPORT_SYMBOL(memchr); +#endif + EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(__memset); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(__memmove); -EXPORT_SYMBOL(memchr); -EXPORT_SYMBOL(memcmp); /* atomic bitops */ EXPORT_SYMBOL(set_bit); diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S index 4444c1d25f4b..0f164a4baf52 100644 --- a/arch/arm64/lib/memchr.S +++ b/arch/arm64/lib/memchr.S @@ -30,7 +30,7 @@ * Returns: * x0 - address of first occurrence of 'c' or 0 */ -ENTRY(memchr) +WEAK(memchr) and w1, w1, #0xff 1: subs x2, x2, #1 b.mi 2f diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S index 2a4e239bd17a..fb295f52e9f8 100644 --- a/arch/arm64/lib/memcmp.S +++ b/arch/arm64/lib/memcmp.S @@ -58,7 +58,7 @@ pos .req x11 limit_wd .req x12 mask .req x13 -ENTRY(memcmp) +WEAK(memcmp) cbz limit, .Lret0 eor tmp1, src1, src2 tst tmp1, 
#7 diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S index dae0cf5591f9..7c83091d1bcd 100644 --- a/arch/arm64/lib/strchr.S +++ b/arch/arm64/lib/strchr.S @@ -29,7 +29,7 @@ * Returns: * x0 - address of first occurrence of 'c' or 0 */ -ENTRY(strchr) +WEAK(strchr) and w1, w1, #0xff 1: ldrb w2, [x0], #1 cmp w2, w1 diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 471fe61760ef..7d5d15398bfb 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -60,7 +60,7 @@ tmp3 .req x9 zeroones .req x10 pos .req x11 -ENTRY(strcmp) +WEAK(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S index 55ccc8e24c08..8e0b14205dcb 100644 --- a/arch/arm64/lib/strlen.S +++ b/arch/arm64/lib/strlen.S @@ -56,7 +56,7 @@ pos .req x12 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 -ENTRY(strlen) +WEAK(strlen) mov zeroones, #REP8_01 bic src, srcin, #15 ands tmp1, srcin, #15 diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index e267044761c6..66bd145935d9 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -64,7 +64,7 @@ limit_wd .req x13 mask .req x14 endloop .req x15 -ENTRY(strncmp) +WEAK(strncmp) cbz limit, .Lret0 eor tmp1, src1, src2 mov zeroones, #REP8_01 diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S index eae38da6e0bb..355be04441fe 100644 --- a/arch/arm64/lib/strnlen.S +++ b/arch/arm64/lib/strnlen.S @@ -59,7 +59,7 @@ limit_wd .req x14 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 -ENTRY(strnlen) +WEAK(strnlen) cbz limit, .Lhit_limit mov zeroones, #REP8_01 bic src, srcin, #15 diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S index f8e2784d5752..ea84924d5990 100644 --- a/arch/arm64/lib/strrchr.S +++ b/arch/arm64/lib/strrchr.S @@ -29,7 +29,7 @@ * Returns: * x0 - address of last occurrence of 'c' or 0 */ -ENTRY(strrchr) +WEAK(strrchr) mov x3, #0 and w1, w1, #0xff 1: ldrb w2, [x0], #1 From 0c96350a2d2f64fe777b444c995f6bb633c5d069 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 26 Oct 2018 15:02:34 -0700 Subject: [PATCH 006/132] lib/test_kasan.c: add tests for several string/memory API functions Arch code may have asm implementation of string/memory API functions instead of using generic one from lib/string.c. KASAN don't see memory accesses in asm code, thus can miss many bugs. E.g. on ARM64 KASAN don't see bugs in memchr(), memcmp(), str[r]chr(), str[n]cmp(), str[n]len(). Add tests for these functions to be sure that we notice the problem on other architectures. 
Link: http://lkml.kernel.org/r/20180920135631.23833-3-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Ard Biesheuvel Cc: Dmitry Vyukov Cc: Kyeongdon Kim Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index ec657105edbf..51b78405bf24 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -579,6 +579,73 @@ static noinline void __init kmem_cache_invalid_free(void) kmem_cache_destroy(cache); } +static noinline void __init kasan_memchr(void) +{ + char *ptr; + size_t size = 24; + + pr_info("out-of-bounds in memchr\n"); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!ptr) + return; + + memchr(ptr, '1', size + 1); + kfree(ptr); +} + +static noinline void __init kasan_memcmp(void) +{ + char *ptr; + size_t size = 24; + int arr[9]; + + pr_info("out-of-bounds in memcmp\n"); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!ptr) + return; + + memset(arr, 0, sizeof(arr)); + memcmp(ptr, arr, size+1); + kfree(ptr); +} + +static noinline void __init kasan_strings(void) +{ + char *ptr; + size_t size = 24; + + pr_info("use-after-free in strchr\n"); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!ptr) + return; + + kfree(ptr); + + /* + * Try to cause only 1 invalid access (less spam in dmesg). + * For that we need ptr to point to zeroed byte. + * Skip metadata that could be stored in freed object so ptr + * will likely point to zeroed byte. + */ + ptr += 16; + strchr(ptr, '1'); + + pr_info("use-after-free in strrchr\n"); + strrchr(ptr, '1'); + + pr_info("use-after-free in strcmp\n"); + strcmp(ptr, "2"); + + pr_info("use-after-free in strncmp\n"); + strncmp(ptr, "2", 1); + + pr_info("use-after-free in strlen\n"); + strlen(ptr); + + pr_info("use-after-free in strnlen\n"); + strnlen(ptr, 1); +} + static int __init kmalloc_tests_init(void) { /* @@ -618,6 +685,9 @@ static int __init kmalloc_tests_init(void) use_after_scope_test(); kmem_cache_double_free(); kmem_cache_invalid_free(); + kasan_memchr(); + kasan_memcmp(); + kasan_strings(); kasan_restore_multi_shot(multishot); From 1f6904f72937d1867a5d62b124231069d7e71ca9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 26 Oct 2018 15:02:38 -0700 Subject: [PATCH 007/132] scripts/tags.sh: add DECLARE_HASHTABLE() In addition to DEFINE_HASHTABLE() add DECLARE_ variant. 
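For background, DECLARE_HASHTABLE() only declares the bucket array (typically embedded in a structure and initialized later with hash_init()), while DEFINE_HASHTABLE() both defines and initializes it, so both spellings are worth indexing. A short kernel-side sketch, assuming the usual include/linux/hashtable.h macros; the structure and names below are illustrative only:

#include <linux/hashtable.h>
#include <linux/spinlock.h>

/* DECLARE_HASHTABLE(): just the storage, e.g. embedded in a structure... */
struct conn_table {
	DECLARE_HASHTABLE(buckets, 6);		/* 2^6 = 64 hlist heads, not yet initialized */
	spinlock_t lock;
};

/* ...which has to be set up explicitly before use. */
static void conn_table_init(struct conn_table *t)
{
	hash_init(t->buckets);
	spin_lock_init(&t->lock);
}

/* DEFINE_HASHTABLE(): storage plus static initialization in one step. */
static DEFINE_HASHTABLE(global_buckets, 6);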
Link: http://lkml.kernel.org/r/153683203215.13678.11468076350083405643.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Andrew Morton Cc: Masahiro Yamada Cc: Constantine Shulyupin Cc: Arend van Spriel Cc: Daniel Borkmann Cc: Joey Pabalinas Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/tags.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tags.sh b/scripts/tags.sh index 26de7d5aa5c8..4fa070f9231a 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -203,7 +203,7 @@ regex_c=( '/\ Date: Fri, 26 Oct 2018 15:02:41 -0700 Subject: [PATCH 008/132] ocfs2/dlm: remove unnecessary parentheses Clang warns when more than one set of parentheses is used for a single conditional statement: fs/ocfs2/dlm/dlmthread.c:534:18: warning: equality comparison with extraneous parentheses [-Wparentheses-equality] if ((res->owner == dlm->node_num)) { ~~~~~~~~~~~^~~~~~~~~~~~~~~~ fs/ocfs2/dlm/dlmthread.c:534:18: note: remove extraneous parentheses around the comparison to silence this warning if ((res->owner == dlm->node_num)) { ~ ^ ~ Link: http://lkml.kernel.org/r/20180924181929.6853-1-natechancellor@gmail.com Signed-off-by: Nathan Chancellor Reported-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmthread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 838a06d4066a..074d5de17bb2 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -531,7 +531,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); /* don't shuffle secondary queues */ - if ((res->owner == dlm->node_num)) { + if (res->owner == dlm->node_num) { if (res->state & (DLM_LOCK_RES_MIGRATING | DLM_LOCK_RES_BLOCK_DIRTY)) return; From 2de24cb742d4f0c41358aa078bed7f089c827ac7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 26 Oct 2018 15:02:45 -0700 Subject: [PATCH 009/132] ocfs2: remove unused pointer 'eb' Pointer 'eb' is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'eb' set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/20180828141907.10826-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a342f008e42f..d1cbb27808e2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5106,8 +5106,6 @@ int ocfs2_split_extent(handle_t *handle, * rightmost extent list. */ if (path->p_tree_depth) { - struct ocfs2_extent_block *eb; - ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); @@ -5115,8 +5113,6 @@ int ocfs2_split_extent(handle_t *handle, mlog_errno(ret); goto out; } - - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; } if (rec->e_cpos == split_rec->e_cpos && From 0ae1c2dbdccc090762eb3c89183d7fc80dafca76 Mon Sep 17 00:00:00 2001 From: Ding Xiang Date: Fri, 26 Oct 2018 15:02:48 -0700 Subject: [PATCH 010/132] ocfs2: remove unneeded null check Null check for kfree is unnecessary, so remove it. 
Link: http://lkml.kernel.org/r/1535704514-26559-1-git-send-email-dingxiang@cmss.chinamobile.com Signed-off-by: Ding Xiang Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 302cd7caa4a7..da578ad4c08f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1392,8 +1392,7 @@ retry: unlock: spin_unlock(&oi->ip_lock); out: - if (new) - kfree(new); + kfree(new); return ret; } From 999865764f5f128896402572b439269acb471022 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 26 Oct 2018 15:02:52 -0700 Subject: [PATCH 011/132] fs/ocfs2/dlm/dlmdebug.c: fix a sleep-in-atomic-context bug in dlm_print_one_mle() The kernel module may sleep with holding a spinlock. The function call paths (from bottom to top) in Linux-4.16 are: [FUNC] get_zeroed_page(GFP_NOFS) fs/ocfs2/dlm/dlmdebug.c, 332: get_zeroed_page in dlm_print_one_mle fs/ocfs2/dlm/dlmmaster.c, 240: dlm_print_one_mle in __dlm_put_mle fs/ocfs2/dlm/dlmmaster.c, 255: __dlm_put_mle in dlm_put_mle fs/ocfs2/dlm/dlmmaster.c, 254: spin_lock in dlm_put_ml [FUNC] get_zeroed_page(GFP_NOFS) fs/ocfs2/dlm/dlmdebug.c, 332: get_zeroed_page in dlm_print_one_mle fs/ocfs2/dlm/dlmmaster.c, 240: dlm_print_one_mle in __dlm_put_mle fs/ocfs2/dlm/dlmmaster.c, 222: __dlm_put_mle in dlm_put_mle_inuse fs/ocfs2/dlm/dlmmaster.c, 219: spin_lock in dlm_put_mle_inuse To fix this bug, GFP_NOFS is replaced with GFP_ATOMIC. This bug is found by my static analysis tool DSAC. Link: http://lkml.kernel.org/r/20180901112528.27025-1-baijiaju1990@gmail.com Signed-off-by: Jia-Ju Bai Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdebug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 9b984cae4c4e..1d6dc8422899 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) { char *buf; - buf = (char *) get_zeroed_page(GFP_NOFS); + buf = (char *) get_zeroed_page(GFP_ATOMIC); if (buf) { dump_mle(mle, buf, PAGE_SIZE - 1); free_page((unsigned long)buf); From 867632d6a6126a9ef3e8d8015423d90f8613f2ef Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 26 Oct 2018 15:02:56 -0700 Subject: [PATCH 012/132] ocfs2: remove set but not used variable 'rb' Fixes gcc '-Wunused-but-set-variable' warning: fs/ocfs2/refcounttree.c: In function 'ocfs2_create_reflink_node': fs/ocfs2/refcounttree.c:4138:31: warning: variable 'rb' set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1536198443-113047-1-git-send-email-yuehaibing@huawei.com Signed-off-by: YueHaibing Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7a5ee145c733..1114ef02e780 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4135,7 +4135,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, struct buffer_head *ref_root_bh = NULL; struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); - struct 
ocfs2_refcount_block *rb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; struct ocfs2_refcount_tree *ref_tree; @@ -4162,7 +4161,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, mlog_errno(ret); goto out; } - rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, &ref_tree->rf_ci, ref_root_bh, From 5780a02fd1e87641ad6a8dd6891a1e890cf45c5d Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Fri, 26 Oct 2018 15:02:59 -0700 Subject: [PATCH 013/132] fs/iomap.c: change return type to vm_fault_t Change iomap_page_mkwrite() return type to vm_fault_t. see commit 1c8f422059ae ("mm: change return type to vm_fault_t") for reference. Link: http://lkml.kernel.org/r/20180827172050.GA18673@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/iomap.c | 2 +- include/linux/iomap.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..90c2febc93ac 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1057,7 +1057,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, return length; } -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 3555d54bf79a..9a4258154b25 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -6,6 +6,7 @@ #include #include #include +#include struct address_space; struct fiemap_extent_info; @@ -141,7 +142,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, + const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, From 253cc22fc6a1245472b12e2162df00c5817402b7 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Oct 2018 15:03:02 -0700 Subject: [PATCH 014/132] xtensa: use generic vga.h What xtensa has in asm/vga.h is the same as what can be found in asm-generic/vga.h. So use the latter header. 
Link: http://lkml.kernel.org/r/20180907132219.12979-1-jslaby@suse.cz Signed-off-by: Jiri Slaby Acked-by: Max Filippov Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/xtensa/include/asm/Kbuild | 1 + arch/xtensa/include/asm/vga.h | 19 ------------------- 2 files changed, 1 insertion(+), 19 deletions(-) delete mode 100644 arch/xtensa/include/asm/vga.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 82c756431b49..3310adecafb0 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -26,5 +26,6 @@ generic-y += rwsem.h generic-y += sections.h generic-y += topology.h generic-y += trace_clock.h +generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h diff --git a/arch/xtensa/include/asm/vga.h b/arch/xtensa/include/asm/vga.h deleted file mode 100644 index 1fd8cab3a297..000000000000 --- a/arch/xtensa/include/asm/vga.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * include/asm-xtensa/vga.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_VGA_H -#define _XTENSA_VGA_H - -#define VGA_MAP_MEM(x,s) (unsigned long)phys_to_virt(x) - -#define vga_readb(x) (*(x)) -#define vga_writeb(x,y) (*(y) = (x)) - -#endif From 0684e6526edfb4debf0a0a884834bb1a104085dc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 26 Oct 2018 15:03:06 -0700 Subject: [PATCH 015/132] mm/slub.c: switch to bitmap_zalloc() Switch to bitmap_zalloc() to show clearly what we are allocating. Besides that it returns pointer of bitmap type instead of opaque void *. Link: http://lkml.kernel.org/r/20180830104301.61649-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Christoph Lameter Reviewed-by: Andrew Morton Tested-by: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 8da34a8af53d..37e82a0538aa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3621,9 +3621,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects), - sizeof(long), - GFP_ATOMIC); + unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC); if (!map) return; slab_err(s, page, text, s->name); @@ -3638,7 +3636,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } slab_unlock(page); - kfree(map); + bitmap_free(map); #endif } @@ -4411,10 +4409,8 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - sizeof(unsigned long), - GFP_KERNEL); struct kmem_cache_node *n; + unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); if (!map) return -ENOMEM; @@ -4422,7 +4418,7 @@ static long validate_slab_cache(struct kmem_cache *s) flush_all(s); for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - kfree(map); + bitmap_free(map); return count; } /* @@ -4573,14 +4569,12 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - 
sizeof(unsigned long), - GFP_KERNEL); struct kmem_cache_node *n; + unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { - kfree(map); + bitmap_free(map); return sprintf(buf, "Out of memory\n"); } /* Push back cpu slabs */ @@ -4646,7 +4640,7 @@ static int list_locations(struct kmem_cache *s, char *buf, } free_loc_track(&t); - kfree(map); + bitmap_free(map); if (!t.count) len += sprintf(buf, "No data\n"); return len; From 61448479a9f2c954cde0cfe778cb6bec5d0a748d Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 26 Oct 2018 15:03:12 -0700 Subject: [PATCH 016/132] mm: don't warn about large allocations for slab Slub does not call kmalloc_slab() for sizes > KMALLOC_MAX_CACHE_SIZE, instead it falls back to kmalloc_large(). For slab KMALLOC_MAX_CACHE_SIZE == KMALLOC_MAX_SIZE and it calls kmalloc_slab() for all allocations relying on NULL return value for over-sized allocations. This inconsistency leads to unwanted warnings from kmalloc_slab() for over-sized allocations for slab. Returning NULL for failed allocations is the expected behavior. Make slub and slab code consistent by checking size > KMALLOC_MAX_CACHE_SIZE in slab before calling kmalloc_slab(). While we are here also fix the check in kmalloc_slab(). We should check against KMALLOC_MAX_CACHE_SIZE rather than KMALLOC_MAX_SIZE. It all kinda worked because for slab the constants are the same, and slub always checks the size against KMALLOC_MAX_CACHE_SIZE before kmalloc_slab(). But if we get there with size > KMALLOC_MAX_CACHE_SIZE anyhow bad things will happen. For example, in case of a newly introduced bug in slub code. Also move the check in kmalloc_slab() from function entry to the size > 192 case. This partially compensates for the additional check in slab code and makes slub code a bit faster (at least theoretically). Also drop __GFP_NOWARN in the warning check. This warning means a bug in slab code itself, user-passed flags have nothing to do with it. Nothing of this affects slob. 
Link: http://lkml.kernel.org/r/20180927171502.226522-1-dvyukov@gmail.com Signed-off-by: Dmitry Vyukov Reported-by: syzbot+87829a10073277282ad1@syzkaller.appspotmail.com Reported-by: syzbot+ef4e8fc3a06e9019bb40@syzkaller.appspotmail.com Reported-by: syzbot+6e438f4036df52cbb863@syzkaller.appspotmail.com Reported-by: syzbot+8574471d8734457d98aa@syzkaller.appspotmail.com Reported-by: syzbot+af1504df0807a083dbd9@syzkaller.appspotmail.com Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++++ mm/slab_common.c | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index aa76a70e087e..d73c7a4820a4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3675,6 +3675,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) struct kmem_cache *cachep; void *ret; + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; @@ -3710,6 +3712,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, struct kmem_cache *cachep; void *ret; + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; diff --git a/mm/slab_common.c b/mm/slab_common.c index fea3376f9816..3a7ac4f15194 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1027,18 +1027,18 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { unsigned int index; - if (unlikely(size > KMALLOC_MAX_SIZE)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - if (size <= 192) { if (!size) return ZERO_SIZE_PTR; index = size_index[size_index_elem(size)]; - } else + } else { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + WARN_ON(1); + return NULL; + } index = fls(size - 1); + } #ifdef CONFIG_ZONE_DMA if (unlikely((flags & GFP_DMA))) From c5fd3ca06b4699e251b4a1fb808c2d5124494101 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 26 Oct 2018 15:03:15 -0700 Subject: [PATCH 017/132] slub: extend slub debug to handle multiple slabs Extend the slub_debug syntax to "slub_debug=[,]*", where may contain an asterisk at the end. For example, the following would poison all kmalloc slabs: slub_debug=P,kmalloc* and the following would apply the default flags to all kmalloc and all block IO slabs: slub_debug=,bio*,kmalloc* Please note that a similar patch was posted by Iliyan Malchev some time ago but was never merged: https://marc.info/?l=linux-mm&m=131283905330474&w=2 Link: http://lkml.kernel.org/r/20180928111139.27962-1-atomlin@redhat.com Signed-off-by: Aaron Tomlin Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Iliyan Malchev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/slub.rst | 12 +++++++--- mm/slub.c | 50 ++++++++++++++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst index 3a775fd64e2d..195928808bac 100644 --- a/Documentation/vm/slub.rst +++ b/Documentation/vm/slub.rst @@ -36,9 +36,10 @@ debugging is enabled. Format: slub_debug= Enable options for all slabs -slub_debug=, - Enable options only for select slabs +slub_debug=,,,... + Enable options only for select slabs (no spaces + after a comma) Possible debug options are:: @@ -62,7 +63,12 @@ Trying to find an issue in the dentry cache? 
Try:: slub_debug=,dentry -to only enable debugging on the dentry cache. +to only enable debugging on the dentry cache. You may use an asterisk at the +end of the slab name, in order to cover all slabs with the same prefix. For +example, here's how you can poison the dentry cache as well as all kmalloc +slabs: + + slub_debug=P,kmalloc-*,dentry Red zoning and tracking may realign the slab. We can just apply sanity checks to the dentry cache with:: diff --git a/mm/slub.c b/mm/slub.c index 37e82a0538aa..18bd07daf4e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1276,16 +1276,54 @@ out: __setup("slub_debug", setup_slub_debug); +/* + * kmem_cache_flags - apply debugging options to the cache + * @object_size: the size of an object without meta data + * @flags: flags to set + * @name: name of the cache + * @ctor: constructor function + * + * Debug option(s) are applied to @flags. In addition to the debug + * option(s), if a slab name (or multiple) is specified i.e. + * slub_debug=,, ... + * then only the select slabs will receive the debug option(s). + */ slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { - /* - * Enable debugging if selected on the kernel commandline. - */ - if (slub_debug && (!slub_debug_slabs || (name && - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) - flags |= slub_debug; + char *iter; + size_t len; + + /* If slub_debug = 0, it folds into the if conditional. */ + if (!slub_debug_slabs) + return flags | slub_debug; + + len = strlen(name); + iter = slub_debug_slabs; + while (*iter) { + char *end, *glob; + size_t cmplen; + + end = strchr(iter, ','); + if (!end) + end = iter + strlen(iter); + + glob = strnchr(iter, end - iter, '*'); + if (glob) + cmplen = glob - iter; + else + cmplen = max_t(size_t, len, (end - iter)); + + if (!strncmp(name, iter, cmplen)) { + flags |= slub_debug; + break; + } + + if (!*end) + break; + iter = end + 1; + } return flags; } From 9b6f7e163cd0f468d1b9696b785659d3c27c8667 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Oct 2018 15:03:19 -0700 Subject: [PATCH 018/132] mm: rework memcg kernel stack accounting If CONFIG_VMAP_STACK is set, kernel stacks are allocated using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel stack pages are charged against corresponding memory cgroups on allocation and uncharged on releasing them. The problem is that we do cache kernel stacks in small per-cpu caches and do reuse them for new tasks, which can belong to different memory cgroups. Each stack page still holds a reference to the original cgroup, so the cgroup can't be released until the vmap area is released. To make this happen we need more than two subsequent exits without forks in between on the current cpu, which makes it very unlikely to happen. As a result, I saw a significant number of dying cgroups (in theory, up to 2 * number_of_cpu + number_of_tasks), which can't be released even by significant memory pressure. As a cgroup structure can take a significant amount of memory (first of all, per-cpu data like memcg statistics), it leads to a noticeable waste of memory. 
Link: http://lkml.kernel.org/r/20180827162621.30187-1-guro@fb.com Fixes: ac496bf48d97 ("fork: Optimize task creation by caching two thread stacks per CPU if CONFIG_VMAP_STACK=y") Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Andy Lutomirski Cc: Konstantin Khlebnikov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 13 ++++++++- kernel/fork.c | 55 +++++++++++++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 652f602167df..4399cc3f00e4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1268,10 +1268,11 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); void memcg_kmem_put_cache(struct kmem_cache *cachep); int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg); + +#ifdef CONFIG_MEMCG_KMEM int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void memcg_kmem_uncharge(struct page *page, int order); -#ifdef CONFIG_MEMCG_KMEM extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1307,6 +1308,16 @@ extern int memcg_expand_shrinker_maps(int new_id); extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else + +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + return 0; +} + +static inline void memcg_kmem_uncharge(struct page *page, int order) +{ +} + #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) diff --git a/kernel/fork.c b/kernel/fork.c index f0b58479534f..3c719fec46c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -223,9 +223,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } + /* + * Allocated stacks are cached and later reused by new threads, + * so memcg accounting is performed manually on assigning/releasing + * stacks to tasks. Drop __GFP_ACCOUNT. + */ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, + THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -248,9 +253,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK - if (task_stack_vm_area(tsk)) { + struct vm_struct *vm = task_stack_vm_area(tsk); + + if (vm) { int i; + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + -(int)(PAGE_SIZE / 1024)); + + memcg_kmem_uncharge(vm->pages[i], 0); + } + for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], NULL, tsk->stack_vm_area) != NULL) @@ -351,10 +366,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account) NR_KERNEL_STACK_KB, PAGE_SIZE / 1024 * account); } - - /* All stack pages belong to the same memcg. 
*/ - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -370,6 +381,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } +static int memcg_charge_kernel_stack(struct task_struct *tsk) +{ +#ifdef CONFIG_VMAP_STACK + struct vm_struct *vm = task_stack_vm_area(tsk); + int ret; + + if (vm) { + int i; + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + /* + * If memcg_kmem_charge() fails, page->mem_cgroup + * pointer is NULL, and both memcg_kmem_uncharge() + * and mod_memcg_page_state() in free_thread_stack() + * will ignore this page. So it's safe. + */ + ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0); + if (ret) + return ret; + + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + PAGE_SIZE / 1024); + } + } +#endif + return 0; +} + static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(tsk->state != TASK_DEAD)) @@ -807,6 +847,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + if (memcg_charge_kernel_stack(tsk)) + goto free_stack; + stack_vm_area = task_stack_vm_area(tsk); err = arch_dup_task_struct(tsk, orig); From 591edfb10a949d635ed770c6e85ec5286206c07e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Oct 2018 15:03:23 -0700 Subject: [PATCH 019/132] mm: drain memcg stocks on css offlining Memcg charge is batched using per-cpu stocks, so an offline memcg can be pinned by a cached charge up to a moment, when a process belonging to some other cgroup will charge some memory on the same cpu. In other words, cached charges can prevent a memory cgroup from being reclaimed for some time, without any clear need. Let's optimize it by explicit draining of all stocks on css offlining. As draining is performed asynchronously, and is skipped if any parallel draining is happening, it's cheap. Link: http://lkml.kernel.org/r/20180827162621.30187-2-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Konstantin Khlebnikov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e79cb59552d9..fcec9b39e2a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4573,6 +4573,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + drain_all_stock(memcg); + mem_cgroup_id_put(memcg); } From 68600f623d69da428c6163275f97ca126e1a8ec5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Oct 2018 15:03:27 -0700 Subject: [PATCH 020/132] mm: don't miss the last page because of round-off error I've noticed, that dying memory cgroups are often pinned in memory by a single pagecache page. Even under moderate memory pressure they sometimes stayed in such state for a long time. That looked strange. My investigation showed that the problem is caused by applying the LRU pressure balancing math: scan = div64_u64(scan * fraction[lru], denominator), where denominator = fraction[anon] + fraction[file] + 1. Because fraction[lru] is always less than denominator, if the initial scan size is 1, the result is always 0. This means the last page is not scanned and has no chances to be reclaimed. Fix this by rounding up the result of the division. In practice this change significantly improves the speed of dying cgroups reclaim. 
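The arithmetic is easy to see in isolation: fraction[lru] is strictly smaller than the denominator, so a scan target of 1 always truncates to 0 under plain integer division. A small user-space sketch with assumed numbers, mirroring (but not copying) the kernel's DIV64_U64_ROUND_UP():

#include <stdint.h>
#include <stdio.h>

/* User-space mirror of the kernel macro: round up, evaluating 'd' only once. */
#define DIV64_U64_ROUND_UP(ll, d) \
	({ uint64_t _tmp = (d); ((ll) + _tmp - 1) / _tmp; })

int main(void)
{
	uint64_t scan = 1;			/* one last page left on the LRU */
	uint64_t fraction_file = 30, fraction_anon = 70;
	uint64_t denominator = fraction_file + fraction_anon + 1;

	/* Old math: truncates to 0, so that last page is never scanned. */
	printf("rounded down: %llu\n",
	       (unsigned long long)(scan * fraction_file / denominator));

	/* New math: rounds up, so at least one page is scanned. */
	printf("rounded up:   %llu\n",
	       (unsigned long long)DIV64_U64_ROUND_UP(scan * fraction_file,
						      denominator));
	return 0;
}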
[guro@fb.com: prevent double calculation of DIV64_U64_ROUND_UP() arguments] Link: http://lkml.kernel.org/r/20180829213311.GA13501@castle Link: http://lkml.kernel.org/r/20180827162621.30187-3-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Rik van Riel Cc: Konstantin Khlebnikov Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/math64.h | 3 +++ mm/vmscan.c | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/math64.h b/include/linux/math64.h index 837f2f2d1d34..bb2c84afb80c 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -281,4 +281,7 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) } #endif /* mul_u64_u32_div */ +#define DIV64_U64_ROUND_UP(ll, d) \ + ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) + #endif /* _LINUX_MATH64_H */ diff --git a/mm/vmscan.c b/mm/vmscan.c index c5ef7240cbcb..961401c46334 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2456,9 +2456,11 @@ out: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. + * Make sure we don't miss the last page + * because of a round-off error. */ - scan = div64_u64(scan * fraction[file], - denominator); + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); break; case SCAN_FILE: case SCAN_ANON: From 15f570bf3d13aa94a97234538a5110d18df03aa3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2018 15:03:31 -0700 Subject: [PATCH 021/132] mm,page_alloc: PF_WQ_WORKER threads must sleep at should_reclaim_retry() Tetsuo Handa has reported that it is possible to bypass the short sleep for PF_WQ_WORKER threads which was introduced by commit 373ccbe5927034b5 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress") and lock up the system if OOM. The primary reason is that WQ_MEM_RECLAIM WQs are not guaranteed to run even when they have a rescuer available. Those workers might be essential for reclaim to make a forward progress, however. If we are too unlucky all the allocations requests can get stuck waiting for a WQ_MEM_RECLAIM work item and the system is essentially stuck in an OOM condition without much hope to move on. Tetsuo has seen the reclaim stuck on drain_local_pages_wq or xlog_cil_push_work (xfs). There might be others. Since should_reclaim_retry() should be a natural reschedule point, let's do the short sleep for PF_WQ_WORKER threads unconditionally in order to guarantee that other pending work items are started. This will workaround this problem and it is less fragile than hunting down when the sleep is missed. Having a single sleeping point is more robust. 
[akpm@linux-foundation.org: reflow comment to 80 cols to save a couple of lines] Link: http://lkml.kernel.org/r/20180827135101.15700-1-mhocko@kernel.org Signed-off-by: Michal Hocko Debugged-by: Tetsuo Handa Reported-by: Tetsuo Handa Reviewed-by: Andrew Morton Cc: Roman Gushchin Cc: Johannes Weiner Cc: Vladimir Davydov Cc: David Rientjes Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2ef1c17942f..7e2fb6e38f34 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3922,6 +3922,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, { struct zone *zone; struct zoneref *z; + bool ret = false; /* * Costly allocations might have made a progress but this doesn't mean @@ -3985,25 +3986,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, } } - /* - * Memory allocation/reclaim might be called from a WQ - * context and the current implementation of the WQ - * concurrency control doesn't recognize that - * a particular WQ is congested if the worker thread is - * looping without ever sleeping. Therefore we have to - * do a short sleep here rather than calling - * cond_resched(). - */ - if (current->flags & PF_WQ_WORKER) - schedule_timeout_uninterruptible(1); - else - cond_resched(); - - return true; + ret = true; + goto out; } } - return false; +out: + /* + * Memory allocation/reclaim might be called from a WQ context and the + * current implementation of the WQ concurrency control doesn't + * recognize that a particular WQ is congested if the worker thread is + * looping without ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); + return ret; } static inline bool From 33490af3f5c15757448b6c454ca93b48a333aa1b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2018 15:03:35 -0700 Subject: [PATCH 022/132] mm, mmu_notifier: be explicit about range invalition non-blocking mode If invalidate_range_start() is called for !blocking mode then all callbacks have to guarantee they will no block/sleep. The same obviously applies to invalidate_range_end because this operation pairs with the former and they are called from the same context. Make sure this is appropriately documented. Link: http://lkml.kernel.org/r/20180827112623.8992-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Jerome Glisse Cc: Boris Ostrovsky Cc: David Rientjes Cc: Juergen Gross Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 133ba78820ee..698e371aafe3 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -153,7 +153,9 @@ struct mmu_notifier_ops { * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN. 0 should be returned - * otherwise. + * otherwise. Please note that if invalidate_range_start approves + * a non-blocking behavior then the same applies to + * invalidate_range_end. 
* */ int (*invalidate_range_start)(struct mmu_notifier *mn, From 4e15a073a168b62311db911a55c4d4f1500c2821 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2018 15:03:39 -0700 Subject: [PATCH 023/132] Revert "mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks" Revert 5ff7091f5a2ca ("mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks"). MMU_INVALIDATE_DOES_NOT_BLOCK flags was the only one used and it is no longer needed since 93065ac753e4 ("mm, oom: distinguish blockable mode for mmu notifiers"). We now have a full support for per range !blocking behavior so we can drop the stop gap workaround which the per notifier flag was used for. Link: http://lkml.kernel.org/r/20180827112623.8992-4-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: David Rientjes Cc: Boris Ostrovsky Cc: Jerome Glisse Cc: Juergen Gross Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/hfi1/mmu_rb.c | 1 - drivers/iommu/amd_iommu_v2.c | 1 - drivers/iommu/intel-svm.c | 1 - drivers/misc/sgi-gru/grutlbpurge.c | 1 - include/linux/mmu_notifier.h | 23 --------------------- mm/mmu_notifier.c | 31 ----------------------------- virt/kvm/kvm_main.c | 1 - 7 files changed, 59 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e1c7996c018e..475b769e120c 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -77,7 +77,6 @@ static void do_remove(struct mmu_rb_handler *handler, static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = mmu_notifier_range_start, }; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 58da65df03f5..fd552235bd13 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -427,7 +427,6 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = mn_release, .clear_flush_young = mn_clear_flush_young, .invalidate_range = mn_invalidate_range, diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 4a03e5090952..db301efe126d 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -273,7 +273,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops intel_mmuops = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = intel_mm_release, .change_pte = intel_change_pte, .invalidate_range = intel_invalidate_range, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index be28f05bfafa..03b49d52092e 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -261,7 +261,6 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops gru_mmuops = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = gru_invalidate_range_start, .invalidate_range_end = gru_invalidate_range_end, .release = gru_release, diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 698e371aafe3..9893a6432adf 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -2,7 +2,6 @@ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H -#include #include #include #include @@ -11,9 +10,6 @@ struct 
mmu_notifier; struct mmu_notifier_ops; -/* mmu_notifier_ops flags */ -#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) - #ifdef CONFIG_MMU_NOTIFIER /* @@ -30,15 +26,6 @@ struct mmu_notifier_mm { }; struct mmu_notifier_ops { - /* - * Flags to specify behavior of callbacks for this MMU notifier. - * Used to determine which context an operation may be called. - * - * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not - * block - */ - int flags; - /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are @@ -183,10 +170,6 @@ struct mmu_notifier_ops { * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. - * - * If this callback cannot block, and invalidate_range_{start,end} - * cannot block, mmu_notifier_ops.flags should have - * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -241,7 +224,6 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); -extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -495,11 +477,6 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } -static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) -{ - return false; -} - static inline void mmu_notifier_mm_init(struct mm_struct *mm) { } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 82bb1a939c0e..5119ff846769 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -247,37 +247,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); -/* - * Must be called while holding mm->mmap_sem for either read or write. - * The result is guaranteed to be valid until mm->mmap_sem is dropped. 
- */ -bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) -{ - struct mmu_notifier *mn; - int id; - bool ret = false; - - WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); - - if (!mm_has_notifiers(mm)) - return ret; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (!mn->ops->invalidate_range && - !mn->ops->invalidate_range_start && - !mn->ops->invalidate_range_end) - continue; - - if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { - ret = true; - break; - } - } - srcu_read_unlock(&srcu, id); - return ret; -} - static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 786ade1843a2..2679e476b6c3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -497,7 +497,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, From 154221c3e52083d9f54fa58b4e1090264969f6bc Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Fri, 26 Oct 2018 15:03:42 -0700 Subject: [PATCH 024/132] kmemleak: add module param to print warnings to dmesg Currently, kmemleak only prints the number of suspected leaks to dmesg but requires the user to read a debugfs file to get the actual stack traces of the objects' allocation points. Add a module option to print the full object information to dmesg too. It can be enabled with kmemleak.verbose=1 on the kernel command line, or "echo 1 > /sys/module/kmemleak/parameters/verbose": This allows easier integration of kmemleak into test systems: We have automated test infrastructure to test our Linux systems. With this option, running our tests with kmemleak is as simple as enabling kmemleak and passing this command line option; the test infrastructure knows how to save kernel logs, which will now include kmemleak reports. Without this option, the test infrastructure needs to be specifically taught to read out the kmemleak debugfs file. Removing this need for special handling makes kmemleak more similar to other kernel debug options (slab debugging, debug objects, etc). Link: http://lkml.kernel.org/r/20180903144046.21023-1-vincent.whitchurch@axis.com Signed-off-by: Vincent Whitchurch Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 17dd883198ae..4f7e4b5a2f08 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include @@ -181,6 +182,7 @@ struct kmemleak_object { /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +#define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ #define HEX_ROW_SIZE 16 /* number of bytes to print at a time (1, 2, 4, 8) */ @@ -235,6 +237,9 @@ static int kmemleak_skip_disable; /* If there are leaks that can be reported */ static bool kmemleak_found_leaks; +static bool kmemleak_verbose; +module_param_named(verbose, kmemleak_verbose, bool, 0600); + /* * Early object allocation/freeing logging. Kmemleak is initialized after the * kernel allocator. 
However, both the kernel allocator and kmemleak may @@ -299,6 +304,25 @@ static void kmemleak_disable(void); kmemleak_disable(); \ } while (0) +#define warn_or_seq_printf(seq, fmt, ...) do { \ + if (seq) \ + seq_printf(seq, fmt, ##__VA_ARGS__); \ + else \ + pr_warn(fmt, ##__VA_ARGS__); \ +} while (0) + +static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type, + int rowsize, int groupsize, const void *buf, + size_t len, bool ascii) +{ + if (seq) + seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize, + buf, len, ascii); + else + print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type, + rowsize, groupsize, buf, len, ascii); +} + /* * Printing of the objects hex dump to the seq file. The number of lines to be * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The @@ -314,10 +338,10 @@ static void hex_dump_object(struct seq_file *seq, /* limit the number of lines to HEX_MAX_LINES */ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); - seq_printf(seq, " hex dump (first %zu bytes):\n", len); + warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); kasan_disable_current(); - seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE, - HEX_GROUP_SIZE, ptr, len, HEX_ASCII); + warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, + HEX_GROUP_SIZE, ptr, len, HEX_ASCII); kasan_enable_current(); } @@ -365,17 +389,17 @@ static void print_unreferenced(struct seq_file *seq, int i; unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); - seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", + warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", object->pointer, object->size); - seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", + warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", object->comm, object->pid, object->jiffies, msecs_age / 1000, msecs_age % 1000); hex_dump_object(seq, object); - seq_printf(seq, " backtrace:\n"); + warn_or_seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { void *ptr = (void *)object->trace[i]; - seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); + warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); } } @@ -1598,6 +1622,10 @@ static void kmemleak_scan(void) if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; + + if (kmemleak_verbose) + print_unreferenced(NULL, object); + new_leaks++; } spin_unlock_irqrestore(&object->lock, flags); From bcd49e86710b42f15c7512de594d23b3ae0b21d7 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 26 Oct 2018 15:03:46 -0700 Subject: [PATCH 025/132] mm/swapfile.c: use __try_to_reclaim_swap() in free_swap_and_cache() The code path to reclaim the swap entry in free_swap_and_cache() is almost same as that of __try_to_reclaim_swap(). The largest difference is just coding style. So the support to the additional requirement of free_swap_and_cache() is added into __try_to_reclaim_swap(). free_swap_and_cache() is changed to call __try_to_reclaim_swap(), and delete the duplicated code. This will improve code readability and reduce the potential bugs. There are 2 functionality differences between __try_to_reclaim_swap() and swap entry reclaim code of free_swap_and_cache(). - free_swap_and_cache() only reclaims the swap entry if the page is unmapped or swap is getting full. The support has been added into __try_to_reclaim_swap(). 
- try_to_free_swap() (called by __try_to_reclaim_swap()) checks pm_suspended_storage(), while free_swap_and_cache() not. I think this is OK. Because the page and the swap entry can be reclaimed later eventually. Link: http://lkml.kernel.org/r/20180827075535.17406-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Andrew Morton Cc: Dave Hansen Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 57 ++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index d954b71c4f9c..0d44179213ed 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -103,26 +103,39 @@ static inline unsigned char swap_count(unsigned char ent) return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } +/* Reclaim the swap entry anyway if possible */ +#define TTRS_ANYWAY 0x1 +/* + * Reclaim the swap entry if there are no more mappings of the + * corresponding page + */ +#define TTRS_UNMAPPED 0x2 +/* Reclaim the swap entry if swap is getting full*/ +#define TTRS_FULL 0x4 + /* returns 1 if swap entry is freed */ -static int -__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) +static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); struct page *page; int ret = 0; - page = find_get_page(swap_address_space(entry), swp_offset(entry)); + page = find_get_page(swap_address_space(entry), offset); if (!page) return 0; /* - * This function is called from scan_swap_map() and it's called - * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. - * We have to use trylock for avoiding deadlock. This is a special + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, + * here. We have to use trylock for avoiding deadlock. This is a special * case and you should use try_to_free_swap() with explicit lock_page() * in usual operations. */ if (trylock_page(page)) { - ret = try_to_free_swap(page); + if ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) + ret = try_to_free_swap(page); unlock_page(page); } put_page(page); @@ -780,7 +793,7 @@ checks: int swap_was_freed; unlock_cluster(ci); spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) @@ -1612,7 +1625,6 @@ int try_to_free_swap(struct page *page) int free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct *p; - struct page *page = NULL; unsigned char count; if (non_swap_entry(entry)) @@ -1622,31 +1634,12 @@ int free_swap_and_cache(swp_entry_t entry) if (p) { count = __swap_entry_free(p, entry, 1); if (count == SWAP_HAS_CACHE && - !swap_page_trans_huge_swapped(p, entry)) { - page = find_get_page(swap_address_space(entry), - swp_offset(entry)); - if (page && !trylock_page(page)) { - put_page(page); - page = NULL; - } - } else if (!count) + !swap_page_trans_huge_swapped(p, entry)) + __try_to_reclaim_swap(p, swp_offset(entry), + TTRS_UNMAPPED | TTRS_FULL); + else if (!count) free_swap_slot(entry); } - if (page) { - /* - * Not mapped elsewhere, or swap space full? Free it! 
- * Also recheck PageSwapCache now page is locked (above). - */ - if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page)) && - !swap_page_trans_huge_swapped(p, entry)) { - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); - } - unlock_page(page); - put_page(page); - } return p != NULL; } From 10e364da10d73ce4e3b61e1c53319ce93ee51c63 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 26 Oct 2018 15:03:49 -0700 Subject: [PATCH 026/132] mm/swapfile.c: call free_swap_slot() in __swap_entry_free() This is a code cleanup patch without functionality change. Originally, when __swap_entry_free() is called, and its return value is 0, free_swap_slot() will always be called to free the swap entry to the per-CPU pool. So move the call to free_swap_slot() to __swap_entry_free() to simplify the code. Link: http://lkml.kernel.org/r/20180827075535.17406-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Andrew Morton Cc: Dave Hansen Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 0d44179213ed..eaa5e2e55cab 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1182,6 +1182,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, ci = lock_cluster_or_swap_info(p, offset); usage = __swap_entry_free_locked(p, offset, usage); unlock_cluster_or_swap_info(p, ci); + if (!usage) + free_swap_slot(entry); return usage; } @@ -1212,10 +1214,8 @@ void swap_free(swp_entry_t entry) struct swap_info_struct *p; p = _swap_info_get(entry); - if (p) { - if (!__swap_entry_free(p, entry, 1)) - free_swap_slot(entry); - } + if (p) + __swap_entry_free(p, entry, 1); } /* @@ -1637,8 +1637,6 @@ int free_swap_and_cache(swp_entry_t entry) !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), TTRS_UNMAPPED | TTRS_FULL); - else if (!count) - free_swap_slot(entry); } return p != NULL; } From 979aafa5919b106a65646ac5f73f8354c0164e63 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 26 Oct 2018 15:03:53 -0700 Subject: [PATCH 027/132] mm/swapfile.c: clear si->swap_map[] in swap_free_cluster() si->swap_map[] of the swap entries in cluster needs to be cleared during freeing. Previously, this is done in the caller of swap_free_cluster(). This may cause code duplication (one user now, will add more users later) and lock/unlock cluster unnecessarily. In this patch, the clearing code is moved to swap_free_cluster() to avoid the downside. 
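To illustrate the shape of the change (simplified, not the literal hunks): the clearing now happens inside swap_free_cluster(), under the cluster lock it already takes, so the caller's open-coded lock/clear/unlock sequence disappears:

    /* Before: put_swap_page() had to clear the map around the call */
    ci = lock_cluster(si, offset);
    memset(map, 0, SWAPFILE_CLUSTER);
    unlock_cluster(ci);
    swap_free_cluster(si, idx);

    /* After: swap_free_cluster() clears si->swap_map[] itself, under
     * the cluster lock it already holds, so callers just call it */
    swap_free_cluster(si, idx);
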
Link: http://lkml.kernel.org/r/20180827075535.17406-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Andrew Morton Cc: Dave Hansen Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index eaa5e2e55cab..2681e50592c5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -932,6 +932,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) struct swap_cluster_info *ci; ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); cluster_set_count_flag(ci, 0, 0); free_cluster(si, idx); unlock_cluster(ci); @@ -1250,9 +1251,6 @@ void put_swap_page(struct page *page, swp_entry_t entry) if (free_entries == SWAPFILE_CLUSTER) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); - ci = lock_cluster(si, offset); - memset(map, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); swap_free_cluster(si, idx); spin_unlock(&si->lock); From 7b0e0c0e35f5c7eca719404ca0e38bc964657330 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 26 Oct 2018 15:03:58 -0700 Subject: [PATCH 028/132] mm/page_alloc.c: clean up check_for_memory() check_for_memory() looks a bit confusing. First of all, we have this: if (N_MEMORY == N_NORMAL_MEMORY) return; Checking the ENUM declaration, looks like N_MEMORY canot be equal to N_NORMAL_MEMORY. I could not find where N_MEMORY is set to N_NORMAL_MEMORY, or the other way around either, so unless I am missing something, this condition will never evaluate to true. It makes sense to get rid of it. Moving forward, the operations within the loop look a bit confusing as well. We set N_HIGH_MEMORY unconditionally, and then we set N_NORMAL_MEMORY in case we have CONFIG_HIGHMEM (N_NORMAL_MEMORY != N_HIGH_MEMORY) and zone <= ZONE_NORMAL. (N_HIGH_MEMORY falls back to N_NORMAL_MEMORY on !CONFIG_HIGHMEM systems, and that is why we can just go ahead and set N_HIGH_MEMORY unconditionally) Although this works, it is a bit subtle. I think that this could be easier to follow: First, we should only set N_HIGH_MEMORY in case we have CONFIG_HIGHMEM. And then we should set N_NORMAL_MEMORY in case zone <= ZONE_NORMAL, without further checking whether we have CONFIG_HIGHMEM or not. 
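The fallback the last paragraph relies on comes from the node_states enum; roughly (include/linux/nodemask.h, abridged here for reference):

    enum node_states {
        N_POSSIBLE,                 /* node could become online */
        N_ONLINE,                   /* node is online */
        N_NORMAL_MEMORY,            /* node has regular memory */
    #ifdef CONFIG_HIGHMEM
        N_HIGH_MEMORY,              /* node has regular or high memory */
    #else
        N_HIGH_MEMORY = N_NORMAL_MEMORY,
    #endif
        N_MEMORY,                   /* node has memory of any kind */
        N_CPU,                      /* node has one or more cpus */
        NR_NODE_STATES
    };

So N_MEMORY can never alias N_NORMAL_MEMORY, while N_HIGH_MEMORY aliases it exactly when CONFIG_HIGHMEM is disabled, which is what makes the simplification below safe.
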
Link: http://lkml.kernel.org/r/20180828210158.4617-1-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Andrew Morton Cc: Michael Hocko Cc: Vlastimil Babka Cc: Pavel Tatashin Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e2fb6e38f34..747031c2352d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6803,15 +6803,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid) { enum zone_type zone_type; - if (N_MEMORY == N_NORMAL_MEMORY) - return; - for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (populated_zone(zone)) { - node_set_state(nid, N_HIGH_MEMORY); - if (N_NORMAL_MEMORY != N_HIGH_MEMORY && - zone_type <= ZONE_NORMAL) + if (IS_ENABLED(CONFIG_HIGHMEM)) + node_set_state(nid, N_HIGH_MEMORY); + if (zone_type <= ZONE_NORMAL) node_set_state(nid, N_NORMAL_MEMORY); break; } From 4b96a37d1c684f12468f436c0d79b5e6830d0b0b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Fri, 26 Oct 2018 15:04:03 -0700 Subject: [PATCH 029/132] mm: convert to use vm_fault_t As part of vm_fault_t conversion filemap_page_mkwrite() for the NOMMU case was missed. Now converted. Link: http://lkml.kernel.org/r/20180828174952.GA29229@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Andrew Morton Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 52517f28e6f4..de6fed2a0815 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2748,9 +2748,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } #else -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { - return -ENOSYS; + return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { From 7f2764cfbd85a18170f9d7a4cf01454dead8b0bc Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 26 Oct 2018 15:04:06 -0700 Subject: [PATCH 030/132] cramfs: convert to use vmf_insert_mixed cramfs is the only remaining user of vm_insert_mixed() and should be converted to vmf_insert_mixed(). Based on a previous patch from Matthew Wilcox. 
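Since cramfs_physmem_mmap() must still return an errno, the new code maps the vm_fault_t back with vm_fault_to_errno(); that helper is roughly the following (sketched from include/linux/mm.h, not part of this patch):

    static inline int vm_fault_to_errno(int vm_fault, int foll_flags)
    {
        if (vm_fault & VM_FAULT_OOM)
            return -ENOMEM;
        if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
            return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT;
        if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
            return -EFAULT;
        return 0;
    }
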
Link: http://lkml.kernel.org/r/nycvar.YSQ.7.76.1808290945450.10215@knanqh.ubzr Signed-off-by: Nicolas Pitre Reviewed-by: Andrew Morton Cc: Souptick Joarder a Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cramfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index f408994fc632..0c35e62f108d 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -418,9 +418,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) int i; vma->vm_flags |= VM_MIXEDMAP; for (i = 0; i < pages && !ret; i++) { + vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); - ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); + vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn); + if (vmf & VM_FAULT_ERROR) + ret = vm_fault_to_errno(vmf, 0); } } From 5d7476374564645b1a2d299e242ad7b17b1104ee Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:10 -0700 Subject: [PATCH 031/132] mm: remove vm_insert_mixed() All callers are now converted to vmf_insert_mixed() so convert vmf_insert_mixed() from being a compatibility wrapper into the real function. Link: http://lkml.kernel.org/r/20180828145728.11873-3-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +-------------- mm/memory.c | 14 ++++++++++---- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index daa2b8f1e9a8..ecc6f9347756 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2506,7 +2506,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); @@ -2525,19 +2525,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn) -{ - int err = vm_insert_mixed(vma, addr, pfn); - - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { diff --git a/mm/memory.c b/mm/memory.c index 21a5e6e4758b..200aaf291e98 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1693,13 +1693,19 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) { - return __vm_insert_mixed(vma, addr, pfn, false); + int err = __vm_insert_mixed(vma, addr, pfn, false); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_mixed); +EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else 
already added a From f5e6d1d5f8f3080aa7a51acea1f77085f45abe9c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:13 -0700 Subject: [PATCH 032/132] mm: introduce vmf_insert_pfn_prot() Like vm_insert_pfn_prot(), but returns a vm_fault_t instead of an errno. Also unexport vm_insert_pfn_prot as it has no modular users. Link: http://lkml.kernel.org/r/20180828145728.11873-4-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 47 ++++++++++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ecc6f9347756..f1293bdc6de2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2506,6 +2506,8 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, diff --git a/mm/memory.c b/mm/memory.c index 200aaf291e98..b3eecb3aa65f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1596,21 +1596,6 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_pfn); -/** - * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * @pgprot: pgprot flags for the inserted page - * - * This is exactly like vm_insert_pfn, except that it allows drivers to - * to override pgprot on a per-page basis. - * - * This only makes sense for IO mappings, and it makes no sense for - * cow mappings. In general, using multiple vmas is preferable; - * vm_insert_pfn_prot should only be used if using multiple VMAs is - * impractical. - */ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { @@ -1640,7 +1625,37 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, return ret; } -EXPORT_SYMBOL(vm_insert_pfn_prot); + +/** + * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vmf_insert_pfn(), except that it allows drivers to + * to override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * COW mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + * + * Return: vm_fault_t value. 
+ */ +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) +{ + int err = vm_insert_pfn_prot(vma, addr, pfn, pgprot); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL(vmf_insert_pfn_prot); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { From b13fd1dc9f853835784c4c7e09f2059128986adb Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:16 -0700 Subject: [PATCH 033/132] x86: convert vdso to use vm_fault_t Return vm_fault_t codes directly from the appropriate mm routines instead of converting from errnos ourselves. Fixes a minor bug where we'd return SIGBUS instead of the correct OOM code if we ran out of memory allocating page tables. Link: http://lkml.kernel.org/r/20180828145728.11873-5-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Thomas Gleixner Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/entry/vdso/vma.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 3f9d43f26f63..7eb878561910 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -39,7 +39,7 @@ void __init init_vdso_image(const struct vdso_image *image) struct linux_binprm; -static int vdso_fault(const struct vm_special_mapping *sm, +static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { const struct vdso_image *image = vma->vm_mm->context.vdso_image; @@ -84,12 +84,11 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static int vvar_fault(const struct vm_special_mapping *sm, +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { const struct vdso_image *image = vma->vm_mm->context.vdso_image; long sym_offset; - int ret = -EFAULT; if (!image) return VM_FAULT_SIGBUS; @@ -108,29 +107,24 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - ret = vm_insert_pfn(vma, vmf->address, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + return vmf_insert_pfn(vma, vmf->address, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_get_pvti_cpu0_va(); if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { - ret = vm_insert_pfn_prot( - vma, - vmf->address, - __pa(pvti) >> PAGE_SHIFT, - pgprot_decrypted(vma->vm_page_prot)); + return vmf_insert_pfn_prot(vma, vmf->address, + __pa(pvti) >> PAGE_SHIFT, + pgprot_decrypted(vma->vm_page_prot)); } } else if (sym_offset == image->sym_hvclock_page) { struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) - ret = vm_insert_pfn(vma, vmf->address, - vmalloc_to_pfn(tsc_pg)); + return vmf_insert_pfn(vma, vmf->address, + vmalloc_to_pfn(tsc_pg)); } - if (ret == 0 || ret == -EBUSY) - return VM_FAULT_NOPAGE; - return VM_FAULT_SIGBUS; } From bc12e6ad9617831727e4201e7cbf5c3b868cc8fd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:21 -0700 Subject: [PATCH 034/132] mm: make vm_insert_pfn_prot() static Now this is no longer used outside mm/memory.c, make it static. 
Link: http://lkml.kernel.org/r/20180828145728.11873-6-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- mm/memory.c | 50 +++++++++++++++++++++++----------------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f1293bdc6de2..0f5db0455e61 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2504,8 +2504,6 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index b3eecb3aa65f..6365144f8267 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1572,31 +1572,7 @@ out: return retval; } -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_insert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. - * - * vma cannot be a COW mapping. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); -} -EXPORT_SYMBOL(vm_insert_pfn); - -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, +static int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { int ret; @@ -1626,6 +1602,30 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, return ret; } +/** + * vm_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return NULL. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + */ +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to From 67fa1666223d7c825f6651add97f0011fe155f36 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:26 -0700 Subject: [PATCH 035/132] mm: remove references to vm_insert_pfn() Documentation and comments. 
Link: http://lkml.kernel.org/r/20180828145728.11873-7-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/x86/pat.txt | 4 ++-- include/asm-generic/pgtable.h | 4 ++-- include/linux/hmm.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index 2a4ee6302122..481d8d8536ac 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -90,12 +90,12 @@ pci proc | -- | -- | WC | Advanced APIs for drivers ------------------------- A. Exporting pages to users with remap_pfn_range, io_remap_pfn_range, -vm_insert_pfn +vmf_insert_pfn Drivers wanting to export some pages to userspace do it by using mmap interface and a combination of 1) pgprot_noncached() -2) io_remap_pfn_range() or remap_pfn_range() or vm_insert_pfn() +2) io_remap_pfn_range() or remap_pfn_range() or vmf_insert_pfn() With PAT support, a new API pgprot_writecombine is being added. So, drivers can continue to use the above sequence, with either pgprot_noncached() or diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 88ebc6102c7c..5657a20e0c59 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -757,7 +757,7 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, - * vm_insert_pfn. + * vmf_insert_pfn. */ /* @@ -773,7 +773,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, /* * track_pfn_insert is called when a _new_ single pfn is established - * by vm_insert_pfn(). + * by vmf_insert_pfn(). */ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4c92e3ba3e16..dde947083d4e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -107,7 +107,7 @@ enum hmm_pfn_flag_e { * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the - * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not + * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. * From ae2b01f37044c10e975d22116755df56252b09d8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:29 -0700 Subject: [PATCH 036/132] mm: remove vm_insert_pfn() All callers are now converted to vmf_insert_pfn() so convert vmf_insert_pfn() from being a compatibility wrapper around vm_insert_pfn() to being a compatibility wrapper around vmf_insert_pfn_prot(). 
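With the errno-based API gone, a driver fault handler can hand the helper's result straight back to the fault path. A minimal hypothetical example (the mydrv_* names are illustrative, not from this patch):

    static vm_fault_t mydrv_fault(struct vm_fault *vmf)
    {
        struct mydrv_dev *dev = vmf->vma->vm_private_data;
        unsigned long pfn = dev->base_pfn + vmf->pgoff;

        /* vmf_insert_pfn() already returns a vm_fault_t, so no
         * errno translation is needed in the handler anymore */
        return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
    }

    static const struct vm_operations_struct mydrv_vm_ops = {
        .fault = mydrv_fault,
    };
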
Link: http://lkml.kernel.org/r/20180828145728.11873-8-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +------------ mm/memory.c | 54 +++++++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 39 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f5db0455e61..737279bb479c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2502,7 +2502,7 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); @@ -2525,19 +2525,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn) -{ - int err = vm_insert_pfn(vma, addr, pfn); - - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) diff --git a/mm/memory.c b/mm/memory.c index 6365144f8267..08653d0a795a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1602,30 +1602,6 @@ static int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, return ret; } -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_insert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. - * - * vma cannot be a COW mapping. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); -} -EXPORT_SYMBOL(vm_insert_pfn); - /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to @@ -1638,9 +1614,10 @@ EXPORT_SYMBOL(vm_insert_pfn); * * This only makes sense for IO mappings, and it makes no sense for * COW mappings. In general, using multiple vmas is preferable; - * vm_insert_pfn_prot should only be used if using multiple VMAs is + * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. * + * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. 
*/ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, @@ -1657,6 +1634,33 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vmf_insert_pfn_prot); +/** + * vmf_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return the result of this function. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vmf_insert_pfn); + static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { /* these checks mirror the abort conditions in vm_normal_page */ From 6d958546ff611c9ae09b181e628c1c5d5da5ebda Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:33 -0700 Subject: [PATCH 037/132] mm: inline vm_insert_pfn_prot() into caller vm_insert_pfn_prot() is only called from vmf_insert_pfn_prot(), so inline it and convert some of the errnos into vm_fault codes earlier. Link: http://lkml.kernel.org/r/20180828145728.11873-9-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 55 +++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 08653d0a795a..40b692fa4b99 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1572,36 +1572,6 @@ out: return retval; } -static int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, pgprot_t pgprot) -{ - int ret; - /* - * Technically, architectures with pte_special can avoid all these - * restrictions (same for remap_pfn_range). However we would like - * consistency in testing and feature parity among all, so we should - * try to keep these invariants in place for everybody. 
- */ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); - BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == - (VM_PFNMAP|VM_MIXEDMAP)); - BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); - - if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; - - if (!pfn_modify_allowed(pfn, pgprot)) - return -EACCES; - - track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, - false); - - return ret; -} - /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to @@ -1623,7 +1593,30 @@ static int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { - int err = vm_insert_pfn_prot(vma, addr, pfn, pgprot); + int err; + + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range). However we would like + * consistency in testing and feature parity among all, so we should + * try to keep these invariants in place for everybody. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + if (!pfn_modify_allowed(pfn, pgprot)) + return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); + + err = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + false); if (err == -ENOMEM) return VM_FAULT_OOM; From 79f3aa5ba989a1fa6e2ef189f2abdfcee25ba663 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:37 -0700 Subject: [PATCH 038/132] mm: convert __vm_insert_mixed() to vm_fault_t Both of its callers currently convert its errno return into a vm_fault_t, so move the conversion into __vm_insert_mixed(). Link: http://lkml.kernel.org/r/20180828145728.11873-10-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 40b692fa4b99..a016fd1198a9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1668,20 +1668,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) return false; } -static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, bool mkwrite) +static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; + int err; BUG_ON(!vm_mixed_ok(vma, pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; + return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) - return -EACCES; + return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1700,15 +1701,10 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * result in pfn_t_has_page() == false. 
*/ page = pfn_to_page(pfn_t_to_pfn(pfn)); - return insert_page(vma, addr, page, pgprot); + err = insert_page(vma, addr, page, pgprot); + } else { + err = insert_pfn(vma, addr, pfn, pgprot, mkwrite); } - return insert_pfn(vma, addr, pfn, pgprot, mkwrite); -} - -vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) -{ - int err = __vm_insert_mixed(vma, addr, pfn, false); if (err == -ENOMEM) return VM_FAULT_OOM; @@ -1717,6 +1713,12 @@ vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_NOPAGE; } + +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, false); +} EXPORT_SYMBOL(vmf_insert_mixed); /* @@ -1724,18 +1726,10 @@ EXPORT_SYMBOL(vmf_insert_mixed); * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ - vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { - int err; - - err = __vm_insert_mixed(vma, addr, pfn, true); - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - return VM_FAULT_NOPAGE; + return __vm_insert_mixed(vma, addr, pfn, true); } EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); From 9b5a8e00d479bb5e55f6902bf50877c080d9506d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:40 -0700 Subject: [PATCH 039/132] mm: convert insert_pfn() to vm_fault_t All callers convert its errno into a vm_fault_t, so convert it to return a vm_fault_t directly. Link: http://lkml.kernel.org/r/20180828145728.11873-11-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a016fd1198a9..6abc74f41bc0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1520,19 +1520,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); -static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; - int retval; pte_t *pte, entry; spinlock_t *ptl; - retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out; - retval = -EBUSY; + return VM_FAULT_OOM; if (!pte_none(*pte)) { if (mkwrite) { /* @@ -1565,11 +1562,9 @@ out_mkwrite: set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ - retval = 0; out_unlock: pte_unmap_unlock(pte, ptl); -out: - return retval; + return VM_FAULT_NOPAGE; } /** @@ -1593,8 +1588,6 @@ out: vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { - int err; - /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). 
However we would like @@ -1615,15 +1608,8 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - err = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); - - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; } EXPORT_SYMBOL(vmf_insert_pfn_prot); @@ -1703,7 +1689,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, page = pfn_to_page(pfn_t_to_pfn(pfn)); err = insert_page(vma, addr, page, pgprot); } else { - err = insert_pfn(vma, addr, pfn, pgprot, mkwrite); + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } if (err == -ENOMEM) From 426dcd4b600f72d9f797eae385d85af4128589cc Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 26 Oct 2018 15:04:44 -0700 Subject: [PATCH 040/132] hexagon: switch to NO_BOOTMEM Patch series "switch several architectures NO_BOOTMEM". These patches perform conversion to NO_BOOTMEM of hexagon, nios2, uml and unicore32. This patch (of 7): Add registration of the system memory with memblock, eliminate bootmem initialization and convert early memory reservations from bootmem to memblock. Link: http://lkml.kernel.org/r/1533326330-31677-2-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Richard Kuo Cc: Guan Xuetao Cc: Ley Foon Tan Cc: Richard Weinberger Cc: Rob Herring Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/Kconfig | 3 +++ arch/hexagon/mm/init.c | 20 ++++++++------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 3ef46522e89f..7b25d7c8fa49 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -21,6 +21,9 @@ config HEXAGON select GENERIC_IRQ_SHOW select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_MEMBLOCK + select ARCH_DISCARD_MEMBLOCK + select NO_BOOTMEM select NEED_SG_DMA_LENGTH select NO_IOPORT_MAP select GENERIC_IOMAP diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 1495d45e472d..d789b9cc0189 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -176,7 +177,6 @@ size_t hexagon_coherent_pool_size = (size_t) (DMA_RESERVE << 22); void __init setup_arch_memory(void) { - int bootmap_size; /* XXX Todo: this probably should be cleaned up */ u32 *segtable = (u32 *) &swapper_pg_dir[0]; u32 *segtable_end; @@ -195,18 +195,22 @@ void __init setup_arch_memory(void) bootmem_lastpg = PFN_DOWN((bootmem_lastpg << PAGE_SHIFT) & ~((BIG_KERNEL_PAGE_SIZE) - 1)); + memblock_add(PHYS_OFFSET, + (bootmem_lastpg - ARCH_PFN_OFFSET) << PAGE_SHIFT); + + /* Reserve kernel text/data/bss */ + memblock_reserve(PHYS_OFFSET, + (bootmem_startpg - ARCH_PFN_OFFSET) << PAGE_SHIFT); /* * Reserve the top DMA_RESERVE bytes of RAM for DMA (uncached) * memory allocation */ - max_low_pfn = bootmem_lastpg - PFN_DOWN(DMA_RESERVED_BYTES); min_low_pfn = ARCH_PFN_OFFSET; - bootmap_size = init_bootmem_node(NODE_DATA(0), bootmem_startpg, min_low_pfn, max_low_pfn); + memblock_reserve(PFN_PHYS(max_low_pfn), DMA_RESERVED_BYTES); printk(KERN_INFO "bootmem_startpg: 0x%08lx\n", bootmem_startpg); printk(KERN_INFO "bootmem_lastpg: 0x%08lx\n", bootmem_lastpg); - printk(KERN_INFO "bootmap_size: %d\n", bootmap_size); printk(KERN_INFO "min_low_pfn: 0x%08lx\n", 
min_low_pfn); printk(KERN_INFO "max_low_pfn: 0x%08lx\n", max_low_pfn); @@ -256,14 +260,6 @@ void __init setup_arch_memory(void) printk(KERN_INFO "*segtable = 0x%08x\n", *segtable); #endif - /* - * Free all the memory that wasn't taken up by the bootmap, the DMA - * reserve, or kernel itself. - */ - free_bootmem(PFN_PHYS(bootmem_startpg) + bootmap_size, - PFN_PHYS(bootmem_lastpg - bootmem_startpg) - bootmap_size - - DMA_RESERVED_BYTES); - /* * The bootmem allocator seemingly just lives to feed memory * to the paging system From 6072cf567a2be7f13fb3522156409ca0e7a27ff9 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 26 Oct 2018 15:04:48 -0700 Subject: [PATCH 041/132] of: ignore sub-page memory regions Memory region size is rounded down to page boundary and with sub-page region it becomes 0 and there is no point to add an empty region. Moreover, when the base is less than PAGE_SIZE we get a bogus size as (base + size - 1) evaluates to -1. 8cccffc52694 ("of: check for size < 0 after rounding in early_init_dt_add_memory_arch") introduced a test for wrap around for the case when base is not page aligned, the same test can be used to ignore sub-page region sizes. Link: http://lkml.kernel.org/r/1533326330-31677-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Rob Herring Cc: Guan Xuetao Cc: Ley Foon Tan Cc: Richard Kuo Cc: Richard Weinberger Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/fdt.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 800ad252cf9c..76c83c1ffeda 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1127,12 +1127,13 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) { const u64 phys_offset = MIN_MEMBLOCK_ADDR; + if (size < PAGE_SIZE - (base & ~PAGE_MASK)) { + pr_warn("Ignoring memory block 0x%llx - 0x%llx\n", + base, base + size); + return; + } + if (!PAGE_ALIGNED(base)) { - if (size < PAGE_SIZE - (base & ~PAGE_MASK)) { - pr_warn("Ignoring memory block 0x%llx - 0x%llx\n", - base, base + size); - return; - } size -= PAGE_SIZE - (base & ~PAGE_MASK); base = PAGE_ALIGN(base); } From a811c05c16b53ff89e9cfc029ec77a0aae846881 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 26 Oct 2018 15:04:51 -0700 Subject: [PATCH 042/132] nios2: use generic early_init_dt_add_memory_arch All we have to do is to enable memblock, the generic FDT code will take care of the rest. 
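For reference, the generic weak hook that takes over is essentially the following (abridged from drivers/of/fdt.c; the alignment and sub-page checks are the ones touched by the previous patch):

    void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
    {
        /* abridged: ignore sub-page or out-of-range blocks, align
         * base and size to page boundaries, then register the
         * usable RAM with memblock */
        if (!PAGE_ALIGNED(base)) {
            size -= PAGE_SIZE - (base & ~PAGE_MASK);
            base = PAGE_ALIGN(base);
        }
        size &= PAGE_MASK;
        memblock_add(base, size);
    }
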
Link: http://lkml.kernel.org/r/1533326330-31677-4-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Ley Foon Tan Cc: Guan Xuetao Cc: Richard Kuo Cc: Richard Weinberger Cc: Rob Herring Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/nios2/Kconfig | 1 + arch/nios2/kernel/prom.c | 10 ---------- arch/nios2/kernel/setup.c | 2 ++ 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 03965692fbfe..fd29f438ea70 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -23,6 +23,7 @@ config NIOS2 select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS + select HAVE_MEMBLOCK config GENERIC_CSUM def_bool y diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c index 8d7446a4b475..ba96a498ef13 100644 --- a/arch/nios2/kernel/prom.c +++ b/arch/nios2/kernel/prom.c @@ -32,16 +32,6 @@ #include -void __init early_init_dt_add_memory_arch(u64 base, u64 size) -{ - u64 kernel_start = (u64)virt_to_phys(_text); - - if (!memory_size && - (kernel_start >= base) && (kernel_start < (base + size))) - memory_size = size; - -} - int __init early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool nomap) { diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index 926a02b17b31..0946840ac3cd 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -147,6 +148,7 @@ void __init setup_arch(char **cmdline_p) console_verbose(); + memory_size = memblock_phys_mem_size(); memory_start = PAGE_ALIGN((unsigned long)__pa(_end)); memory_end = (unsigned long) CONFIG_NIOS2_MEM_BASE + memory_size; From 0042379279bcac871d2a60172f942a3a255ec611 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 26 Oct 2018 15:04:55 -0700 Subject: [PATCH 043/132] nios2: switch to NO_BOOTMEM Remove bootmem bitmap initialization and replace reserve_bootmem() with memblock_reserve(). 
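The conversion follows the same recipe as the hexagon patch earlier in the series; schematically:

    /* bootmem (removed): the allocator needs its own bitmap */
    bootmap_size = init_bootmem_node(NODE_DATA(0), start_pfn,
                                     min_low_pfn, max_low_pfn);
    free_bootmem(memory_start, memory_end - memory_start);
    reserve_bootmem(base, size, BOOTMEM_DEFAULT);

    /* memblock / NO_BOOTMEM: no bitmap, just add and reserve ranges */
    memblock_add(base, size);          /* usable RAM */
    memblock_reserve(base, size);      /* kernel text/data, initrd, ... */
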
Link: http://lkml.kernel.org/r/1533326330-31677-5-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Ley Foon Tan Cc: Guan Xuetao Cc: Richard Kuo Cc: Richard Weinberger Cc: Rob Herring Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/nios2/Kconfig | 2 ++ arch/nios2/kernel/prom.c | 7 ------- arch/nios2/kernel/setup.c | 37 +++++-------------------------------- 3 files changed, 7 insertions(+), 39 deletions(-) diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index fd29f438ea70..2df0c57f2833 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -24,6 +24,8 @@ config NIOS2 select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS select HAVE_MEMBLOCK + select ARCH_DISCARD_MEMBLOCK + select NO_BOOTMEM config GENERIC_CSUM def_bool y diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c index ba96a498ef13..a6d4f7530247 100644 --- a/arch/nios2/kernel/prom.c +++ b/arch/nios2/kernel/prom.c @@ -32,13 +32,6 @@ #include -int __init early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, - bool nomap) -{ - reserve_bootmem(base, size, BOOTMEM_DEFAULT); - return 0; -} - void __init early_init_devtree(void *params) { __be32 *dtb = (u32 *)__dtb_start; diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index 0946840ac3cd..2d0011ddd4d5 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -144,10 +144,11 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6, void __init setup_arch(char **cmdline_p) { - int bootmap_size; + int dram_start; console_verbose(); + dram_start = memblock_start_of_DRAM(); memory_size = memblock_phys_mem_size(); memory_start = PAGE_ALIGN((unsigned long)__pa(_end)); memory_end = (unsigned long) CONFIG_NIOS2_MEM_BASE + memory_size; @@ -165,39 +166,11 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = PFN_DOWN(memory_end); max_mapnr = max_low_pfn; - /* - * give all the memory to the bootmap allocator, tell it to put the - * boot mem_map at the start of memory - */ - pr_debug("init_bootmem_node(?,%#lx, %#x, %#lx)\n", - min_low_pfn, PFN_DOWN(PHYS_OFFSET), max_low_pfn); - bootmap_size = init_bootmem_node(NODE_DATA(0), - min_low_pfn, PFN_DOWN(PHYS_OFFSET), - max_low_pfn); - - /* - * free the usable memory, we have to make sure we do not free - * the bootmem bitmap so we then reserve it after freeing it :-) - */ - pr_debug("free_bootmem(%#lx, %#lx)\n", - memory_start, memory_end - memory_start); - free_bootmem(memory_start, memory_end - memory_start); - - /* - * Reserve the bootmem bitmap itself as well. We do this in two - * steps (first step was init_bootmem()) because this catches - * the (very unlikely) case of us accidentally initializing the - * bootmem allocator with an invalid RAM area. 
- * - * Arguments are start, size - */ - pr_debug("reserve_bootmem(%#lx, %#x)\n", memory_start, bootmap_size); - reserve_bootmem(memory_start, bootmap_size, BOOTMEM_DEFAULT); - + memblock_reserve(dram_start, memory_start - dram_start); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start) { - reserve_bootmem(virt_to_phys((void *)initrd_start), - initrd_end - initrd_start, BOOTMEM_DEFAULT); + memblock_reserve(virt_to_phys((void *)initrd_start), + initrd_end - initrd_start); } #endif /* CONFIG_BLK_DEV_INITRD */ From be6ec5b1eecf1c5e985831540b90a320a9ed3aa5 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 26 Oct 2018 15:04:58 -0700 Subject: [PATCH 044/132] um: setup_physmem: stop using global variables The setup_physmem() function receives uml_physmem and uml_reserved as parameters and still used these global variables. Replace such usage with local variables. Link: http://lkml.kernel.org/r/1533326330-31677-6-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Richard Weinberger Cc: Guan Xuetao Cc: Ley Foon Tan Cc: Richard Kuo Cc: Rob Herring Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/physmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index f02596e9931d..0eaec0e24767 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -86,7 +86,7 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, long map_size; int err; - offset = uml_reserved - uml_physmem; + offset = reserve_end - start; map_size = len - offset; if(map_size <= 0) { os_warn("Too few physical memory! Needed=%lu, given=%lu\n", @@ -96,12 +96,12 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, physmem_fd = create_mem_file(len + highmem); - err = os_map_memory((void *) uml_reserved, physmem_fd, offset, + err = os_map_memory((void *) reserve_end, physmem_fd, offset, map_size, 1, 1, 1); if (err < 0) { os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " "failed - errno = %d\n", map_size, - (void *) uml_reserved, err); + (void *) reserve_end, err); exit(1); } From ddf63983576ac2520d13a49f03f2d382ac60e2c7 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 26 Oct 2018 15:05:02 -0700 Subject: [PATCH 045/132] um: switch to NO_BOOTMEM Replace bootmem initialization with memblock_add and memblock_reserve calls and explicit initialization of {min,max}_low_pfn. 
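To make the new bookkeeping concrete, a small worked example with made-up numbers (4 KiB pages, highmem left out; not part of the patch):

  /* Hypothetical numbers:
   *   start       = 0x00000000   (base of the physmem mapping)
   *   reserve_end = 0x00600000   (end of kernel image + early reservations)
   *   len         = 0x04000000   (64 MiB of configured physical memory)
   *
   * reserve  = reserve_end - start           = 0x00600000
   * map_size = len - reserve                 = 0x03a00000
   *
   * memblock_add(__pa(start), len);          -> memblock knows all 64 MiB
   * memblock_reserve(__pa(start), reserve);  -> the first 6 MiB stay reserved
   *
   * min_low_pfn = PFN_UP(__pa(reserve_end))  = 0x600
   * max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT)
   *             = 0x600 + 0x3a00 = 0x4000    (== 64 MiB / 4 KiB)
   */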
Link: http://lkml.kernel.org/r/1533326330-31677-7-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Richard Weinberger Cc: Guan Xuetao Cc: Ley Foon Tan Cc: Richard Kuo Cc: Rob Herring Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/Kconfig | 2 ++ arch/um/kernel/physmem.c | 20 +++++++++----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 6b9938919f0b..10c15b8853ae 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -12,6 +12,8 @@ config UML select HAVE_UID16 select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_DEBUG_KMEMLEAK + select HAVE_MEMBLOCK + select NO_BOOTMEM select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 0eaec0e24767..296a91a04598 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -80,23 +81,18 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, unsigned long len, unsigned long long highmem) { unsigned long reserve = reserve_end - start; - unsigned long pfn = PFN_UP(__pa(reserve_end)); - unsigned long delta = (len - reserve) >> PAGE_SHIFT; - unsigned long offset, bootmap_size; - long map_size; + long map_size = len - reserve; int err; - offset = reserve_end - start; - map_size = len - offset; if(map_size <= 0) { os_warn("Too few physical memory! Needed=%lu, given=%lu\n", - offset, len); + reserve, len); exit(1); } physmem_fd = create_mem_file(len + highmem); - err = os_map_memory((void *) reserve_end, physmem_fd, offset, + err = os_map_memory((void *) reserve_end, physmem_fd, reserve, map_size, 1, 1, 1); if (err < 0) { os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " @@ -113,9 +109,11 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); os_fsync_file(physmem_fd); - bootmap_size = init_bootmem(pfn, pfn + delta); - free_bootmem(__pa(reserve_end) + bootmap_size, - len - bootmap_size - reserve); + memblock_add(__pa(start), len + highmem); + memblock_reserve(__pa(start), reserve); + + min_low_pfn = PFN_UP(__pa(reserve_end)); + max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT); } int phys_mapping(unsigned long phys, unsigned long long *offset_out) From e92d39cdb120219a16502ca646d407d61cd7adf4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 26 Oct 2018 15:05:05 -0700 Subject: [PATCH 046/132] unicore32: switch to NO_BOOTMEM The unicore32 architecture already supports memblock and uses it for some early memory reservations, e.g initrd and the page tables. At some point unicore32 allocates the bootmem bitmap from the memblock and then hands over the memory reservations from memblock to bootmem. This patch removes the bootmem initialization and leaves memblock as the only boot time memory manager for unicore32. 
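With memblock as the only early allocator, the final hand-over to the buddy allocator is also driven by memblock: the NO_BOOTMEM implementation of free_all_bootmem() walks the memblock ranges instead of a bitmap, roughly like this (a simplified sketch of the era's mm/nobootmem.c, not code from this patch):

  static unsigned long __init free_low_memory_core_early(void)
  {
          unsigned long count = 0;
          phys_addr_t start, end;
          u64 i;

          /* every range that is memory and not reserved goes to the buddy allocator */
          for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
                                  &start, &end, NULL)
                  count += __free_memory_core(start, end);

          return count;
  }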
Link: http://lkml.kernel.org/r/1533326330-31677-8-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Guan Xuetao Cc: Ley Foon Tan Cc: Richard Kuo Cc: Richard Weinberger Cc: Rob Herring Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/unicore32/Kconfig | 1 + arch/unicore32/mm/init.c | 54 +--------------------------------------- 2 files changed, 2 insertions(+), 53 deletions(-) diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 3a3b40f79558..0c5111b206bd 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -6,6 +6,7 @@ config UNICORE32 select ARCH_MIGHT_HAVE_PC_SERIO select DMA_DIRECT_OPS select HAVE_MEMBLOCK + select NO_BOOTMEM select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 5f72a8d1d953..8f8699e62bd5 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -84,58 +84,6 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low, } } -static void __init uc32_bootmem_init(unsigned long start_pfn, - unsigned long end_pfn) -{ - struct memblock_region *reg; - unsigned int boot_pages; - phys_addr_t bitmap; - pg_data_t *pgdat; - - /* - * Allocate the bootmem bitmap page. This must be in a region - * of memory which has already been mapped. - */ - boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn); - bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES, - __pfn_to_phys(end_pfn)); - - /* - * Initialise the bootmem allocator, handing the - * memory banks over to bootmem. - */ - node_set_online(0); - pgdat = NODE_DATA(0); - init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn); - - /* Free the lowmem regions from memblock into bootmem. */ - for_each_memblock(memory, reg) { - unsigned long start = memblock_region_memory_base_pfn(reg); - unsigned long end = memblock_region_memory_end_pfn(reg); - - if (end >= end_pfn) - end = end_pfn; - if (start >= end) - break; - - free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT); - } - - /* Reserve the lowmem memblock reserved regions in bootmem. */ - for_each_memblock(reserved, reg) { - unsigned long start = memblock_region_reserved_base_pfn(reg); - unsigned long end = memblock_region_reserved_end_pfn(reg); - - if (end >= end_pfn) - end = end_pfn; - if (start >= end) - break; - - reserve_bootmem(__pfn_to_phys(start), - (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT); - } -} - static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low, unsigned long max_high) { @@ -232,7 +180,7 @@ void __init bootmem_init(void) find_limits(&min, &max_low, &max_high); - uc32_bootmem_init(min, max_low); + node_set_online(0); /* * Sparsemem tries to allocate bootmem in memory_present(), From 6471f52af786fc9cb82bf1af9d086fb09c62f99b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 26 Oct 2018 15:05:10 -0700 Subject: [PATCH 047/132] alpha: switch to NO_BOOTMEM Replace bootmem allocator with memblock and enable use of NO_BOOTMEM like on most other architectures. Alpha gets the description of the physical memory from the firmware as an array of memory clusters. Each cluster that is not reserved by the firmware is added to memblock.memory. Once the memblock.memory is set up, we reserve the kernel and initrd pages with memblock reserve. Since we don't need the bootmem bitmap anymore, the code that finds an appropriate place is removed. 
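Schematically, the new alpha setup_memory() flow is (simplified from the hunks below, not literal code):

  for_each_mem_cluster(memdesc, cluster, i) {
          if (cluster->usage & 3)         /* cluster reserved by the firmware */
                  continue;
          memblock_add(PFN_PHYS(cluster->start_pfn),
                       cluster->numpages << PAGE_SHIFT);
  }
  /* the kernel image (and later the initrd) is carved out of that memory */
  memblock_reserve(KERNEL_START_PHYS,
                   virt_to_phys(kernel_end) - KERNEL_START_PHYS);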
The conversion does not take care of NUMA support which is marked broken for more than 10 years now. Link: http://lkml.kernel.org/r/1535952894-10967-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 2 + arch/alpha/kernel/core_irongate.c | 4 +- arch/alpha/kernel/setup.c | 98 ++++---------------------- arch/alpha/mm/numa.c | 113 ++++-------------------------- 4 files changed, 29 insertions(+), 188 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 5b4f88363453..620b0a711ee4 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -31,6 +31,8 @@ config ALPHA select ODD_RT_SIGACTION select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 + select HAVE_MEMBLOCK + select NO_BOOTMEM help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c index aec757250e07..f70986683fc6 100644 --- a/arch/alpha/kernel/core_irongate.c +++ b/arch/alpha/kernel/core_irongate.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -241,8 +242,7 @@ albacore_init_arch(void) size / 1024); } #endif - reserve_bootmem_node(NODE_DATA(0), pci_mem, memtop - - pci_mem, BOOTMEM_DEFAULT); + memblock_reserve(pci_mem, memtop - pci_mem); printk("irongate_init_arch: temporarily reserving " "region %08lx-%08lx for PCI\n", pci_mem, memtop - 1); } diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 5576f7646fb6..4f0d94471bc9 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -312,9 +313,7 @@ setup_memory(void *kernel_end) { struct memclust_struct * cluster; struct memdesc_struct * memdesc; - unsigned long start_kernel_pfn, end_kernel_pfn; - unsigned long bootmap_size, bootmap_pages, bootmap_start; - unsigned long start, end; + unsigned long kernel_size; unsigned long i; /* Find free clusters, and init and free the bootmem accordingly. */ @@ -322,6 +321,8 @@ setup_memory(void *kernel_end) (hwrpb->mddt_offset + (unsigned long) hwrpb); for_each_mem_cluster(memdesc, cluster, i) { + unsigned long end; + printk("memcluster %lu, usage %01lx, start %8lu, end %8lu\n", i, cluster->usage, cluster->start_pfn, cluster->start_pfn + cluster->numpages); @@ -335,6 +336,9 @@ setup_memory(void *kernel_end) end = cluster->start_pfn + cluster->numpages; if (end > max_low_pfn) max_low_pfn = end; + + memblock_add(PFN_PHYS(cluster->start_pfn), + cluster->numpages << PAGE_SHIFT); } /* @@ -363,87 +367,9 @@ setup_memory(void *kernel_end) max_low_pfn = mem_size_limit; } - /* Find the bounds of kernel memory. */ - start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); - end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); - bootmap_start = -1; - - try_again: - if (max_low_pfn <= end_kernel_pfn) - panic("not enough memory to boot"); - - /* We need to know how many physically contiguous pages - we'll need for the bootmap. */ - bootmap_pages = bootmem_bootmap_pages(max_low_pfn); - - /* Now find a good region where to allocate the bootmap. 
*/ - for_each_mem_cluster(memdesc, cluster, i) { - if (cluster->usage & 3) - continue; - - start = cluster->start_pfn; - end = start + cluster->numpages; - if (start >= max_low_pfn) - continue; - if (end > max_low_pfn) - end = max_low_pfn; - if (start < start_kernel_pfn) { - if (end > end_kernel_pfn - && end - end_kernel_pfn >= bootmap_pages) { - bootmap_start = end_kernel_pfn; - break; - } else if (end > start_kernel_pfn) - end = start_kernel_pfn; - } else if (start < end_kernel_pfn) - start = end_kernel_pfn; - if (end - start >= bootmap_pages) { - bootmap_start = start; - break; - } - } - - if (bootmap_start == ~0UL) { - max_low_pfn >>= 1; - goto try_again; - } - - /* Allocate the bootmap and mark the whole MM as reserved. */ - bootmap_size = init_bootmem(bootmap_start, max_low_pfn); - - /* Mark the free regions. */ - for_each_mem_cluster(memdesc, cluster, i) { - if (cluster->usage & 3) - continue; - - start = cluster->start_pfn; - end = cluster->start_pfn + cluster->numpages; - if (start >= max_low_pfn) - continue; - if (end > max_low_pfn) - end = max_low_pfn; - if (start < start_kernel_pfn) { - if (end > end_kernel_pfn) { - free_bootmem(PFN_PHYS(start), - (PFN_PHYS(start_kernel_pfn) - - PFN_PHYS(start))); - printk("freeing pages %ld:%ld\n", - start, start_kernel_pfn); - start = end_kernel_pfn; - } else if (end > start_kernel_pfn) - end = start_kernel_pfn; - } else if (start < end_kernel_pfn) - start = end_kernel_pfn; - if (start >= end) - continue; - - free_bootmem(PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start)); - printk("freeing pages %ld:%ld\n", start, end); - } - - /* Reserve the bootmap memory. */ - reserve_bootmem(PFN_PHYS(bootmap_start), bootmap_size, - BOOTMEM_DEFAULT); - printk("reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size)); + /* Reserve the kernel memory. 
*/ + kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS; + memblock_reserve(KERNEL_START_PHYS, kernel_size); #ifdef CONFIG_BLK_DEV_INITRD initrd_start = INITRD_START; @@ -459,8 +385,8 @@ setup_memory(void *kernel_end) initrd_end, phys_to_virt(PFN_PHYS(max_low_pfn))); } else { - reserve_bootmem(virt_to_phys((void *)initrd_start), - INITRD_SIZE, BOOTMEM_DEFAULT); + memblock_reserve(virt_to_phys((void *)initrd_start), + INITRD_SIZE); } } #endif /* CONFIG_BLK_DEV_INITRD */ diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index a9e86475f169..26cd925d19b1 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -59,12 +60,10 @@ setup_memory_node(int nid, void *kernel_end) struct memclust_struct * cluster; struct memdesc_struct * memdesc; unsigned long start_kernel_pfn, end_kernel_pfn; - unsigned long bootmap_size, bootmap_pages, bootmap_start; unsigned long start, end; unsigned long node_pfn_start, node_pfn_end; unsigned long node_min_pfn, node_max_pfn; int i; - unsigned long node_datasz = PFN_UP(sizeof(pg_data_t)); int show_init = 0; /* Find the bounds of current node */ @@ -134,24 +133,14 @@ setup_memory_node(int nid, void *kernel_end) /* Cute trick to make sure our local node data is on local memory */ node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT)); #endif - /* Quasi-mark the pg_data_t as in-use */ - node_min_pfn += node_datasz; - if (node_min_pfn >= node_max_pfn) { - printk(" not enough mem to reserve NODE_DATA"); - return; - } - NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; - printk(" Detected node memory: start %8lu, end %8lu\n", node_min_pfn, node_max_pfn); DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid)); - DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata); /* Find the bounds of kernel memory. */ start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); - bootmap_start = -1; if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) panic("kernel loaded out of ram"); @@ -161,89 +150,11 @@ setup_memory_node(int nid, void *kernel_end) has much larger alignment than 8Mb, so it's safe. */ node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1); - /* We need to know how many physically contiguous pages - we'll need for the bootmap. */ - bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn); + memblock_add(PFN_PHYS(node_min_pfn), + (node_max_pfn - node_min_pfn) << PAGE_SHIFT); - /* Now find a good region where to allocate the bootmap. */ - for_each_mem_cluster(memdesc, cluster, i) { - if (cluster->usage & 3) - continue; - - start = cluster->start_pfn; - end = start + cluster->numpages; - - if (start >= node_max_pfn || end <= node_min_pfn) - continue; - - if (end > node_max_pfn) - end = node_max_pfn; - if (start < node_min_pfn) - start = node_min_pfn; - - if (start < start_kernel_pfn) { - if (end > end_kernel_pfn - && end - end_kernel_pfn >= bootmap_pages) { - bootmap_start = end_kernel_pfn; - break; - } else if (end > start_kernel_pfn) - end = start_kernel_pfn; - } else if (start < end_kernel_pfn) - start = end_kernel_pfn; - if (end - start >= bootmap_pages) { - bootmap_start = start; - break; - } - } - - if (bootmap_start == -1) - panic("couldn't find a contiguous place for the bootmap"); - - /* Allocate the bootmap and mark the whole MM as reserved. 
*/ - bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start, - node_min_pfn, node_max_pfn); - DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n", - bootmap_start, bootmap_size, bootmap_pages); - - /* Mark the free regions. */ - for_each_mem_cluster(memdesc, cluster, i) { - if (cluster->usage & 3) - continue; - - start = cluster->start_pfn; - end = cluster->start_pfn + cluster->numpages; - - if (start >= node_max_pfn || end <= node_min_pfn) - continue; - - if (end > node_max_pfn) - end = node_max_pfn; - if (start < node_min_pfn) - start = node_min_pfn; - - if (start < start_kernel_pfn) { - if (end > end_kernel_pfn) { - free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), - (PFN_PHYS(start_kernel_pfn) - - PFN_PHYS(start))); - printk(" freeing pages %ld:%ld\n", - start, start_kernel_pfn); - start = end_kernel_pfn; - } else if (end > start_kernel_pfn) - end = start_kernel_pfn; - } else if (start < end_kernel_pfn) - start = end_kernel_pfn; - if (start >= end) - continue; - - free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start)); - printk(" freeing pages %ld:%ld\n", start, end); - } - - /* Reserve the bootmap memory. */ - reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start), - bootmap_size, BOOTMEM_DEFAULT); - printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size)); + NODE_DATA(nid)->node_start_pfn = node_min_pfn; + NODE_DATA(nid)->node_present_pages = node_max_pfn - node_min_pfn; node_set_online(nid); } @@ -251,6 +162,7 @@ setup_memory_node(int nid, void *kernel_end) void __init setup_memory(void *kernel_end) { + unsigned long kernel_size; int nid; show_mem_layout(); @@ -262,6 +174,9 @@ setup_memory(void *kernel_end) for (nid = 0; nid < MAX_NUMNODES; nid++) setup_memory_node(nid, kernel_end); + kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS; + memblock_reserve(KERNEL_START_PHYS, kernel_size); + #ifdef CONFIG_BLK_DEV_INITRD initrd_start = INITRD_START; if (initrd_start) { @@ -279,9 +194,8 @@ setup_memory(void *kernel_end) phys_to_virt(PFN_PHYS(max_low_pfn))); } else { nid = kvaddr_to_nid(initrd_start); - reserve_bootmem_node(NODE_DATA(nid), - virt_to_phys((void *)initrd_start), - INITRD_SIZE, BOOTMEM_DEFAULT); + memblock_reserve(virt_to_phys((void *)initrd_start), + INITRD_SIZE); } } #endif /* CONFIG_BLK_DEV_INITRD */ @@ -303,9 +217,8 @@ void __init paging_init(void) dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; for_each_online_node(nid) { - bootmem_data_t *bdata = &bootmem_node_data[nid]; - unsigned long start_pfn = bdata->node_min_pfn; - unsigned long end_pfn = bdata->node_low_pfn; + unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn; + unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_present_pages; if (dma_local_pfn >= end_pfn - start_pfn) zones_size[ZONE_DMA] = end_pfn - start_pfn; From 3b9aadf7278d16d7bed4d5d808501065f70898d8 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 26 Oct 2018 15:05:16 -0700 Subject: [PATCH 048/132] userfaultfd: allow get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) to trigger userfaults get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) called a get_user_pages that would not be waiting for userfaults before failing and it would hit on a SIGBUS instead. Using get_user_pages_locked/unlocked instead will allow get_mempolicy to allow userfaults to resolve the fault and fill the hole, before grabbing the node id of the page. 
If the user calls get_mempolicy() with MPOL_F_ADDR | MPOL_F_NODE for an address inside an area managed by uffd and there is no page at that address, the page allocation from within get_mempolicy() will fail, because get_user_pages() does not allow the page fault retry required for uffd; the user will get SIGBUS. With this patch, the page fault will be resolved by the uffd and get_mempolicy() will continue normally.

Background: via code review, previously the syscall would have returned -EFAULT (vm_fault_to_errno); now it will block and wait for a userfault (if it's woken before the fault is resolved it'll still return -EFAULT). This way get_mempolicy will give an "unaware" app a chance to be compliant with userfaults. The reason this visible change is acceptable is that becoming "userfault compliant" cannot regress anything: all other syscalls, including read(2)/write(2), had to become "userfault compliant" a long time ago (that's one of the things userfaultfd can do that PROT_NONE and trapping segfaults can't). So this is just one more syscall becoming "userfault compliant", as all other major ones already have.

This has been happening with a virtio-bridge dpdk process which just called get_mempolicy on the guest space post live migration, but before the memory had a chance to be migrated to the destination. I didn't run an strace to be able to show the -EFAULT going away, but I have confirmation that the debug aid information below (only visible with CONFIG_DEBUG_VM=y) goes away with the patch:

[20116.371461] FAULT_FLAG_ALLOW_RETRY missing 0
[20116.371464] CPU: 1 PID: 13381 Comm: vhost-events Not tainted 4.17.12-200.fc28.x86_64 #1
[20116.371465] Hardware name: LENOVO 20FAS2BN0A/20FAS2BN0A, BIOS N1CET54W (1.22 ) 02/10/2017
[20116.371466] Call Trace:
[20116.371473] dump_stack+0x5c/0x80
[20116.371476] handle_userfault.cold.37+0x1b/0x22
[20116.371479] ? remove_wait_queue+0x20/0x60
[20116.371481] ? poll_freewait+0x45/0xa0
[20116.371483] ? do_sys_poll+0x31c/0x520
[20116.371485] ? radix_tree_lookup_slot+0x1e/0x50
[20116.371488] shmem_getpage_gfp+0xce7/0xe50
[20116.371491] ? page_add_file_rmap+0x1a/0x2c0
[20116.371493] shmem_fault+0x78/0x1e0
[20116.371495] ? filemap_map_pages+0x3a1/0x450
[20116.371498] __do_fault+0x1f/0xc0
[20116.371500] __handle_mm_fault+0xe2e/0x12f0
[20116.371502] handle_mm_fault+0xda/0x200
[20116.371504] __get_user_pages+0x238/0x790
[20116.371506] get_user_pages+0x3e/0x50
[20116.371510] kernel_get_mempolicy+0x40b/0x700
[20116.371512] ? vfs_write+0x170/0x1a0
[20116.371515] __x64_sys_get_mempolicy+0x21/0x30
[20116.371517] do_syscall_64+0x5b/0x160
[20116.371520] entry_SYSCALL_64_after_hwframe+0x44/0xa9

The above harmless debug message (not a kernel crash, just a dump_stack()) is shown with CONFIG_DEBUG_VM=y to more quickly identify and improve kernel spots that may have to become "userfaultfd compliant" like this one (without having to run an strace and search for syscall misbehavior). Spots like the above are closer to a kernel bug for the non-cooperative usages that Mike focuses on than for the dpdk qemu-cooperative usages that reproduced it, but it's still nicer to get this fixed for dpdk too.

The part of the patch that caused me to think is only the implementation issue of mpol_get, but it looks like it should work safely no matter the kind of mempolicy structure involved (the default static policy also starts at 1, so it'll go to 2 and back to 1 without crashing everything at 0).
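To illustrate the user-visible behaviour, a hypothetical userspace snippet (not part of the patch; link with -lnuma or call the raw syscall):

  #include <numaif.h>
  #include <stdio.h>

  /* 'addr' lies in a VMA registered with UFFDIO_REGISTER and has not been
   * faulted in yet. Before this change the missing page made the syscall fail
   * (the -EFAULT/SIGBUS discussed above); with it, the call blocks until the
   * uffd monitor resolves the fault, then reports the node of the new page. */
  static void report_node(void *addr)
  {
          int node = -1;

          if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
                  printf("page at %p sits on node %d\n", addr, node);
          else
                  perror("get_mempolicy");
  }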
[rppt@linux.vnet.ibm.com: changelog addition] http://lkml.kernel.org/r/20180904073718.GA26916@rapoport-lnx Link: http://lkml.kernel.org/r/20180831214848.23676-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Maxime Coquelin Tested-by: Dr. David Alan Gilbert Reviewed-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index da858f794eb6..2e76a8f65e94 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -797,16 +797,19 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) } } -static int lookup_node(unsigned long addr) +static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p; int err; - err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); + int locked = 1; + err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); if (err >= 0) { err = page_to_nid(p); put_page(p); } + if (locked) + up_read(&mm->mmap_sem); return err; } @@ -817,7 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) @@ -857,7 +860,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - err = lookup_node(addr); + /* + * Take a refcount on the mpol, lookup_node() + * wil drop the mmap_sem, so after calling + * lookup_node() only "pol" remains valid, "vma" + * is stale. + */ + pol_refcount = pol; + vma = NULL; + mpol_get(pol); + err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; @@ -892,7 +904,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, out: mpol_cond_put(pol); if (vma) - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); + if (pol_refcount) + mpol_put(pol_refcount); return err; } From cc252eae85e09552f9c1e7ac0c3227f835efdf2d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:34 -0700 Subject: [PATCH 049/132] mm, slab: combine kmalloc_caches and kmalloc_dma_caches Patch series "kmalloc-reclaimable caches", v4. As discussed at LSF/MM [1] here's a patchset that introduces kmalloc-reclaimable caches (more details in the second patch) and uses them for dcache external names. That allows us to repurpose the NR_INDIRECTLY_RECLAIMABLE_BYTES counter later in the series. With patch 3/6, dcache external names are allocated from kmalloc-rcl-* caches, eliminating the need for manual accounting. More importantly, it also ensures the reclaimable kmalloc allocations are grouped in pages separate from the regular kmalloc allocations. The need for proper accounting of dcache external names has shown it's easy for misbehaving process to allocate lots of them, causing premature OOMs. Without the added grouping, it's likely that a similar workload can interleave the dcache external names allocations with regular kmalloc allocations (note: I haven't searched myself for an example of such regular kmalloc allocation, but I would be very surprised if there wasn't some). A pathological case would be e.g. 
one 64byte regular allocations with 63 external dcache names in a page (64x64=4096), which means the page is not freed even after reclaiming after all dcache names, and the process can thus "steal" the whole page with single 64byte allocation. If other kmalloc users similar to dcache external names become identified, they can also benefit from the new functionality simply by adding __GFP_RECLAIMABLE to the kmalloc calls. Side benefits of the patchset (that could be also merged separately) include removed branch for detecting __GFP_DMA kmalloc(), and shortening kmalloc cache names in /proc/slabinfo output. The latter is potentially an ABI break in case there are tools parsing the names and expecting the values to be in bytes. This is how /proc/slabinfo looks like after booting in virtme: ... kmalloc-rcl-4M 0 0 4194304 1 1024 : tunables 1 1 0 : slabdata 0 0 0 ... kmalloc-rcl-96 7 32 128 32 1 : tunables 120 60 8 : slabdata 1 1 0 kmalloc-rcl-64 25 128 64 64 1 : tunables 120 60 8 : slabdata 2 2 0 kmalloc-rcl-32 0 0 32 124 1 : tunables 120 60 8 : slabdata 0 0 0 kmalloc-4M 0 0 4194304 1 1024 : tunables 1 1 0 : slabdata 0 0 0 kmalloc-2M 0 0 2097152 1 512 : tunables 1 1 0 : slabdata 0 0 0 kmalloc-1M 0 0 1048576 1 256 : tunables 1 1 0 : slabdata 0 0 0 ... /proc/vmstat with renamed nr_indirectly_reclaimable_bytes counter: ... nr_slab_reclaimable 2817 nr_slab_unreclaimable 1781 ... nr_kernel_misc_reclaimable 0 ... /proc/meminfo with new KReclaimable counter: ... Shmem: 564 kB KReclaimable: 11260 kB Slab: 18368 kB SReclaimable: 11260 kB SUnreclaim: 7108 kB KernelStack: 1248 kB ... This patch (of 6): The kmalloc caches currently mainain separate (optional) array kmalloc_dma_caches for __GFP_DMA allocations. There are tests for __GFP_DMA in the allocation hotpaths. We can avoid the branches by combining kmalloc_caches and kmalloc_dma_caches into a single two-dimensional array where the outer dimension is cache "type". This will also allow to add kmalloc-reclaimable caches as a third type. Link: http://lkml.kernel.org/r/20180731090649.16028-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Christoph Lameter Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Laura Abbott Cc: Sumit Semwal Cc: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 44 ++++++++++++++++++++++++++++++++------------ mm/slab.c | 4 ++-- mm/slab_common.c | 31 ++++++++++++------------------- mm/slub.c | 13 +++++++------ 4 files changed, 53 insertions(+), 39 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index ed9cbddeb4a6..2a7137043e91 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -295,11 +295,28 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? 
\ (KMALLOC_MIN_SIZE) : 16) -#ifndef CONFIG_SLOB -extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +enum kmalloc_cache_type { + KMALLOC_NORMAL = 0, #ifdef CONFIG_ZONE_DMA -extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; + KMALLOC_DMA, #endif + NR_KMALLOC_TYPES +}; + +#ifndef CONFIG_SLOB +extern struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; + +static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) +{ + int is_dma = 0; + +#ifdef CONFIG_ZONE_DMA + is_dma = !!(flags & __GFP_DMA); +#endif + + return is_dma; +} /* * Figure out which kmalloc slab an allocation of a certain size @@ -501,18 +518,20 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { +#ifndef CONFIG_SLOB + unsigned int index; +#endif if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); #ifndef CONFIG_SLOB - if (!(flags & GFP_DMA)) { - unsigned int index = kmalloc_index(size); + index = kmalloc_index(size); - if (!index) - return ZERO_SIZE_PTR; + if (!index) + return ZERO_SIZE_PTR; - return kmem_cache_alloc_trace(kmalloc_caches[index], - flags, size); - } + return kmem_cache_alloc_trace( + kmalloc_caches[kmalloc_type(flags)][index], + flags, size); #endif } return __kmalloc(size, flags); @@ -542,13 +561,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && - size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { + size <= KMALLOC_MAX_CACHE_SIZE) { unsigned int i = kmalloc_index(size); if (!i) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node_trace(kmalloc_caches[i], + return kmem_cache_alloc_node_trace( + kmalloc_caches[kmalloc_type(flags)][i], flags, node, size); } #endif diff --git a/mm/slab.c b/mm/slab.c index d73c7a4820a4..2a5654bb3b3f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1288,7 +1288,7 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( + kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( kmalloc_info[INDEX_NODE].name, kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, 0, kmalloc_size(INDEX_NODE)); @@ -1304,7 +1304,7 @@ void __init kmem_cache_init(void) for_each_online_node(nid) { init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - init_list(kmalloc_caches[INDEX_NODE], + init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], &init_kmem_cache_node[SIZE_NODE + nid], nid); } } diff --git a/mm/slab_common.c b/mm/slab_common.c index 3a7ac4f15194..d880b2a3c81b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -973,14 +973,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, return s; } -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; +struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; -EXPORT_SYMBOL(kmalloc_dma_caches); -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. 
This is necessary for slabs < 192 since we have non power @@ -1040,12 +1036,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) index = fls(size - 1); } -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & GFP_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; + return kmalloc_caches[kmalloc_type(flags)][index]; } /* @@ -1119,7 +1110,8 @@ void __init setup_kmalloc_cache_index_table(void) static void __init new_kmalloc_cache(int idx, slab_flags_t flags) { - kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + kmalloc_caches[KMALLOC_NORMAL][idx] = create_kmalloc_cache( + kmalloc_info[idx].name, kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); } @@ -1132,9 +1124,10 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags) void __init create_kmalloc_caches(slab_flags_t flags) { int i; + int type = KMALLOC_NORMAL; for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) + if (!kmalloc_caches[type][i]) new_kmalloc_cache(i, flags); /* @@ -1142,9 +1135,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) * These have to be created immediately after the * earlier power of two caches */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) + if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[type][1] && i == 6) new_kmalloc_cache(1, flags); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) + if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[type][2] && i == 7) new_kmalloc_cache(2, flags); } @@ -1153,7 +1146,7 @@ void __init create_kmalloc_caches(slab_flags_t flags) #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; + struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; if (s) { unsigned int size = kmalloc_size(i); @@ -1161,8 +1154,8 @@ void __init create_kmalloc_caches(slab_flags_t flags) "dma-kmalloc-%u", size); BUG_ON(!n); - kmalloc_dma_caches[i] = create_kmalloc_cache(n, - size, SLAB_CACHE_DMA | flags, 0, 0); + kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( + n, size, SLAB_CACHE_DMA | flags, 0, 0); } } #endif diff --git a/mm/slub.c b/mm/slub.c index 18bd07daf4e4..e3629cd7aff1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4689,6 +4689,7 @@ static int list_locations(struct kmem_cache *s, char *buf, static void __init resiliency_test(void) { u8 *p; + int type = KMALLOC_NORMAL; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); @@ -4701,7 +4702,7 @@ static void __init resiliency_test(void) pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", p + 16); - validate_slab_cache(kmalloc_caches[4]); + validate_slab_cache(kmalloc_caches[type][4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); @@ -4710,33 +4711,33 @@ static void __init resiliency_test(void) p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[5]); + validate_slab_cache(kmalloc_caches[type][5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[6]); + validate_slab_cache(kmalloc_caches[type][6]); pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; pr_err("1. 
kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[7]); + validate_slab_cache(kmalloc_caches[type][7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[8]); + validate_slab_cache(kmalloc_caches[type][8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[9]); + validate_slab_cache(kmalloc_caches[type][9]); } #else #ifdef CONFIG_SYSFS From 1291523f2c1d631fea34102fd241fb54a4e8f7a0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:38 -0700 Subject: [PATCH 050/132] mm, slab/slub: introduce kmalloc-reclaimable caches Kmem caches can be created with a SLAB_RECLAIM_ACCOUNT flag, which indicates they contain objects which can be reclaimed under memory pressure (typically through a shrinker). This makes the slab pages accounted as NR_SLAB_RECLAIMABLE in vmstat, which is reflected also the MemAvailable meminfo counter and in overcommit decisions. The slab pages are also allocated with __GFP_RECLAIMABLE, which is good for anti-fragmentation through grouping pages by mobility. The generic kmalloc-X caches are created without this flag, but sometimes are used also for objects that can be reclaimed, which due to varying size cannot have a dedicated kmem cache with SLAB_RECLAIM_ACCOUNT flag. A prominent example are dcache external names, which prompted the creation of a new, manually managed vmstat counter NR_INDIRECTLY_RECLAIMABLE_BYTES in commit f1782c9bc547 ("dcache: account external names as indirectly reclaimable memory"). To better handle this and any other similar cases, this patch introduces SLAB_RECLAIM_ACCOUNT variants of kmalloc caches, named kmalloc-rcl-X. They are used whenever the kmalloc() call passes __GFP_RECLAIMABLE among gfp flags. They are added to the kmalloc_caches array as a new type. Allocations with both __GFP_DMA and __GFP_RECLAIMABLE will use a dma type cache. This change only applies to SLAB and SLUB, not SLOB. This is fine, since SLOB's target are tiny system and this patch does add some overhead of kmem management objects. Link: http://lkml.kernel.org/r/20180731090649.16028-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Christoph Lameter Acked-by: Roman Gushchin Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Laura Abbott Cc: Matthew Wilcox Cc: Michal Hocko Cc: Sumit Semwal Cc: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 ++++++++++++++- mm/slab_common.c | 48 ++++++++++++++++++++++++++++---------------- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 2a7137043e91..918f374e7156 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -295,8 +295,13 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ (KMALLOC_MIN_SIZE) : 16) +/* + * Whenever changing this, take care of that kmalloc_type() and + * create_kmalloc_caches() still work as intended. 
+ */ enum kmalloc_cache_type { KMALLOC_NORMAL = 0, + KMALLOC_RECLAIM, #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif @@ -310,12 +315,21 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) { int is_dma = 0; + int type_dma = 0; + int is_reclaimable; #ifdef CONFIG_ZONE_DMA is_dma = !!(flags & __GFP_DMA); + type_dma = is_dma * KMALLOC_DMA; #endif - return is_dma; + is_reclaimable = !!(flags & __GFP_RECLAIMABLE); + + /* + * If an allocation is both __GFP_DMA and __GFP_RECLAIMABLE, return + * KMALLOC_DMA and effectively ignore __GFP_RECLAIMABLE + */ + return type_dma + (is_reclaimable & !is_dma) * KMALLOC_RECLAIM; } /* diff --git a/mm/slab_common.c b/mm/slab_common.c index d880b2a3c81b..5b19439fd862 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1108,10 +1108,21 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, slab_flags_t flags) +static void __init +new_kmalloc_cache(int idx, int type, slab_flags_t flags) { - kmalloc_caches[KMALLOC_NORMAL][idx] = create_kmalloc_cache( - kmalloc_info[idx].name, + const char *name; + + if (type == KMALLOC_RECLAIM) { + flags |= SLAB_RECLAIM_ACCOUNT; + name = kasprintf(GFP_NOWAIT, "kmalloc-rcl-%u", + kmalloc_info[idx].size); + BUG_ON(!name); + } else { + name = kmalloc_info[idx].name; + } + + kmalloc_caches[type][idx] = create_kmalloc_cache(name, kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); } @@ -1123,22 +1134,25 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags) */ void __init create_kmalloc_caches(slab_flags_t flags) { - int i; - int type = KMALLOC_NORMAL; + int i, type; - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[type][i]) - new_kmalloc_cache(i, flags); + for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[type][i]) + new_kmalloc_cache(i, type, flags); - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[type][1] && i == 6) - new_kmalloc_cache(1, flags); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[type][2] && i == 7) - new_kmalloc_cache(2, flags); + /* + * Caches that are not of the two-to-the-power-of size. + * These have to be created immediately after the + * earlier power of two caches + */ + if (KMALLOC_MIN_SIZE <= 32 && i == 6 && + !kmalloc_caches[type][1]) + new_kmalloc_cache(1, type, flags); + if (KMALLOC_MIN_SIZE <= 64 && i == 7 && + !kmalloc_caches[type][2]) + new_kmalloc_cache(2, type, flags); + } } /* Kmalloc array is now usable */ From 2e03b4bc4ae84fcc0eee00e5ba5d228901d38809 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:41 -0700 Subject: [PATCH 051/132] dcache: allocate external names from reclaimable kmalloc caches We can use the newly introduced kmalloc-reclaimable-X caches, to allocate external names in dcache, which will take care of the proper accounting automatically, and also improve anti-fragmentation page grouping. This effectively reverts commit f1782c9bc547 ("dcache: account external names as indirectly reclaimable memory") and instead passes __GFP_RECLAIMABLE to kmalloc(). The accounting thus moves from NR_INDIRECTLY_RECLAIMABLE_BYTES to NR_SLAB_RECLAIMABLE, which is also considered in MemAvailable calculation and overcommit decisions. 
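Any other kmalloc user with shrinker-reclaimable, variably sized objects can opt in the same way; the pattern is simply (a generic sketch, not code from this series):

  #include <linux/slab.h>

  /* Hypothetical helper: adding __GFP_RECLAIMABLE routes the allocation to a
   * kmalloc-rcl-* cache, so it is grouped on reclaimable pages and accounted
   * as NR_SLAB_RECLAIMABLE. */
  static void *alloc_reclaimable_blob(size_t size)
  {
          return kmalloc(size, GFP_KERNEL | __GFP_RECLAIMABLE);
  }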
Link: http://lkml.kernel.org/r/20180731090649.16028-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Laura Abbott Cc: Matthew Wilcox Cc: Michal Hocko Cc: Sumit Semwal Cc: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 2e7e8d85e9b4..c2e443fb76ae 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -257,24 +257,10 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -static void __d_free_external_name(struct rcu_head *head) -{ - struct external_name *name = container_of(head, struct external_name, - u.head); - - mod_node_page_state(page_pgdat(virt_to_page(name)), - NR_INDIRECTLY_RECLAIMABLE_BYTES, - -ksize(name)); - - kfree(name); -} - static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - - __d_free_external_name(&external_name(dentry)->u.head); - + kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } @@ -306,7 +292,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - call_rcu(&p->u.head, __d_free_external_name); + kfree_rcu(p, u.head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1606,7 +1592,6 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { - struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1627,14 +1612,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - - ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); - if (!ext) { + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT | + __GFP_RECLAIMABLE); + if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&ext->u.count, 1); - dname = ext->name; + atomic_set(&p->u.count, 1); + dname = p->name; } else { dname = dentry->d_iname; } @@ -1673,12 +1659,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } - if (unlikely(ext)) { - pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); - mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, - ksize(ext)); - } - this_cpu_inc(nr_dentry); return dentry; @@ -2707,7 +2687,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - call_rcu(&old_name->u.head, __d_free_external_name); + kfree_rcu(old_name, u.head); } /* From b29940c1abd7a4c3abeb926df0a5ec84d6902d47 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:46 -0700 Subject: [PATCH 052/132] mm: rename and change semantics of nr_indirectly_reclaimable_bytes The vmstat counter NR_INDIRECTLY_RECLAIMABLE_BYTES was introduced by commit eb59254608bc ("mm: introduce NR_INDIRECTLY_RECLAIMABLE_BYTES") with the goal of accounting objects that can be reclaimed, but cannot be allocated via a SLAB_RECLAIM_ACCOUNT cache. 
This is now possible via kmalloc() with __GFP_RECLAIMABLE flag, and the dcache external names user is converted. The counter is however still useful for accounting direct page allocations (i.e. not slab) with a shrinker, such as the ION page pool. So keep it, and: - change granularity to pages to be more like other counters; sub-page allocations should be able to use kmalloc - rename the counter to NR_KERNEL_MISC_RECLAIMABLE - expose the counter again in vmstat as "nr_kernel_misc_reclaimable"; we can again remove the check for not printing "hidden" counters Link: http://lkml.kernel.org/r/20180731090649.16028-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: Roman Gushchin Cc: Vijayanand Jitta Cc: Laura Abbott Cc: Sumit Semwal Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/ion/ion_page_pool.c | 8 ++++---- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 19 +++++++------------ mm/util.c | 3 +-- mm/vmstat.c | 6 +----- 5 files changed, 14 insertions(+), 24 deletions(-) diff --git a/drivers/staging/android/ion/ion_page_pool.c b/drivers/staging/android/ion/ion_page_pool.c index 9bc56eb48d2a..0d2a95957ee8 100644 --- a/drivers/staging/android/ion/ion_page_pool.c +++ b/drivers/staging/android/ion/ion_page_pool.c @@ -33,8 +33,8 @@ static void ion_page_pool_add(struct ion_page_pool *pool, struct page *page) pool->low_count++; } - mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, - (1 << (PAGE_SHIFT + pool->order))); + mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, + 1 << pool->order); mutex_unlock(&pool->mutex); } @@ -53,8 +53,8 @@ static struct page *ion_page_pool_remove(struct ion_page_pool *pool, bool high) } list_del(&page->lru); - mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, - -(1 << (PAGE_SHIFT + pool->order))); + mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, + -(1 << pool->order)); return page; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d4b0c79d2924..7bbeba21f6a3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -180,7 +180,7 @@ enum node_stat_item { NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ - NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */ + NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 747031c2352d..20f25d06c00c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4701,6 +4701,7 @@ long si_mem_available(void) unsigned long pagecache; unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + unsigned long reclaimable; struct zone *zone; int lru; @@ -4726,19 +4727,13 @@ long si_mem_available(void) available += pagecache; /* - * Part of the reclaimable slab consists of items that are in use, - * and cannot be freed. Cap this estimate at the low watermark. + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. */ - available += global_node_page_state(NR_SLAB_RECLAIMABLE) - - min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, - wmark_low); - - /* - * Part of the kernel memory, which can be released under memory - * pressure. 
- */ - available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> - PAGE_SHIFT; + reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) available = 0; diff --git a/mm/util.c b/mm/util.c index 470f5cd80b64..f740754f5012 100644 --- a/mm/util.c +++ b/mm/util.c @@ -678,8 +678,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * Part of the kernel memory, which can be released * under memory pressure. */ - free += global_node_page_state( - NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); /* * Leave reserved pages. The pages are not for anonymous pages. diff --git a/mm/vmstat.c b/mm/vmstat.c index 7878da76abf2..2cec2fa4c8ae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1161,7 +1161,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", - "", /* nr_indirectly_reclaimable */ + "nr_kernel_misc_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1706,10 +1706,6 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - /* Skip hidden vmstat items. */ - if (*vmstat_text[off] == '\0') - return 0; - seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); From 61f94e18de94f79abaad3bb83549ff78923ac785 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:50 -0700 Subject: [PATCH 053/132] mm, proc: add KReclaimable to /proc/meminfo The vmstat NR_KERNEL_MISC_RECLAIMABLE counter is for kernel non-slab allocations that can be reclaimed via shrinker. In /proc/meminfo, we can show the sum of all reclaimable kernel allocations (including slab) as "KReclaimable". Add the same counter also to per-node meminfo under /sys With this counter, users will have more complete information about kernel memory usage. Non-slab reclaimable pages (currently just the ION allocator) will not be missing from /proc/meminfo, making users wonder where part of their memory went. More precisely, they already appear in MemAvailable, but without the new counter, it's not obvious why the value in MemAvailable doesn't fully correspond with the sum of other counters participating in it. 
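In other words, the new field is computed as (schematic; the real code is in the fs/proc/meminfo.c hunk below):

  /* both counters are in pages; show_val_kb() converts to kB for /proc/meminfo */
  unsigned long kreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
                               global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);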
Link: http://lkml.kernel.org/r/20180731090649.16028-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Laura Abbott Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sumit Semwal Cc: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ drivers/base/node.c | 19 ++++++++++++------- fs/proc/meminfo.c | 16 ++++++++-------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 22b4b00dee31..12a5e6e693b6 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -858,6 +858,7 @@ Writeback: 0 kB AnonPages: 861800 kB Mapped: 280372 kB Shmem: 644 kB +KReclaimable: 168048 kB Slab: 284364 kB SReclaimable: 159856 kB SUnreclaim: 124508 kB @@ -925,6 +926,9 @@ AnonHugePages: Non-file backed huge pages mapped into userspace page tables ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated with huge pages ShmemPmdMapped: Shared memory mapped into userspace with huge pages +KReclaimable: Kernel allocations that the kernel will attempt to reclaim + under memory pressure. Includes SReclaimable (below), and other + direct allocations with a shrinker. Slab: in-kernel data structures cache SReclaimable: Part of Slab, that might be reclaimed, such as caches SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure diff --git a/drivers/base/node.c b/drivers/base/node.c index 1ac4c36e13bb..86d6cd92ce3d 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -67,8 +67,11 @@ static ssize_t node_read_meminfo(struct device *dev, int nid = dev->id; struct pglist_data *pgdat = NODE_DATA(nid); struct sysinfo i; + unsigned long sreclaimable, sunreclaimable; si_meminfo_node(&i, nid); + sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE); + sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE); n = sprintf(buf, "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" @@ -118,6 +121,7 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" "Node %d WritebackTmp: %8lu kB\n" + "Node %d KReclaimable: %8lu kB\n" "Node %d Slab: %8lu kB\n" "Node %d SReclaimable: %8lu kB\n" "Node %d SUnreclaim: %8lu kB\n" @@ -138,20 +142,21 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) + - node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), - nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)), + nid, K(sreclaimable + + node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)), + nid, K(sreclaimable + sunreclaimable), + nid, K(sreclaimable), + nid, K(sunreclaimable) #ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), + , nid, K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * - HPAGE_PMD_NR)); -#else - nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE))); + HPAGE_PMD_NR) #endif + ); n += hugetlb_report_node_meminfo(nid, buf + n); return n; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index edda898714eb..568d90e17c17 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ 
-38,6 +38,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) long cached; long available; unsigned long pages[NR_LRU_LISTS]; + unsigned long sreclaimable, sunreclaim; int lru; si_meminfo(&i); @@ -53,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); + sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE); + sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); @@ -94,14 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Mapped: ", global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); - show_val_kb(m, "Slab: ", - global_node_page_state(NR_SLAB_RECLAIMABLE) + - global_node_page_state(NR_SLAB_UNRECLAIMABLE)); - - show_val_kb(m, "SReclaimable: ", - global_node_page_state(NR_SLAB_RECLAIMABLE)); - show_val_kb(m, "SUnreclaim: ", - global_node_page_state(NR_SLAB_UNRECLAIMABLE)); + show_val_kb(m, "KReclaimable: ", sreclaimable + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); + show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); + show_val_kb(m, "SReclaimable: ", sreclaimable); + show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", From f0d77874143df90f9831f30254eb149fc4d76b40 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:55 -0700 Subject: [PATCH 054/132] mm, slab: shorten kmalloc cache names for large sizes Kmalloc cache names can get quite long for large object sizes, when the sizes are expressed in bytes. Use 'k' and 'M' prefixes to make the names as short as possible e.g. in /proc/slabinfo. This works, as we mostly use power-of-two sizes, with exceptions only below 1k. 
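A userspace sketch of that reduction (illustrative only; the in-kernel helper
is the kmalloc_cache_name() added further down):

#include <stdio.h>

/* Sketch: turn a power-of-two byte count into the short cache name. */
static void kmalloc_name(char *buf, size_t len, unsigned int size)
{
	static const char *units[] = { "", "k", "M" };
	int idx = 0;

	while (size >= 1024 && size % 1024 == 0) {
		size /= 1024;
		idx++;
	}
	snprintf(buf, len, "kmalloc-%u%s", size, units[idx]);
}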
Example: 'kmalloc-4194304' becomes 'kmalloc-4M' Link: http://lkml.kernel.org/r/20180731090649.16028-7-vbabka@suse.cz Suggested-by: Matthew Wilcox Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Christoph Lameter Acked-by: Roman Gushchin Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Laura Abbott Cc: Michal Hocko Cc: Sumit Semwal Cc: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 5b19439fd862..7eb8dc136c1c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1050,15 +1050,15 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { {"kmalloc-16", 16}, {"kmalloc-32", 32}, {"kmalloc-64", 64}, {"kmalloc-128", 128}, {"kmalloc-256", 256}, {"kmalloc-512", 512}, - {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, - {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, - {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, - {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, - {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, - {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, - {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, - {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, - {"kmalloc-67108864", 67108864} + {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048}, + {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192}, + {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768}, + {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072}, + {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288}, + {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152}, + {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608}, + {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432}, + {"kmalloc-64M", 67108864} }; /* @@ -1108,6 +1108,21 @@ void __init setup_kmalloc_cache_index_table(void) } } +static const char * +kmalloc_cache_name(const char *prefix, unsigned int size) +{ + + static const char units[3] = "\0kM"; + int idx = 0; + + while (size >= 1024 && (size % 1024 == 0)) { + size /= 1024; + idx++; + } + + return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]); +} + static void __init new_kmalloc_cache(int idx, int type, slab_flags_t flags) { @@ -1115,7 +1130,7 @@ new_kmalloc_cache(int idx, int type, slab_flags_t flags) if (type == KMALLOC_RECLAIM) { flags |= SLAB_RECLAIM_ACCOUNT; - name = kasprintf(GFP_NOWAIT, "kmalloc-rcl-%u", + name = kmalloc_cache_name("kmalloc-rcl", kmalloc_info[idx].size); BUG_ON(!name); } else { @@ -1164,8 +1179,7 @@ void __init create_kmalloc_caches(slab_flags_t flags) if (s) { unsigned int size = kmalloc_size(i); - char *n = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%u", size); + const char *n = kmalloc_cache_name("dma-kmalloc", size); BUG_ON(!n); kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( From 95f9ab2d596e8cbb388315e78c82b9a131bf2928 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:05:59 -0700 Subject: [PATCH 055/132] mm: workingset: don't drop refault information prematurely Patch series "psi: pressure stall information for CPU, memory, and IO", v4. Overview PSI reports the overall wallclock time in which the tasks in a system (or cgroup) wait for (contended) hardware resources. 
This helps users understand the resource pressure their workloads are under, which allows them to rootcause and fix throughput and latency problems caused by overcommitting, underprovisioning, suboptimal job placement in a grid; as well as anticipate major disruptions like OOM. Real-world applications We're using the data collected by PSI (and its previous incarnation, memdelay) quite extensively at Facebook, and with several success stories. One usecase is avoiding OOM hangs/livelocks. The reason these happen is because the OOM killer is triggered by reclaim not being able to free pages, but with fast flash devices there is *always* some clean and uptodate cache to reclaim; the OOM killer never kicks in, even as tasks spend 90% of the time thrashing the cache pages of their own executables. There is no situation where this ever makes sense in practice. We wrote a <100 line POC python script to monitor memory pressure and kill stuff way before such pathological thrashing leads to full system losses that would require forcible hard resets. We've since extended and deployed this code into other places to guarantee latency and throughput SLAs, since they're usually violated way before the kernel OOM killer would ever kick in. It is available here: https://github.com/facebookincubator/oomd Eventually we probably want to trigger the in-kernel OOM killer based on extreme sustained pressure as well, so that Linux can avoid memory livelocks - which technically aren't deadlocks, but to the user indistinguishable from them - out of the box. We'd continue using OOMD as the first line of defense to ensure workload health and implement complex kill policies that are beyond the scope of the kernel. We also use PSI memory pressure for loadshedding. Our batch job infrastructure used to use heuristics based on various VM stats to anticipate OOM situations, with lackluster success. We switched it to PSI and managed to anticipate and avoid OOM kills and lockups fairly reliably. The reduction of OOM outages in the worker pool raised the pool's aggregate productivity, and we were able to switch that service to smaller machines. Lastly, we use cgroups to isolate a machine's main workload from maintenance crap like package upgrades, logging, configuration, as well as to prevent multiple workloads on a machine from stepping on each others' toes. We were not able to configure this properly without the pressure metrics; we would see latency or bandwidth drops, but it would often be hard to impossible to rootcause it post-mortem. We now log and graph pressure for the containers in our fleet and can trivially link latency spikes and throughput drops to shortages of specific resources after the fact, and fix the job config/scheduling. PSI has also received testing, feedback, and feature requests from Android and EndlessOS for the purpose of low-latency OOM killing, to intervene in pressure situations before the UI starts hanging. How do you use this feature? A kernel with CONFIG_PSI=y will create a /proc/pressure directory with 3 files: cpu, memory, and io. If using cgroup2, cgroups will also have cpu.pressure, memory.pressure and io.pressure files, which simply aggregate task stalls at the cgroup level instead of system-wide. The cpu file contains one line: some avg10=2.04 avg60=0.75 avg300=0.40 total=157656722 The averages give the percentage of walltime in which one or more tasks are delayed on the runqueue while another task has the CPU. 
They're recent averages over 10s, 1m, 5m windows, so you can tell short term trends from long term ones, similarly to the load average. The total= value gives the absolute stall time in microseconds. This allows detecting latency spikes that might be too short to sway the running averages. It also allows custom time averaging in case the 10s/1m/5m windows aren't adequate for the usecase (or are too coarse with future hardware). What to make of this "some" metric? If CPU utilization is at 100% and CPU pressure is 0, it means the system is perfectly utilized, with one runnable thread per CPU and nobody waiting. At two or more runnable tasks per CPU, the system is 100% overcommitted and the pressure average will indicate as much. From a utilization perspective this is a great state of course: no CPU cycles are being wasted, even when 50% of the threads were to go idle (as most workloads do vary). From the perspective of the individual job it's not great, however, and they would do better with more resources. Depending on what your priority and options are, raised "some" numbers may or may not require action. The memory file contains two lines: some avg10=70.24 avg60=68.52 avg300=69.91 total=3559632828 full avg10=57.59 avg60=58.06 avg300=60.38 total=3300487258 The some line is the same as for cpu, the time in which at least one task is stalled on the resource. In the case of memory, this includes waiting on swap-in, page cache refaults and page reclaim. The full line, however, indicates time in which *nobody* is using the CPU productively due to pressure: all non-idle tasks are waiting for memory in one form or another. Significant time spent in there is a good trigger for killing things, moving jobs to other machines, or dropping incoming requests, since neither the jobs nor the machine overall are making too much headway. The io file is similar to memory. Because the block layer doesn't have a concept of hardware contention right now (how much longer is my IO request taking due to other tasks?), it reports CPU potential lost on all IO delays, not just the potential lost due to competition. FAQ Q: How is PSI's CPU component different from the load average? A: There are several quirks in the load average that make it hard to impossible to tell how overcommitted the CPU really is. 1. The load average is reported as a raw number of active tasks. You need to know how many CPUs there are in the system, how many CPUs the workload is allowed to use, then think about what the proportion between load and the number of CPUs mean for the tasks trying to run. PSI reports the percentage of wallclock time in which tasks are waiting for a CPU to run on. It doesn't matter how many CPUs are present or usable. The number always tells the quality of life of tasks in the system or in a particular cgroup. 2. The shortest averaging window is 1m, which is extremely coarse, and it's sampled in 5s intervals. A *lot* can happen on a CPU in 5 seconds. This *may* be able to identify persistent long-term trends and very clear and obvious overloads, but it's unusable for latency spikes and more subtle overutilization. PSI's shortest window is 10s. It also exports the cumulative stall times (in microseconds) of synchronously recorded events. 3. On Linux, the load average for historical reasons includes all TASK_UNINTERRUPTIBLE tasks. 
This gives a broader sense of how busy the system is, but on the flipside it doesn't distinguish whether tasks are likely to contend over the CPU or IO - which obviously requires very different interventions from a sys admin or a job scheduler. PSI reports independent metrics for CPU and IO. You can tell which resource is making the tasks wait, but in conjunction still see how overloaded the system is overall. Q: What's the cost / performance impact of this feature? A: PSI's primary cost is in the scheduler, in particular task wakeups and sleeps. I benchmarked this code using Facebook's two most scheduling sensitive workloads: memcache and webserver. They handle a ton of small requests - lots of wakeups and sleeps with little actual work in between - so they tend to be canaries for scheduler regressions. In the tests, the boxes were handling live traffic over the course of several hours. Half the machines, the control, ran with CONFIG_PSI=n. For memcache I used eight machines total. They're 2-socket, 14 core, 56 thread boxes. The test runs for half the test period, flips the test and control kernels on the hardware to rule out HW factors, DC location etc., then runs the other half of the test. For the webservers, I used 32 machines total. They're single socket, 16 core, 32 thread machines. During the memcache test, CPU load was nopsi=78.05% psi=78.98% in the first half and nopsi=77.52% psi=78.25%, so PSI added between 0.7 and 0.9 percentage points to the CPU load, a difference of about 1%. UPDATE: I re-ran this test with the v3 version of this patch set and the CPU utilization was equivalent between test and control. UPDATE: v4 is on par with v3. As far as end-to-end request latency from the client perspective goes, we don't sample those finely enough to capture the requests going to those particular machines during the test, but we know the p50 turnaround time in this workload is 54us, and perf bench sched pipe on those machines show nopsi=5.232666 us/op and psi=5.587347 us/op, so this doesn't add much here either. The profile for the pipe benchmark shows: 0.87% sched-pipe [kernel.vmlinux] [k] psi_group_change 0.83% perf.real [kernel.vmlinux] [k] psi_group_change 0.82% perf.real [kernel.vmlinux] [k] psi_task_change 0.58% sched-pipe [kernel.vmlinux] [k] psi_task_change The webserver load is running inside 4 nested cgroup levels. The CPU load with both nopsi and psi kernels was indistinguishable at 81%. For comparison, we had to disable the cgroup cpu controller on the webservers because it added 4 percentage points to the CPU% during this same exact test. Versions of this accounting code now run on 80% of our fleet. None of our workloads have reported regressions during the rollout. Daniel Drake said: : I just retested the latest version at : http://git.cmpxchg.org/cgit.cgi/linux-psi.git (Linux 4.18) and the results : are great. : : Test setup: : Endless OS : GeminiLake N4200 low end laptop : 2GB RAM : swap (and zram swap) disabled : : Baseline test: open a handful of large-ish apps and several website : tabs in Google Chrome. 
: : Results: after a couple of minutes, system is excessively thrashing, mouse : cursor can barely be moved, UI is not responding to mouse clicks, so it's : impractical to recover from this situation as an ordinary user : : Add my simple killer: : https://gist.github.com/dsd/a8988bf0b81a6163475988120fe8d9cd : : Results: when the thrashing causes the UI to become sluggish, the killer : steps in and kills something (usually a chrome tab), and the system : remains usable. I repeatedly opened more apps and more websites over a 15 : minute period but I wasn't able to get the system to a point of UI : unresponsiveness. Suren said: : Backported to 4.9 and retested on ARMv8 8 code system running Android. : Signals behave as expected reacting to memory pressure, no jumps in : "total" counters that would indicate an overflow/underflow issues. Nicely : done! This patch (of 9): If we keep just enough refault information to match the *current* page cache during reclaim time, we could lose a lot of events when there is only a temporary spike in non-cache memory consumption that pushes out all the cache. Once cache comes back, we won't see those refaults. They might not be actionable for LRU aging, but we want to know about them for measuring memory pressure. [hannes@cmpxchg.org: switch to NUMA-aware lru and slab counters] Link: http://lkml.kernel.org/r/20181009184732.762-2-hannes@cmpxchg.org Link: http://lkml.kernel.org/r/20180828172258.3185-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Ingo Molnar Cc: Tejun Heo Cc: Vinayak Menon Cc: Christopher Lameter Cc: Peter Enderborg Cc: Shakeel Butt Cc: Mike Galbraith Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index 4516dd790129..7d5fa0dd2b38 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -364,7 +364,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, { unsigned long max_nodes; unsigned long nodes; - unsigned long cache; + unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); @@ -390,14 +390,20 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE */ +#ifdef CONFIG_MEMCG if (sc->memcg) { - cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + struct lruvec *lruvec; + + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL); + lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + } else +#endif + pages = node_present_pages(sc->nid); + + max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3); if (!nodes) return SHRINK_EMPTY; From 1899ad18c6072d689896badafb81267b0a1092a4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:04 -0700 Subject: [PATCH 056/132] mm: workingset: tell cache transitions from workingset thrashing Refaults happen during transitions between workingsets as well as in-place thrashing. 
Knowing the difference between the two has a range of applications, including measuring the impact of memory shortage on the system performance, as well as the ability to smarter balance pressure between the filesystem cache and the swap-backed workingset. During workingset transitions, inactive cache refaults and pushes out established active cache. When that active cache isn't stale, however, and also ends up refaulting, that's bonafide thrashing. Introduce a new page flag that tells on eviction whether the page has been active or not in its lifetime. This bit is then stored in the shadow entry, to classify refaults as transitioning or thrashing. How many page->flags does this leave us with on 32-bit? 20 bits are always page flags 21 if you have an MMU 23 with the zone bits for DMA, Normal, HighMem, Movable 29 with the sparsemem section bits 30 if PAE is enabled 31 with this patch. So on 32-bit PAE, that leaves 1 bit for distinguishing two NUMA nodes. If that's not enough, the system can switch to discontigmem and re-gain the 6 or 7 sparsemem section bits. Link: http://lkml.kernel.org/r/20180828172258.3185-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + include/linux/page-flags.h | 5 +- include/linux/swap.h | 2 +- include/trace/events/mmflags.h | 1 + mm/filemap.c | 9 ++-- mm/huge_memory.c | 1 + mm/migrate.c | 2 + mm/swap_state.c | 1 + mm/vmscan.c | 1 + mm/vmstat.c | 1 + mm/workingset.c | 95 ++++++++++++++++++++++------------ 11 files changed, 77 insertions(+), 42 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7bbeba21f6a3..ba51d5bf7af1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -163,6 +163,7 @@ enum node_stat_item { NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, + WORKINGSET_RESTORE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 74bee8cecf4c..4d99504f6496 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -69,13 +69,14 @@ */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ - PG_error, PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_active, + PG_workingset, PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ + PG_error, PG_slab, PG_owner_priv_1, /* Owner use. 
If pagecache, fs may use*/ PG_arch_1, @@ -280,6 +281,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) +PAGEFLAG(Workingset, workingset, PF_HEAD) + TESTCLEARFLAG(Workingset, workingset, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 8e2c11e692ba..b93740d72e78 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -296,7 +296,7 @@ struct vma_swap_readahead { /* linux/mm/workingset.c */ void *workingset_eviction(struct address_space *mapping, struct page *page); -bool workingset_refault(void *shadow); +void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); /* Do not use directly, use workingset_lookup_update */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index a81cffb76d89..a1675d43777e 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -88,6 +88,7 @@ {1UL << PG_dirty, "dirty" }, \ {1UL << PG_lru, "lru" }, \ {1UL << PG_active, "active" }, \ + {1UL << PG_workingset, "workingset" }, \ {1UL << PG_slab, "slab" }, \ {1UL << PG_owner_priv_1, "owner_priv_1" }, \ {1UL << PG_arch_1, "arch_1" }, \ diff --git a/mm/filemap.c b/mm/filemap.c index de6fed2a0815..7997adce5a29 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -915,12 +915,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ - if (!(gfp_mask & __GFP_WRITE) && - shadow && workingset_refault(shadow)) { - SetPageActive(page); - workingset_activation(page); - } else - ClearPageActive(page); + WARN_ON_ONCE(PageActive(page)); + if (!(gfp_mask & __GFP_WRITE) && shadow) + workingset_refault(page, shadow); lru_cache_add(page); } return ret; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index deed97fba979..8ea1b36bd452 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2369,6 +2369,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | + (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | (1L << PG_dirty))); diff --git a/mm/migrate.c b/mm/migrate.c index 84381b55b2bd..1ea27b343ccd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -685,6 +685,8 @@ void migrate_page_states(struct page *newpage, struct page *page) SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); + if (PageWorkingset(page)) + SetPageWorkingset(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) diff --git a/mm/swap_state.c b/mm/swap_state.c index ecee9c6c4cc1..0d6a7f268d2e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -448,6 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Initiate read into locked page and return. 
*/ + SetPageWorkingset(new_page); lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; diff --git a/mm/vmscan.c b/mm/vmscan.c index 961401c46334..87e9fef341d2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2145,6 +2145,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } ClearPageActive(page); /* we are de-activating */ + SetPageWorkingset(page); list_add(&page->lru, &l_inactive); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 2cec2fa4c8ae..d918f6192d15 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1145,6 +1145,7 @@ const char * const vmstat_text[] = { "nr_isolated_file", "workingset_refault", "workingset_activate", + "workingset_restore", "workingset_nodereclaim", "nr_anon_pages", "nr_mapped", diff --git a/mm/workingset.c b/mm/workingset.c index 7d5fa0dd2b38..99b7f7c09b13 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -121,7 +121,7 @@ * the only thing eating into inactive list space is active pages. * * - * Activating refaulting pages + * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given @@ -134,6 +134,10 @@ * used less frequently than the refaulting page - or even not used at * all anymore. * + * That means if inactive cache is refaulting with a suitable refault + * distance, we assume the cache workingset is transitioning and put + * pressure on the current active list. + * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. @@ -141,6 +145,14 @@ * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * + * Refaulting active pages + * + * If on the other hand the refaulting pages have recently been + * deactivated, it means that the active list is no longer protecting + * actively used cache from reclaim. The cache is NOT transitioning to + * a different workingset; the existing workingset is thrashing in the + * space allocated to the page cache. 
+ * * * Implementation * @@ -156,8 +168,7 @@ */ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ - MEM_CGROUP_ID_SHIFT) + 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* @@ -170,23 +181,28 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) { eviction >>= bucket_order; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; + eviction = (eviction << 1) | workingset; eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp) + unsigned long *evictionp, bool *workingsetp) { unsigned long entry = (unsigned long)shadow; int memcgid, nid; + bool workingset; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + workingset = entry & 1; + entry >>= 1; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; + *workingsetp = workingset; } /** @@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg = page_memcg(page); struct pglist_data *pgdat = page_pgdat(page); + struct mem_cgroup *memcg = page_memcg(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, pgdat, eviction); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } /** * workingset_refault - evaluate the refault of a previously evicted page + * @page: the freshly allocated replacement page * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously * evicted page in the context of the node it was allocated in. - * - * Returns %true if the page should be activated, %false otherwise. */ -bool workingset_refault(void *shadow) +void workingset_refault(struct page *page, void *shadow) { unsigned long refault_distance; + struct pglist_data *pgdat; unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct pglist_data *pgdat; + bool workingset; int memcgid; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* @@ -263,41 +280,51 @@ bool workingset_refault(void *shadow) * configurations instead. */ memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) { - rcu_read_unlock(); - return false; - } + if (!mem_cgroup_disabled() && !memcg) + goto out; lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. 
+ * Calculate the refault distance * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. There is a + * special case: usually, shadow entries have a short lifetime + * and are either refaulted or reclaimed along with the inode + * before they get too old. But it is not impossible for the + * inactive_age to lap a shadow entry in the field, which can + * then result in a false small refault distance, leading to a + * false activation should this old entry actually refault + * again. However, earlier kernels used to deactivate + * unconditionally with *every* reclaim invocation for the + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - if (refault_distance <= active_file) { - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); - rcu_read_unlock(); - return true; + /* + * Compare the distance to the existing workingset size. We + * don't act on pages that couldn't stay resident even if all + * the memory was available to the page cache. + */ + if (refault_distance > active_file) + goto out; + + SetPageActive(page); + atomic_long_inc(&lruvec->inactive_age); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + + /* Page was active prior to eviction */ + if (workingset) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } +out: rcu_read_unlock(); - return false; } /** From b1d29ba82cf2bc784f4c963ddd6a2cf29e229b33 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:08 -0700 Subject: [PATCH 057/132] delayacct: track delays from thrashing cache pages Delay accounting already measures the time a task spends in direct reclaim and waiting for swapin, but in low memory situations tasks can spend a significant amount of their time waiting on thrashing page cache. This isn't tracked right now. To know the full impact of memory contention on an individual task, measure the delay when waiting for a recently evicted active cache page to read back into memory.
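The accounting follows the usual delayacct start/end bracket around the wait;
condensed sketch of the mm/filemap.c hunk below:

	bool thrashing = false;

	/* Only count waits for locked, not-yet-uptodate workingset pages. */
	if (bit_nr == PG_locked && !PageSwapBacked(page) &&
	    !PageUptodate(page) && PageWorkingset(page)) {
		delayacct_thrashing_start();
		thrashing = true;
	}

	/* ... sleep until the page becomes available ... */

	if (thrashing)
		delayacct_thrashing_end();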
Also update tools/accounting/getdelays.c: [hannes@computer accounting]$ sudo ./getdelays -d -p 1 print delayacct stats ON PID 1 CPU count real total virtual total delay total delay average 50318 745000000 847346785 400533713 0.008ms IO count delay total delay average 435 122601218 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 0 0 0ms THRASHING count delay total delay average 19 12621439 0ms Link: http://lkml.kernel.org/r/20180828172258.3185-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 23 +++++++++++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 15 +++++++++++++++ mm/filemap.c | 11 +++++++++++ tools/accounting/getdelays.c | 8 +++++++- 5 files changed, 61 insertions(+), 2 deletions(-) diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 31c865d1842e..577d1b25fccd 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -57,7 +57,12 @@ struct task_delay_info { u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ + + u64 thrashing_start; + u64 thrashing_delay; /* wait for thrashing page */ + u32 freepages_count; /* total count of memory reclaim */ + u32 thrashing_count; /* total count of thrash waits */ }; #endif @@ -76,6 +81,8 @@ extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); +extern void __delayacct_thrashing_start(void); +extern void __delayacct_thrashing_end(void); static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { @@ -156,6 +163,18 @@ static inline void delayacct_freepages_end(void) __delayacct_freepages_end(); } +static inline void delayacct_thrashing_start(void) +{ + if (current->delays) + __delayacct_thrashing_start(); +} + +static inline void delayacct_thrashing_end(void) +{ + if (current->delays) + __delayacct_thrashing_end(); +} + #else static inline void delayacct_set_flag(int flag) {} @@ -182,6 +201,10 @@ static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} +static inline void delayacct_thrashing_start(void) +{} +static inline void delayacct_thrashing_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index b7aa7bb2349f..5e8ca16a9079 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 8 +#define TASKSTATS_VERSION 9 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -164,6 +164,10 @@ struct taskstats { /* Delay waiting for memory reclaim */ __u64 freepages_count; __u64 freepages_delay_total; + + /* Delay waiting for thrashing page */ + __u64 thrashing_count; + __u64 thrashing_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ca8ac2824f0b..2a12b988c717 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 
0 : tmp; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; + tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; + d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; + d->thrashing_count += tsk->delays->thrashing_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -169,3 +172,15 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_count); } +void __delayacct_thrashing_start(void) +{ + current->delays->thrashing_start = ktime_get_ns(); +} + +void __delayacct_thrashing_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->thrashing_start, + ¤t->delays->thrashing_delay, + ¤t->delays->thrashing_count); +} diff --git a/mm/filemap.c b/mm/filemap.c index 7997adce5a29..01a841f17bf4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -1073,8 +1074,15 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; int ret = 0; + if (bit_nr == PG_locked && !PageSwapBacked(page) && + !PageUptodate(page) && PageWorkingset(page)) { + delayacct_thrashing_start(); + thrashing = true; + } + init_wait(wait); wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; @@ -1113,6 +1121,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, finish_wait(q, wait); + if (thrashing) + delayacct_thrashing_end(); + /* * A signal could leave PageWaiters set. Clearing it here if * !waitqueue_active would be possible (by open-coding finish_wait), diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 9f420d98b5fb..8cb504d30384 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -203,6 +203,8 @@ static void print_delayacct(struct taskstats *t) "SWAP %15s%15s%15s\n" " %15llu%15llu%15llums\n" "RECLAIM %12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "THRASHING%12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -222,7 +224,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total, - average_ms(t->freepages_delay_total, t->freepages_count)); + average_ms(t->freepages_delay_total, t->freepages_count), + "count", "delay total", "delay average", + (unsigned long long)t->thrashing_count, + (unsigned long long)t->thrashing_delay_total, + average_ms(t->thrashing_delay_total, t->thrashing_count)); } static void task_context_switch_counts(struct taskstats *t) From 8508cf3ffad4defa202b303e5b6379efc4cd9054 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:11 -0700 Subject: [PATCH 058/132] sched: loadavg: consolidate LOAD_INT, LOAD_FRAC, CALC_LOAD There are several definitions of those functions/macros in places that mess with fixed-point load averages. Provide an official version. 
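Sketch of what a call site looks like once it uses the shared helpers (the
values are made up; EXP_1, FIXED_1, LOAD_INT and LOAD_FRAC are the existing
definitions from <linux/sched/loadavg.h>):

	unsigned long avg = 0;
	unsigned long active = 3 * FIXED_1;	/* three runnable tasks, fixed point */

	avg = calc_load(avg, EXP_1, active);
	printk("load: %lu.%02lu\n", LOAD_INT(avg), LOAD_FRAC(avg));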
[akpm@linux-foundation.org: fix missed conversion in block/blk-iolatency.c] Link: http://lkml.kernel.org/r/20180828172258.3185-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../platforms/cell/cpufreq_spudemand.c | 2 +- arch/powerpc/platforms/cell/spufs/sched.c | 9 +++----- arch/s390/appldata/appldata_os.c | 4 ---- block/blk-iolatency.c | 8 ++++--- drivers/cpuidle/governors/menu.c | 4 ---- fs/proc/loadavg.c | 3 --- include/linux/sched/loadavg.h | 21 +++++++++++++++---- kernel/debug/kdb/kdb_main.c | 7 +------ kernel/sched/loadavg.c | 15 ------------- 9 files changed, 27 insertions(+), 46 deletions(-) diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index 882944c36ef5..5d8e8b6bb1cc 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -49,7 +49,7 @@ static int calc_freq(struct spu_gov_info_struct *info) cpu = info->policy->cpu; busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus); - CALC_LOAD(info->busy_spus, EXP, busy_spus * FIXED_1); + info->busy_spus = calc_load(info->busy_spus, EXP, busy_spus * FIXED_1); pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n", cpu, busy_spus, info->busy_spus); diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index c9ef3c532169..9fcccb4490b9 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -987,9 +987,9 @@ static void spu_calc_load(void) unsigned long active_tasks; /* fixed-point */ active_tasks = count_active_contexts() * FIXED_1; - CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks); - CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks); - CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks); + spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks); + spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks); + spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks); } static void spusched_wake(struct timer_list *unused) @@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx, } } -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int show_spu_loadavg(struct seq_file *s, void *private) { int a, b, c; diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 433a994b1a89..54f375627532 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -25,10 +25,6 @@ #include "appldata.h" - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - /* * OS data * diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 35c48d7b8f78..28f80d227528 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -153,7 +153,7 @@ struct iolatency_grp { #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC /* * These are the constants used to fake the fixed-point moving average - * calculation just like load average. The call to CALC_LOAD folds + * calculation just like load average. The call to calc_load() folds * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. 
The sampling * window size is bucketed to try to approximately calculate average * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows @@ -248,7 +248,7 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, return; /* - * CALC_LOAD takes in a number stored in fixed point representation. + * calc_load() takes in a number stored in fixed point representation. * Because we are using this for IO time in ns, the values stored * are significantly larger than the FIXED_1 denominator (2048). * Therefore, rounding errors in the calculation are negligible and @@ -257,7 +257,9 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, div64_u64(iolat->cur_win_nsec, BLKIOLATENCY_EXP_BUCKET_SIZE)); - CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean); + iolat->lat_avg = calc_load(iolat->lat_avg, + iolatency_exp_factors[exp_idx], + stat->rqs.mean); } static inline bool iolatency_may_queue(struct iolatency_grp *iolat, diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 575a68f31761..71979605246e 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -130,10 +130,6 @@ struct menu_device { int interval_ptr; }; - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static inline int get_loadavg(unsigned long load) { return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index d06694757201..8468baee951d 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -10,9 +10,6 @@ #include #include -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int loadavg_proc_show(struct seq_file *m, void *v) { unsigned long avnrun[3]; diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index 80bc84ba5d2a..cc9cc62bb1f8 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -22,10 +22,23 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); #define EXP_5 2014 /* 1/exp(5sec/5min) */ #define EXP_15 2037 /* 1/exp(5sec/15min) */ -#define CALC_LOAD(load,exp,n) \ - load *= exp; \ - load += n*(FIXED_1-exp); \ - load >>= FSHIFT; +/* + * a1 = a0 * e + a * (1 - e) + */ +static inline unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; +} + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) extern void calc_global_load(unsigned long ticks); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2ddfce8f1e8f..bb4fe4e1a601 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv) } kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); - /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); -#undef LOAD_INT -#undef LOAD_FRAC + /* Display in kilobytes */ #define 
K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a171c1258109..54fbdfb2d86c 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,21 +91,6 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - unsigned long newload; - - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; - - return newload / FIXED_1; -} - #ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. From 5c54f5b9edb1aa2eabbb1091c458f1b6776a1896 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:16 -0700 Subject: [PATCH 059/132] sched: loadavg: make calc_load_n() public It's going to be used in a later patch. Keep the churn separate. Link: http://lkml.kernel.org/r/20180828172258.3185-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/loadavg.h | 3 + kernel/sched/loadavg.c | 138 +++++++++++++++++----------------- 2 files changed, 72 insertions(+), 69 deletions(-) diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index cc9cc62bb1f8..4859bea47a7b 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -37,6 +37,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return newload / FIXED_1; } +extern unsigned long calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n); + #define LOAD_INT(x) ((x) >> FSHIFT) #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 54fbdfb2d86c..28a516575c18 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,6 +91,75 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. 
+ */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + #ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. @@ -210,75 +279,6 @@ static long calc_load_nohz_fold(void) return delta; } -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) { - for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - /* * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into From 1f351d7f7590857ea281579c26e6045b4c548ef4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:19 -0700 Subject: [PATCH 060/132] sched: sched.h: make rq locking and clock functions available in stats.h kernel/sched/sched.h includes "stats.h" half-way through the file. 
The next patch introduces users of sched.h's rq locking functions and update_rq_clock() in kernel/sched/stats.h. Move those definitions up in the file so they are available in stats.h. Link: http://lkml.kernel.org/r/20180828172258.3185-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/sched.h | 164 +++++++++++++++++++++---------------------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b8c007713b3b..65a75b317935 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -957,6 +957,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +extern void update_rq_clock(struct rq *rq); + static inline u64 __rq_clock_broken(struct rq *rq) { return READ_ONCE(rq->clock); @@ -1075,6 +1077,86 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif } +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, @@ -1717,8 +1799,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -extern void update_rq_clock(struct rq *rq); - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1783,86 +1863,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) #endif #endif -struct rq *__task_rq_lock(struct 
task_struct *p, struct rq_flags *rf) - __acquires(rq->lock); - -struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(p->pi_lock) - __acquires(rq->lock); - -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - __releases(rq->lock) - __releases(p->pi_lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -} - -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irqsave(&rq->lock, rf->flags); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irq(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); -} - -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -} - -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); -} - -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT From 246b3b3342c9b0a2e24cda2178be87bc36e1c874 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:23 -0700 Subject: [PATCH 061/132] sched: introduce this_rq_lock_irq() do_sched_yield() disables IRQs, looks up this_rq() and locks it. The next patch is adding another site with the same pattern, so provide a convenience function for it. 
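For illustration only, here is an editorial sketch (not part of the patch; the example function name is made up, while this_rq_lock_irq() and rq_unlock_irq() are the kernel helpers shown in this and the previous patch) of how a call site that previously open-coded the sequence ends up looking:

	static void example_use_this_rq(void)
	{
		struct rq_flags rf;
		struct rq *rq;

		rq = this_rq_lock_irq(&rf);	/* disable IRQs, then lock and pin this CPU's runqueue */
		/* ... operate on rq while holding rq->lock with IRQs off ... */
		rq_unlock_irq(rq, &rf);		/* unpin, unlock, and re-enable IRQs */
	}

The point of the helper is simply that the IRQ disable, the this_rq() lookup, and the rq_lock() pinning now live behind one call.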
Link: http://lkml.kernel.org/r/20180828172258.3185-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 4 +--- kernel/sched/sched.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e696b03e99d..f3efef387797 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4933,9 +4933,7 @@ static void do_sched_yield(void) struct rq_flags rf; struct rq *rq; - local_irq_disable(); - rq = this_rq(); - rq_lock(rq, &rf); + rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 65a75b317935..1de189bb9209 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1157,6 +1157,18 @@ rq_unlock(struct rq *rq, struct rq_flags *rf) raw_spin_unlock(&rq->lock); } +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, From eb414681d5a07d28d2ff90dc05f69ec6b232ebd2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:27 -0700 Subject: [PATCH 062/132] psi: pressure stall information for CPU, memory, and IO When systems are overcommitted and resources become contended, it's hard to tell exactly the impact this has on workload productivity, or how close the system is to lockups and OOM kills. In particular, when machines work multiple jobs concurrently, the impact of overcommit in terms of latency and throughput on the individual job can be enormous. In order to maximize hardware utilization without sacrificing individual job health or risk complete machine lockups, this patch implements a way to quantify resource pressure in the system. A kernel built with CONFIG_PSI=y creates files in /proc/pressure/ that expose the percentage of time the system is stalled on CPU, memory, or IO, respectively. Stall states are aggregate versions of the per-task delay accounting delays: cpu: some tasks are runnable but not executing on a CPU memory: tasks are reclaiming, or waiting for swapin or thrashing cache io: tasks are waiting for io completions These percentages of walltime can be thought of as pressure percentages, and they give a general sense of system health and productivity loss incurred by resource overcommit. They can also indicate when the system is approaching lockup scenarios and OOMs. To do this, psi keeps track of the task states associated with each CPU and samples the time they spend in stall states. Every 2 seconds, the samples are averaged across CPUs - weighted by the CPUs' non-idle time to eliminate artifacts from unused CPUs - and translated into percentages of walltime. A running average of those percentages is maintained over 10s, 1m, and 5m periods (similar to the loadaverage). 
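To make the averaging concrete, here is a stand-alone user-space sketch (an editorial illustration, not part of the patch). It reuses the fixed-point calc_load() update from include/linux/sched/loadavg.h and the EXP_10s constant that kernel/sched/psi.c defines below, and assumes a workload that is stalled for 500ms of every 2-second sampling period, i.e. 25%:

	#include <stdio.h>

	#define FSHIFT		11			/* bits of fixed-point precision */
	#define FIXED_1		(1 << FSHIFT)		/* 1.0 in fixed-point */
	#define LOAD_INT(x)	((x) >> FSHIFT)
	#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)
	#define EXP_10s		1677			/* 1/exp(2s/10s) in fixed-point */

	/* Same update as calc_load() in include/linux/sched/loadavg.h */
	static unsigned long calc_load(unsigned long load, unsigned long exp,
				       unsigned long active)
	{
		unsigned long newload;

		newload = load * exp + active * (FIXED_1 - exp);
		if (active >= load)
			newload += FIXED_1 - 1;
		return newload / FIXED_1;
	}

	int main(void)
	{
		unsigned long pct = 25 * FIXED_1;	/* 25% stall, scaled as in calc_avgs() */
		unsigned long avg10 = 0;
		int i;

		for (i = 1; i <= 5; i++) {
			avg10 = calc_load(avg10, EXP_10s, pct);
			printf("after sample %d: avg10=%lu.%02lu\n",
			       i, LOAD_INT(avg10), LOAD_FRAC(avg10));
		}
		return 0;
	}

Each sample pulls the 10s average a fixed fraction of the way toward the current 25% figure; this is how the per-period stall percentages are folded into avg10/avg60/avg300, and missed idle periods are decayed the same way via calc_load_n().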
[hannes@cmpxchg.org: doc fixlet, per Randy] Link: http://lkml.kernel.org/r/20180828205625.GA14030@cmpxchg.org [hannes@cmpxchg.org: code optimization] Link: http://lkml.kernel.org/r/20180907175015.GA8479@cmpxchg.org [hannes@cmpxchg.org: rename psi_clock() to psi_update_work(), per Peter] Link: http://lkml.kernel.org/r/20180907145404.GB11088@cmpxchg.org [hannes@cmpxchg.org: fix build] Link: http://lkml.kernel.org/r/20180913014222.GA2370@cmpxchg.org Link: http://lkml.kernel.org/r/20180828172258.3185-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/psi.txt | 64 +++ include/linux/psi.h | 28 ++ include/linux/psi_types.h | 92 +++++ include/linux/sched.h | 10 + init/Kconfig | 15 + kernel/fork.c | 4 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 12 +- kernel/sched/psi.c | 657 +++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 + kernel/sched/stats.h | 86 ++++ mm/compaction.c | 5 + mm/filemap.c | 15 +- mm/page_alloc.c | 9 + mm/vmscan.c | 9 + 15 files changed, 1003 insertions(+), 6 deletions(-) create mode 100644 Documentation/accounting/psi.txt create mode 100644 include/linux/psi.h create mode 100644 include/linux/psi_types.h create mode 100644 kernel/sched/psi.c diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt new file mode 100644 index 000000000000..3753a82f1cf5 --- /dev/null +++ b/Documentation/accounting/psi.txt @@ -0,0 +1,64 @@ +================================ +PSI - Pressure Stall Information +================================ + +:Date: April, 2018 +:Author: Johannes Weiner + +When CPU, memory or IO devices are contended, workloads experience +latency spikes, throughput losses, and run the risk of OOM kills. + +Without an accurate measure of such contention, users are forced to +either play it safe and under-utilize their hardware resources, or +roll the dice and frequently suffer the disruptions resulting from +excessive overcommit. + +The psi feature identifies and quantifies the disruptions caused by +such resource crunches and the time impact it has on complex workloads +or even entire systems. + +Having an accurate measure of productivity losses caused by resource +scarcity aids users in sizing workloads to hardware--or provisioning +hardware according to workload demand. + +As psi aggregates this information in realtime, systems can be managed +dynamically using techniques such as load shedding, migrating jobs to +other systems or data centers, or strategically pausing or killing low +priority or restartable batch jobs. + +This allows maximizing hardware utilization without sacrificing +workload health or risking major disruptions such as OOM kills. + +Pressure interface +================== + +Pressure information for each resource is exported through the +respective file in /proc/pressure/ -- cpu, memory, and io. + +The format for CPU is as such: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +and for memory and IO: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +The "some" line indicates the share of time in which at least some +tasks are stalled on a given resource. 
+ +The "full" line indicates the share of time in which all non-idle +tasks are stalled on a given resource simultaneously. In this state +actual CPU cycles are going to waste, and a workload that spends +extended time in this state is considered to be thrashing. This has +severe impact on performance, and it's useful to distinguish this +situation from a state where some tasks are stalled but the CPU is +still doing productive work. As such, time spent in this subset of the +stall state is tracked separately and exported in the "full" averages. + +The ratios are tracked as recent trends over ten, sixty, and three +hundred second windows, which gives insight into short term events as +well as medium and long term trends. The total absolute stall time is +tracked and exported as well, to allow detection of latency spikes +which wouldn't necessarily make a dent in the time averages, or to +average trends over custom time frames. diff --git a/include/linux/psi.h b/include/linux/psi.h new file mode 100644 index 000000000000..b0daf050de58 --- /dev/null +++ b/include/linux/psi.h @@ -0,0 +1,28 @@ +#ifndef _LINUX_PSI_H +#define _LINUX_PSI_H + +#include +#include + +#ifdef CONFIG_PSI + +extern bool psi_disabled; + +void psi_init(void); + +void psi_task_change(struct task_struct *task, int clear, int set); + +void psi_memstall_tick(struct task_struct *task, int cpu); +void psi_memstall_enter(unsigned long *flags); +void psi_memstall_leave(unsigned long *flags); + +#else /* CONFIG_PSI */ + +static inline void psi_init(void) {} + +static inline void psi_memstall_enter(unsigned long *flags) {} +static inline void psi_memstall_leave(unsigned long *flags) {} + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_H */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h new file mode 100644 index 000000000000..2cf422db5d18 --- /dev/null +++ b/include/linux/psi_types.h @@ -0,0 +1,92 @@ +#ifndef _LINUX_PSI_TYPES_H +#define _LINUX_PSI_TYPES_H + +#include +#include + +#ifdef CONFIG_PSI + +/* Tracked task states */ +enum psi_task_count { + NR_IOWAIT, + NR_MEMSTALL, + NR_RUNNING, + NR_PSI_TASK_COUNTS, +}; + +/* Task state bitmasks */ +#define TSK_IOWAIT (1 << NR_IOWAIT) +#define TSK_MEMSTALL (1 << NR_MEMSTALL) +#define TSK_RUNNING (1 << NR_RUNNING) + +/* Resources that workloads could be stalled on */ +enum psi_res { + PSI_IO, + PSI_MEM, + PSI_CPU, + NR_PSI_RESOURCES, +}; + +/* + * Pressure states for each resource: + * + * SOME: Stalled tasks & working tasks + * FULL: Stalled tasks & no working tasks + */ +enum psi_states { + PSI_IO_SOME, + PSI_IO_FULL, + PSI_MEM_SOME, + PSI_MEM_FULL, + PSI_CPU_SOME, + /* Only per-CPU, to weigh the CPU in the global average: */ + PSI_NONIDLE, + NR_PSI_STATES, +}; + +struct psi_group_cpu { + /* 1st cacheline updated by the scheduler */ + + /* Aggregator needs to know of concurrent changes */ + seqcount_t seq ____cacheline_aligned_in_smp; + + /* States of the tasks belonging to this group */ + unsigned int tasks[NR_PSI_TASK_COUNTS]; + + /* Period time sampling buckets for each state of interest (ns) */ + u32 times[NR_PSI_STATES]; + + /* Time of last task change in this group (rq_clock) */ + u64 state_start; + + /* 2nd cacheline updated by the aggregator */ + + /* Delta detection against the sampling buckets */ + u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp; +}; + +struct psi_group { + /* Protects data updated during an aggregation */ + struct mutex stat_lock; + + /* Per-cpu task state & time tracking */ + struct psi_group_cpu __percpu *pcpu; + + /* 
Periodic aggregation state */ + u64 total_prev[NR_PSI_STATES - 1]; + u64 last_update; + u64 next_update; + struct delayed_work clock_work; + + /* Total stall times and sampled pressure averages */ + u64 total[NR_PSI_STATES - 1]; + unsigned long avg[NR_PSI_STATES - 1][3]; +}; + +#else /* CONFIG_PSI */ + +struct psi_group { }; + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index adfb3f9a7597..b8fcc6b3080c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -706,6 +707,10 @@ struct task_struct { unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_remote_wakeup:1; +#ifdef CONFIG_PSI + unsigned sched_psi_wake_requeue:1; +#endif + /* Force alignment to the next boundary: */ unsigned :0; @@ -965,6 +970,10 @@ struct task_struct { kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; +#ifdef CONFIG_PSI + /* Pressure stall state */ + unsigned int psi_flags; +#endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; @@ -1391,6 +1400,7 @@ extern struct pid *cad_pid; #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ +#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ diff --git a/init/Kconfig b/init/Kconfig index 317d5ccb5191..26e639df5517 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -490,6 +490,21 @@ config TASK_IO_ACCOUNTING Say N if unsure. +config PSI + bool "Pressure stall information tracking" + help + Collect metrics that indicate how overcommitted the CPU, memory, + and IO capacity are in the system. + + If you say Y here, the kernel will create /proc/pressure/ with the + pressure statistics files cpu, memory, and io. These will indicate + the share of walltime in which some or all tasks in the system are + delayed due to contention of the respective resource. + + For more details see Documentation/accounting/psi.txt. + + Say N if unsure. 
+ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/fork.c b/kernel/fork.c index 3c719fec46c5..8f82a3bdcb8f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1822,6 +1822,10 @@ static __latent_entropy struct task_struct *copy_process( p->default_timer_slack_ns = current->timer_slack_ns; +#ifdef CONFIG_PSI + p->psi_flags = 0; +#endif + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c38..21fb5a5662b5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_PSI) += psi.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3efef387797..fd2fce8a001b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -722,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) + if (!(flags & ENQUEUE_RESTORE)) { sched_info_queued(rq, p); + psi_enqueue(p, flags & ENQUEUE_WAKEUP); + } p->sched_class->enqueue_task(rq, p, flags); } @@ -733,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) + if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeued(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); + } p->sched_class->dequeue_task(rq, p, flags); } @@ -2037,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } @@ -3051,6 +3056,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); + psi_task_tick(rq); rq_unlock(rq, &rf); @@ -6067,6 +6073,8 @@ void __init sched_init(void) init_schedstats(); + psi_init(); + scheduler_running = 1; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c new file mode 100644 index 000000000000..595414599b98 --- /dev/null +++ b/kernel/sched/psi.c @@ -0,0 +1,657 @@ +/* + * Pressure stall information for CPU, memory and IO + * + * Copyright (c) 2018 Facebook, Inc. + * Author: Johannes Weiner + * + * When CPU, memory and IO are contended, tasks experience delays that + * reduce throughput and introduce latencies into the workload. Memory + * and IO contention, in addition, can cause a full loss of forward + * progress in which the CPU goes idle. + * + * This code aggregates individual task delays into resource pressure + * metrics that indicate problems with both workload health and + * resource utilization. + * + * Model + * + * The time in which a task can execute on a CPU is our baseline for + * productivity. Pressure expresses the amount of time in which this + * potential cannot be realized due to resource contention. + * + * This concept of productivity has two components: the workload and + * the CPU. To measure the impact of pressure on both, we define two + * contention states for a resource: SOME and FULL. + * + * In the SOME state of a given resource, one or more tasks are + * delayed on that resource. This affects the workload's ability to + * perform work, but the CPU may still be executing other tasks. 
+ * + * In the FULL state of a given resource, all non-idle tasks are + * delayed on that resource such that nobody is advancing and the CPU + * goes idle. This leaves both workload and CPU unproductive. + * + * (Naturally, the FULL state doesn't exist for the CPU resource.) + * + * SOME = nr_delayed_tasks != 0 + * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * + * The percentage of wallclock time spent in those compound stall + * states gives pressure numbers between 0 and 100 for each resource, + * where the SOME percentage indicates workload slowdowns and the FULL + * percentage indicates reduced CPU utilization: + * + * %SOME = time(SOME) / period + * %FULL = time(FULL) / period + * + * Multiple CPUs + * + * The more tasks and available CPUs there are, the more work can be + * performed concurrently. This means that the potential that can go + * unrealized due to resource contention *also* scales with non-idle + * tasks and CPUs. + * + * Consider a scenario where 257 number crunching tasks are trying to + * run concurrently on 256 CPUs. If we simply aggregated the task + * states, we would have to conclude a CPU SOME pressure number of + * 100%, since *somebody* is waiting on a runqueue at all + * times. However, that is clearly not the amount of contention the + * workload is experiencing: only one out of 256 possible execution + * threads will be contended at any given time, or about 0.4%. + * + * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any + * given time *one* of the tasks is delayed due to a lack of memory. + * Again, looking purely at the task state would yield a memory FULL + * pressure number of 0%, since *somebody* is always making forward + * progress. But again this wouldn't capture the amount of execution + * potential lost, which is 1 out of 4 CPUs, or 25%. + * + * To calculate wasted potential (pressure) with multiple processors, + * we have to base our calculation on the number of non-idle tasks in + * conjunction with the number of available CPUs, which is the number + * of potential execution threads. SOME becomes then the proportion of + * delayed tasks to possible threads, and FULL is the share of possible + * threads that are unproductive due to delays: + * + * threads = min(nr_nonidle_tasks, nr_cpus) + * SOME = min(nr_delayed_tasks / threads, 1) + * FULL = (threads - min(nr_running_tasks, threads)) / threads + * + * For the 257 number crunchers on 256 CPUs, this yields: + * + * threads = min(257, 256) + * SOME = min(1 / 256, 1) = 0.4% + * FULL = (256 - min(257, 256)) / 256 = 0% + * + * For the 1 out of 4 memory-delayed tasks, this yields: + * + * threads = min(4, 4) + * SOME = min(1 / 4, 1) = 25% + * FULL = (4 - min(3, 4)) / 4 = 25% + * + * [ Substitute nr_cpus with 1, and you can see that it's a natural + * extension of the single-CPU model. ] + * + * Implementation + * + * To assess the precise time spent in each such state, we would have + * to freeze the system on task changes and start/stop the state + * clocks accordingly. Obviously that doesn't scale in practice. + * + * Because the scheduler aims to distribute the compute load evenly + * among the available CPUs, we can track task state locally to each + * CPU and, at much lower frequency, extrapolate the global state for + * the cumulative stall times and the running averages. 
+ * + * For each runqueue, we track: + * + * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) + * + * and then periodically aggregate: + * + * tNONIDLE = sum(tNONIDLE[i]) + * + * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE + * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE + * + * %SOME = tSOME / period + * %FULL = tFULL / period + * + * This gives us an approximation of pressure that is practical + * cost-wise, yet way more sensitive and accurate than periodic + * sampling of the aggregate task states would be. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sched.h" + +static int psi_bug __read_mostly; + +bool psi_disabled __read_mostly; +core_param(psi_disabled, psi_disabled, bool, 0644); + +/* Running averages - we need to be higher-res than loadavg */ +#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ +#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ +#define EXP_60s 1981 /* 1/exp(2s/60s) */ +#define EXP_300s 2034 /* 1/exp(2s/300s) */ + +/* Sampling frequency in nanoseconds */ +static u64 psi_period __read_mostly; + +/* System-level pressure and stall tracking */ +static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); +static struct psi_group psi_system = { + .pcpu = &system_group_pcpu, +}; + +static void psi_update_work(struct work_struct *work); + +static void group_init(struct psi_group *group) +{ + int cpu; + + for_each_possible_cpu(cpu) + seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + group->next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->clock_work, psi_update_work); + mutex_init(&group->stat_lock); +} + +void __init psi_init(void) +{ + if (psi_disabled) + return; + + psi_period = jiffies_to_nsecs(PSI_FREQ); + group_init(&psi_system); +} + +static bool test_state(unsigned int *tasks, enum psi_states state) +{ + switch (state) { + case PSI_IO_SOME: + return tasks[NR_IOWAIT]; + case PSI_IO_FULL: + return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + case PSI_MEM_SOME: + return tasks[NR_MEMSTALL]; + case PSI_MEM_FULL: + return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + case PSI_CPU_SOME: + return tasks[NR_RUNNING] > 1; + case PSI_NONIDLE: + return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || + tasks[NR_RUNNING]; + default: + return false; + } +} + +static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +{ + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + u64 now, state_start; + unsigned int seq; + int s; + + /* Snapshot a coherent view of the CPU state */ + do { + seq = read_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); + memcpy(times, groupc->times, sizeof(groupc->times)); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_start = groupc->state_start; + } while (read_seqcount_retry(&groupc->seq, seq)); + + /* Calculate state time deltas against the previous snapshot */ + for (s = 0; s < NR_PSI_STATES; s++) { + u32 delta; + /* + * In addition to already concluded states, we also + * incorporate currently active states on the CPU, + * since states may last for many sampling periods. + * + * This way we keep our delta sampling buckets small + * (u32) and our reported pressure close to what's + * actually happening. 
+ */ + if (test_state(tasks, s)) + times[s] += now - state_start; + + delta = times[s] - groupc->times_prev[s]; + groupc->times_prev[s] = times[s]; + + times[s] = delta; + } +} + +static void calc_avgs(unsigned long avg[3], int missed_periods, + u64 time, u64 period) +{ + unsigned long pct; + + /* Fill in zeroes for periods of no activity */ + if (missed_periods) { + avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); + avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); + avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); + } + + /* Sample the most recent active period */ + pct = div_u64(time * 100, period); + pct *= FIXED_1; + avg[0] = calc_load(avg[0], EXP_10s, pct); + avg[1] = calc_load(avg[1], EXP_60s, pct); + avg[2] = calc_load(avg[2], EXP_300s, pct); +} + +static bool update_stats(struct psi_group *group) +{ + u64 deltas[NR_PSI_STATES - 1] = { 0, }; + unsigned long missed_periods = 0; + unsigned long nonidle_total = 0; + u64 now, expires, period; + int cpu; + int s; + + mutex_lock(&group->stat_lock); + + /* + * Collect the per-cpu time buckets and average them into a + * single time sample that is normalized to wallclock time. + * + * For averaging, each CPU is weighted by its non-idle time in + * the sampling period. This eliminates artifacts from uneven + * loading, or even entirely idle CPUs. + */ + for_each_possible_cpu(cpu) { + u32 times[NR_PSI_STATES]; + u32 nonidle; + + get_recent_times(group, cpu, times); + + nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); + nonidle_total += nonidle; + + for (s = 0; s < PSI_NONIDLE; s++) + deltas[s] += (u64)times[s] * nonidle; + } + + /* + * Integrate the sample into the running statistics that are + * reported to userspace: the cumulative stall times and the + * decaying averages. + * + * Pressure percentages are sampled at PSI_FREQ. We might be + * called more often when the user polls more frequently than + * that; we might be called less often when there is no task + * activity, thus no data, and clock ticks are sporadic. The + * below handles both. + */ + + /* total= */ + for (s = 0; s < NR_PSI_STATES - 1; s++) + group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + + /* avgX= */ + now = sched_clock(); + expires = group->next_update; + if (now < expires) + goto out; + if (now - expires > psi_period) + missed_periods = div_u64(now - expires, psi_period); + + /* + * The periodic clock tick can get delayed for various + * reasons, especially on loaded systems. To avoid clock + * drift, we schedule the clock in fixed psi_period intervals. + * But the deltas we sample out of the per-cpu buckets above + * are based on the actual time elapsing between clock ticks. + */ + group->next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->last_update + (missed_periods * psi_period)); + group->last_update = now; + + for (s = 0; s < NR_PSI_STATES - 1; s++) { + u32 sample; + + sample = group->total[s] - group->total_prev[s]; + /* + * Due to the lockless sampling of the time buckets, + * recorded time deltas can slip into the next period, + * which under full pressure can result in samples in + * excess of the period length. + * + * We don't want to report non-sensical pressures in + * excess of 100%, nor do we want to drop such events + * on the floor. Instead we punt any overage into the + * future until pressure subsides. By doing this we + * don't underreport the occurring pressure curve, we + * just report it delayed by one period length. + * + * The error isn't cumulative. 
As soon as another + * delta slips from a period P to P+1, by definition + * it frees up its time T in P. + */ + if (sample > period) + sample = period; + group->total_prev[s] += sample; + calc_avgs(group->avg[s], missed_periods, sample, period); + } +out: + mutex_unlock(&group->stat_lock); + return nonidle_total; +} + +static void psi_update_work(struct work_struct *work) +{ + struct delayed_work *dwork; + struct psi_group *group; + bool nonidle; + + dwork = to_delayed_work(work); + group = container_of(dwork, struct psi_group, clock_work); + + /* + * If there is task activity, periodically fold the per-cpu + * times and feed samples into the running averages. If things + * are idle and there is no data to process, stop the clock. + * Once restarted, we'll catch up the running averages in one + * go - see calc_avgs() and missed_periods. + */ + + nonidle = update_stats(group); + + if (nonidle) { + unsigned long delay = 0; + u64 now; + + now = sched_clock(); + if (group->next_update > now) + delay = nsecs_to_jiffies(group->next_update - now) + 1; + schedule_delayed_work(dwork, delay); + } +} + +static void record_times(struct psi_group_cpu *groupc, int cpu, + bool memstall_tick) +{ + u32 delta; + u64 now; + + now = cpu_clock(cpu); + delta = now - groupc->state_start; + groupc->state_start = now; + + if (test_state(groupc->tasks, PSI_IO_SOME)) { + groupc->times[PSI_IO_SOME] += delta; + if (test_state(groupc->tasks, PSI_IO_FULL)) + groupc->times[PSI_IO_FULL] += delta; + } + + if (test_state(groupc->tasks, PSI_MEM_SOME)) { + groupc->times[PSI_MEM_SOME] += delta; + if (test_state(groupc->tasks, PSI_MEM_FULL)) + groupc->times[PSI_MEM_FULL] += delta; + else if (memstall_tick) { + u32 sample; + /* + * Since we care about lost potential, a + * memstall is FULL when there are no other + * working tasks, but also when the CPU is + * actively reclaiming and nothing productive + * could run even if it were runnable. + * + * When the timer tick sees a reclaiming CPU, + * regardless of runnable tasks, sample a FULL + * tick (or less if it hasn't been a full tick + * since the last state change). + */ + sample = min(delta, (u32)jiffies_to_nsecs(1)); + groupc->times[PSI_MEM_FULL] += sample; + } + } + + if (test_state(groupc->tasks, PSI_CPU_SOME)) + groupc->times[PSI_CPU_SOME] += delta; + + if (test_state(groupc->tasks, PSI_NONIDLE)) + groupc->times[PSI_NONIDLE] += delta; +} + +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) +{ + struct psi_group_cpu *groupc; + unsigned int t, m; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + /* + * First we assess the aggregate resource states this CPU's + * tasks have been in since the last change, and account any + * SOME and FULL time these may have resulted in. + * + * Then we update the task counts according to the state + * change requested through the @clear and @set bits. + */ + write_seqcount_begin(&groupc->seq); + + record_times(groupc, cpu, false); + + for (t = 0, m = clear; m; m &= ~(1 << t), t++) { + if (!(m & (1 << t))) + continue; + if (groupc->tasks[t] == 0 && !psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! 
cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + cpu, t, groupc->tasks[0], + groupc->tasks[1], groupc->tasks[2], + clear, set); + psi_bug = 1; + } + groupc->tasks[t]--; + } + + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + groupc->tasks[t]++; + + write_seqcount_end(&groupc->seq); + + if (!delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + + if (!task->pid) + return; + + if (((task->psi_flags & set) || + (task->psi_flags & clear) != clear) && + !psi_bug) { + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, cpu, + task->psi_flags, clear, set); + psi_bug = 1; + } + + task->psi_flags &= ~clear; + task->psi_flags |= set; + + psi_group_change(&psi_system, cpu, clear, set); +} + +void psi_memstall_tick(struct task_struct *task, int cpu) +{ + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(psi_system.pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); +} + +/** + * psi_memstall_enter - mark the beginning of a memory stall section + * @flags: flags to handle nested sections + * + * Marks the calling task as being stalled due to a lack of memory, + * such as waiting for a refault or performing reclaim. + */ +void psi_memstall_enter(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + *flags = current->flags & PF_MEMSTALL; + if (*flags) + return; + /* + * PF_MEMSTALL setting & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we can + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags |= PF_MEMSTALL; + psi_task_change(current, 0, TSK_MEMSTALL); + + rq_unlock_irq(rq, &rf); +} + +/** + * psi_memstall_leave - mark the end of an memory stall section + * @flags: flags to handle nested memdelay sections + * + * Marks the calling task as no longer stalled due to lack of memory. + */ +void psi_memstall_leave(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + if (*flags) + return; + /* + * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we could + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags &= ~PF_MEMSTALL; + psi_task_change(current, TSK_MEMSTALL, 0); + + rq_unlock_irq(rq, &rf); +} + +static int psi_show(struct seq_file *m, struct psi_group *group, + enum psi_res res) +{ + int full; + + if (psi_disabled) + return -EOPNOTSUPP; + + update_stats(group); + + for (full = 0; full < 2 - (res == PSI_CPU); full++) { + unsigned long avg[3]; + u64 total; + int w; + + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + full ? 
"full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + + return 0; +} + +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + +static const struct file_operations psi_io_fops = { + .open = psi_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_memory_fops = { + .open = psi_memory_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_cpu_fops = { + .open = psi_cpu_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init psi_proc_init(void) +{ + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + return 0; +} +module_init(psi_proc_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1de189bb9209..618577fc9aa8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -319,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED #include +#include struct cfs_rq; struct rt_rq; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8aea199a39b4..4904c4677000 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (psi_disabled) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->flags & PF_MEMSTALL) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (psi_disabled) + return; + + if (!sleep) { + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (psi_disabled) + return; + /* + * Is the task being migrated during a wakeup? 
Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (psi_disabled) + return; + + if (unlikely(rq->curr->flags & PF_MEMSTALL)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { diff --git a/mm/compaction.c b/mm/compaction.c index faca45ebe62d..7c607479de4a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -2068,11 +2069,15 @@ static int kcompactd(void *p) pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { + unsigned long pflags; + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); wait_event_freezable(pgdat->kcompactd_wait, kcompactd_work_requested(pgdat)); + psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); } return 0; diff --git a/mm/filemap.c b/mm/filemap.c index 01a841f17bf4..41586009fa42 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -1075,11 +1076,14 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; + unsigned long pflags; int ret = 0; - if (bit_nr == PG_locked && !PageSwapBacked(page) && + if (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) { - delayacct_thrashing_start(); + if (!PageSwapBacked(page)) + delayacct_thrashing_start(); + psi_memstall_enter(&pflags); thrashing = true; } @@ -1121,8 +1125,11 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, finish_wait(q, wait); - if (thrashing) - delayacct_thrashing_end(); + if (thrashing) { + if (!PageSwapBacked(page)) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } /* * A signal could leave PageWaiters set. 
Clearing it here if diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20f25d06c00c..f97b5a1700a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -3549,15 +3550,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned long pflags; unsigned int noreclaim_flag; if (!order) return NULL; + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3756,11 +3762,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct reclaim_state reclaim_state; int progress; unsigned int noreclaim_flag; + unsigned long pflags; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; @@ -3772,6 +3780,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); + psi_memstall_leave(&pflags); cond_resched(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 87e9fef341d2..8ea87586925e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -3305,6 +3306,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long pflags; int nid; unsigned int noreclaim_flag; struct scan_control sc = { @@ -3333,9 +3335,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3500,6 +3506,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long pflags; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -3510,6 +3517,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_swap = 1, }; + psi_memstall_enter(&pflags); __fs_reclaim_acquire(); count_vm_event(PAGEOUTRUN); @@ -3611,6 +3619,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); + psi_memstall_leave(&pflags); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller From 2ce7135adc9ad081aa3c49744144376ac74fea60 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:31 -0700 Subject: [PATCH 063/132] psi: cgroup support On a system that executes multiple cgrouped jobs and independent workloads, we don't just care about the health of the overall system, but also that of individual jobs, so that we can ensure individual job health, fairness between jobs, or prioritize some jobs over others. This patch implements pressure stall tracking for cgroups. 
In kernels with CONFIG_PSI=y, cgroup2 groups will have cpu.pressure, memory.pressure, and io.pressure files that track aggregate pressure stall times for only the tasks inside the cgroup. Link: http://lkml.kernel.org/r/20180828172258.3185-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/psi.txt | 9 ++ Documentation/admin-guide/cgroup-v2.rst | 18 ++++ include/linux/cgroup-defs.h | 4 + include/linux/cgroup.h | 15 +++ include/linux/psi.h | 25 +++++ init/Kconfig | 4 + kernel/cgroup/cgroup.c | 45 ++++++++- kernel/sched/psi.c | 118 ++++++++++++++++++++++-- 8 files changed, 228 insertions(+), 10 deletions(-) diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt index 3753a82f1cf5..b8ca28b60215 100644 --- a/Documentation/accounting/psi.txt +++ b/Documentation/accounting/psi.txt @@ -62,3 +62,12 @@ well as medium and long term trends. The total absolute stall time is tracked and exported as well, to allow detection of latency spikes which wouldn't necessarily make a dent in the time averages, or to average trends over custom time frames. + +Cgroup2 interface +================= + +In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem +mounted, pressure stall information is also tracked for tasks grouped +into cgroups. Each subdirectory in the cgroupfs mountpoint contains +cpu.pressure, memory.pressure, and io.pressure files; the format is +the same as the /proc/pressure/ files. diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index caf36105a1c7..8389d6f72a77 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -966,6 +966,12 @@ All time durations are in microseconds. $PERIOD duration. "max" for $MAX indicates no limit. If only one number is written, $MAX is updated. + cpu.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for CPU. See + Documentation/accounting/psi.txt for details. + Memory ------ @@ -1271,6 +1277,12 @@ PAGE_SIZE multiple when read back. higher than the limit for an extended period of time. This reduces the impact on the workload and memory management. + memory.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for memory. See + Documentation/accounting/psi.txt for details. + Usage Guidelines ~~~~~~~~~~~~~~~~ @@ -1408,6 +1420,12 @@ IO Interface Files 8:16 rbps=2097152 wbps=max riops=max wiops=max + io.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for IO. See + Documentation/accounting/psi.txt for details. 
+ Writeback ~~~~~~~~~ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 22254c1fe1c5..5e1694fe035b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -436,6 +437,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + /* used to track pressure stalls */ + struct psi_group psi; + /* used to store eBPF programs */ struct cgroup_bpf bpf; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b622d6608605..9968332cceed 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return &cgrp->psi; +} + static inline void cgroup_init_kthreadd(void) { /* @@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) return NULL; } +static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + return NULL; +} + +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return NULL; +} + static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { diff --git a/include/linux/psi.h b/include/linux/psi.h index b0daf050de58..8e0725aac0aa 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -4,6 +4,9 @@ #include #include +struct seq_file; +struct css_set; + #ifdef CONFIG_PSI extern bool psi_disabled; @@ -16,6 +19,14 @@ void psi_memstall_tick(struct task_struct *task, int cpu); void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); +int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgrp); +void psi_cgroup_free(struct cgroup *cgrp); +void cgroup_move_task(struct task_struct *p, struct css_set *to); +#endif + #else /* CONFIG_PSI */ static inline void psi_init(void) {} @@ -23,6 +34,20 @@ static inline void psi_init(void) {} static inline void psi_memstall_enter(unsigned long *flags) {} static inline void psi_memstall_leave(unsigned long *flags) {} +#ifdef CONFIG_CGROUPS +static inline int psi_cgroup_alloc(struct cgroup *cgrp) +{ + return 0; +} +static inline void psi_cgroup_free(struct cgroup *cgrp) +{ +} +static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) +{ + rcu_assign_pointer(p->cgroups, to); +} +#endif + #endif /* CONFIG_PSI */ #endif /* _LINUX_PSI_H */ diff --git a/init/Kconfig b/init/Kconfig index 26e639df5517..a4112e95724a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -501,6 +501,10 @@ config PSI the share of walltime in which some or all tasks in the system are delayed due to contention of the respective resource. + In kernels with cgroup support, cgroups (cgroup2 only) will + have cpu.pressure, memory.pressure, and io.pressure files, + which aggregate pressure stalls for the grouped tasks only. + For more details see Documentation/accounting/psi.txt. Say N if unsure. 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4c1cf0969a80..8b79318810ad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - rcu_assign_pointer(task->cgroups, to_cset); + cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } @@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) return ret; } +#ifdef CONFIG_PSI +static int cgroup_io_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); +} +static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); +} +static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); +} +#endif + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, +#ifdef CONFIG_PSI + { + .name = "io.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_io_pressure_show, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_memory_pressure_show, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_cpu_pressure_show, + }, +#endif { } /* terminate */ }; @@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + psi_cgroup_free(cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_exit(cgrp); kfree(cgrp); @@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; - ret = cgroup_bpf_inherit(cgrp); + + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_idr_free; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_psi_free: + psi_cgroup_free(cgrp); out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 595414599b98..7cdecfc010af 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -473,9 +473,35 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->clock_work, PSI_FREQ); } +static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +{ +#ifdef CONFIG_CGROUPS + struct cgroup *cgroup = NULL; + + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else if (*iter == &psi_system) + return NULL; + else + cgroup = cgroup_parent(*iter); + + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } +#else + if (*iter) + return NULL; +#endif + *iter = &psi_system; + return &psi_system; +} + void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); + struct psi_group *group; + void *iter = NULL; if (!task->pid) return; @@ -492,17 +518,23 @@ void psi_task_change(struct task_struct *task, int clear, int set) task->psi_flags &= ~clear; 
task->psi_flags |= set; - psi_group_change(&psi_system, cpu, clear, set); + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set); } void psi_memstall_tick(struct task_struct *task, int cpu) { - struct psi_group_cpu *groupc; + struct psi_group *group; + void *iter = NULL; - groupc = per_cpu_ptr(psi_system.pcpu, cpu); - write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, true); - write_seqcount_end(&groupc->seq); + while ((group = iterate_groups(task, &iter))) { + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); + } } /** @@ -565,8 +597,78 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } -static int psi_show(struct seq_file *m, struct psi_group *group, - enum psi_res res) +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgroup) +{ + if (psi_disabled) + return 0; + + cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi.pcpu) + return -ENOMEM; + group_init(&cgroup->psi); + return 0; +} + +void psi_cgroup_free(struct cgroup *cgroup) +{ + if (psi_disabled) + return; + + cancel_delayed_work_sync(&cgroup->psi.clock_work); + free_percpu(cgroup->psi.pcpu); +} + +/** + * cgroup_move_task - move task to a different cgroup + * @task: the task + * @to: the target css_set + * + * Move task to a new cgroup and safely migrate its associated stall + * state between the different groups. + * + * This function acquires the task's rq lock to lock out concurrent + * changes to the task's scheduling state and - in case the task is + * running - concurrent changes to its stall state. + */ +void cgroup_move_task(struct task_struct *task, struct css_set *to) +{ + bool move_psi = !psi_disabled; + unsigned int task_flags = 0; + struct rq_flags rf; + struct rq *rq; + + if (move_psi) { + rq = task_rq_lock(task, &rf); + + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; + + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; + + if (task_flags) + psi_task_change(task, task_flags, 0); + } + + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + + if (move_psi) { + if (task_flags) + psi_task_change(task, 0, task_flags); + + task_rq_unlock(rq, task, &rf); + } +} +#endif /* CONFIG_CGROUPS */ + +int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; From 505802a53510e54ad5fbbd655a68893df83bfb91 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:35 -0700 Subject: [PATCH 064/132] mm: workingset: use cheaper __inc_lruvec_state in irqsafe node reclaim No need to use the preemption-safe lruvec state function inside the reclaim region that has irqs disabled. 
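For context, a rough sketch of the convention this relies on (editorial illustration, shown as a two-line fragment rather than a complete function; the two accessor calls are the ones exchanged in the diff below): the double-underscore accessor assumes the caller already runs with interrupts disabled, as the node reclaim path here does, while the plain accessor is safe without that guarantee but pays for the extra protection itself.

	/* inside the irq-disabled reclaim region: the cheap variant is sufficient */
	__inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);

	/* from a context with no such guarantee, the plain variant must be used */
	inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);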
Link: http://lkml.kernel.org/r/20181009184732.762-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/workingset.c b/mm/workingset.c index 99b7f7c09b13..5a72c9d5e195 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -500,7 +500,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; - inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->i_pages, node, workingset_lookup_update(mapping)); From 68d48e6a2df575b935edd420396c3cb8b6aa6ad3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:39 -0700 Subject: [PATCH 065/132] mm: workingset: add vmstat counter for shadow nodes Make it easier to catch bugs in the shadow node shrinker by adding a counter for the shadow nodes in circulation. [akpm@linux-foundation.org: assert that irqs are disabled, for __inc_lruvec_page_state()] [akpm@linux-foundation.org: s/WARN_ON_ONCE/VM_WARN_ON_ONCE/, per Johannes] Link: http://lkml.kernel.org/r/20181009184732.762-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/vmstat.c | 1 + mm/workingset.c | 14 ++++++++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ba51d5bf7af1..9f0caccd5833 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -161,6 +161,7 @@ enum node_stat_item { NR_SLAB_UNRECLAIMABLE, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ + WORKINGSET_NODES, WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_RESTORE, diff --git a/mm/vmstat.c b/mm/vmstat.c index d918f6192d15..dab53430f63c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1143,6 +1143,7 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", + "workingset_nodes", "workingset_refault", "workingset_activate", "workingset_restore", diff --git a/mm/workingset.c b/mm/workingset.c index 5a72c9d5e195..7e6ef312cea5 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -377,12 +377,20 @@ void workingset_update_node(struct radix_tree_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. 
*/ + VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + if (node->count && node->count == node->exceptional) { - if (list_empty(&node->private_list)) + if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); + __inc_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } else { - if (!list_empty(&node->private_list)) + if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); + __dec_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } } @@ -473,6 +481,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); + __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + spin_unlock(lru_lock); /* From 4b85afbdacd290c7a22c96df40a6433fdcacb509 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:42 -0700 Subject: [PATCH 066/132] mm: zero-seek shrinkers The page cache and most shrinkable slab caches hold data that has been read from disk, but there are some caches that only cache CPU work, such as the dentry and inode caches of procfs and sysfs, as well as the subset of radix tree nodes that track non-resident page cache. Currently, all these are shrunk at the same rate: using DEFAULT_SEEKS for the shrinker's seeks setting tells the reclaim algorithm that for every two page cache pages scanned it should scan one slab object. This is a bogus setting. A virtual inode that required no IO to create is not twice as valuable as a page cache page; shadow cache entries with eviction distances beyond the size of memory aren't either. In most cases, the behavior in practice is still fine. Such virtual caches don't tend to grow and assert themselves aggressively, and usually get picked up before they cause problems. But there are scenarios where that's not true. Our database workloads suffer from two of those. For one, their file workingset is several times bigger than available memory, which has the kernel aggressively create shadow page cache entries for the non-resident parts of it. The workingset code does tell the VM that most of these are expendable, but the VM ends up balancing them 2:1 to cache pages as per the seeks setting. This is a huge waste of memory. These workloads also deal with tens of thousands of open files and use /proc for introspection, which ends up growing the proc_inode_cache to absurdly large sizes - again at the cost of valuable cache space, which isn't a reasonable trade-off, given that proc inodes can be re-created without involving the disk. This patch implements a "zero-seek" setting for shrinkers that results in a target ratio of 0:1 between their objects and IO-backed caches. This allows such virtual caches to grow when memory is available (they do cache/avoid CPU work after all), but effectively disables them as soon as IO-backed objects are under pressure. It then switches the shrinkers for procfs and sysfs metadata, as well as excess page cache shadow nodes, to the new zero-seek setting. 
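As an illustration of how another user would opt in (a hypothetical examplefs, not part of this series), any filesystem whose dentries and inodes are pure CPU work can request the same 0:1 ratio by zeroing the seeks value of its superblock shrinker, exactly as the kernfs and procfs hunks below do:

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* ... usual superblock setup ... */

		/* examplefs dentries and inodes don't require IO to create */
		sb->s_shrink.seeks = 0;

		return 0;
	}
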
Link: http://lkml.kernel.org/r/20181009184732.762-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Domas Mituzas Reviewed-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/kernfs/mount.c | 3 +++ fs/proc/inode.c | 3 +++ mm/vmscan.c | 15 ++++++++++++--- mm/workingset.c | 2 +- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index ff2716f9322e..fdf527b6d79c 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -236,6 +236,9 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; + /* sysfs dentries and inodes don't require IO to create */ + sb->s_shrink.seeks = 0; + /* get root inode, initialize and unlock it */ mutex_lock(&kernfs_mutex); inode = kernfs_get_inode(sb, info->root->kn); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fc5306a31a1d..5792f9e39466 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -516,6 +516,9 @@ int proc_fill_super(struct super_block *s, void *data, int silent) */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ea87586925e..28c9ae5633b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -474,9 +474,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. + */ + delta = freeable / 2; + } /* * Make sure we apply some minimal pressure on default priority diff --git a/mm/workingset.c b/mm/workingset.c index 7e6ef312cea5..cbc13d4dfa79 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -534,7 +534,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, - .seeks = DEFAULT_SEEKS, + .seeks = 0, /* ->count reports only fully expendable nodes */ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; From e9b257ed150c1f43912bd66031185598451f68a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:45 -0700 Subject: [PATCH 067/132] mm/memcontrol.c: fix memory.stat item ordering The refault stats go better with the page fault stats, and are of higher interest than the stats on LRU operations. In fact they used to be grouped together; when the LRU operation stats were added later on, they were wedged in between. Move them back together. Documentation/admin-guide/cgroup-v2.rst already lists them in the right order. 
Link: http://lkml.kernel.org/r/20181010140239.GA2527@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Rik van Riel Cc: Michal Hocko Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fcec9b39e2a3..0e9ede617b89 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5597,6 +5597,13 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); + seq_printf(m, "workingset_refault %lu\n", + acc.stat[WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + acc.stat[WORKINGSET_ACTIVATE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + acc.stat[WORKINGSET_NODERECLAIM]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + acc.events[PGSCAN_DIRECT]); @@ -5607,13 +5614,6 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); - seq_printf(m, "workingset_refault %lu\n", - acc.stat[WORKINGSET_REFAULT]); - seq_printf(m, "workingset_activate %lu\n", - acc.stat[WORKINGSET_ACTIVATE]); - seq_printf(m, "workingset_nodereclaim %lu\n", - acc.stat[WORKINGSET_NODERECLAIM]); - return 0; } From 2c029a1ea3aac296cd5b47584a579defcc4b4aa0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2018 15:06:49 -0700 Subject: [PATCH 068/132] mm, page_alloc: drop should_suppress_show_mem should_suppress_show_mem() was introduced to reduce the overhead of show_mem on large NUMA systems. Things have changed since then though. Namely c78e93630d15 ("mm: do not walk all of system memory during show_mem") has reduced the overhead considerably. Moreover warn_alloc_show_mem clears SHOW_MEM_FILTER_NODES when called from the IRQ context already so we are not printing per node stats. Remove should_suppress_show_mem because we are losing potentially interesting information about allocation failures. We have seen a bug report where system gets unresponsive under memory pressure and there is only kernel: [2032243.696888] qlge 0000:8b:00.1 ql1: Could not get a page chunk, i=8, clean_idx =200 . kernel: [2032243.710725] swapper/7: page allocation failure: order:1, mode:0x1084120(GFP_ATOMIC|__GFP_COLD|__GFP_COMP) without an additional information for debugging. It would be great to see the state of the page allocator at the moment. Link: http://lkml.kernel.org/r/20180907114334.7088-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f97b5a1700a4..eb6c50cc8880 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3367,26 +3367,12 @@ try_this_zone: return NULL; } -/* - * Large machines with many possible nodes should not always dump per-node - * meminfo in irq context. 
- */ -static inline bool should_suppress_show_mem(void) -{ - bool ret = false; - -#if NODES_SHIFT > 8 - ret = in_interrupt(); -#endif - return ret; -} - static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) + if (!__ratelimit(&show_mem_rs)) return; /* From c3df29d13044d885695067fa0b1386824942557a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 26 Oct 2018 15:06:53 -0700 Subject: [PATCH 069/132] mm/swap.c: remove duplicated include Remove duplicated include linux/memremap.h Link: http://lkml.kernel.org/r/20180917131308.16420-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 26fc9b5f1b6c..87a54c8dee34 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include From dedf2c73b80b4566dfcae8ebe9ed46a38b63a1f9 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 26 Oct 2018 15:06:57 -0700 Subject: [PATCH 070/132] mm/mempolicy.c: use match_string() helper to simplify the code match_string() returns the index of an array for a matching string, which can be used intead of open coded implementation. Link: http://lkml.kernel.org/r/1536988365-50310-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Andrey Ryabinin Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2e76a8f65e94..cfd26d7e61a1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2711,12 +2711,11 @@ static const char * const policy_modes[] = int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; - unsigned short mode; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); - int err = 1; + int err = 1, mode; if (nodelist) { /* NUL-terminate mode or flags string */ @@ -2731,12 +2730,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode < MPOL_MAX; mode++) { - if (!strcmp(str, policy_modes[mode])) { - break; - } - } - if (mode >= MPOL_MAX) + mode = match_string(policy_modes, MPOL_MAX, str); + if (mode < 0) goto out; switch (mode) { From 52414d3302577bb60e4ba3c21b7957da7c7a8d21 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 26 Oct 2018 15:07:00 -0700 Subject: [PATCH 071/132] kvfree(): fix misleading comment vfree() might sleep if called not in interrupt context. So does kvfree() too. Fix misleading kvfree()'s comment about allowed context. 
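A minimal sketch of the rule, for illustration only (example_lock, buf and size are hypothetical): kvfree() can end up in vfree(), which may sleep, so it must not be called with a spinlock held in task context:

	void *buf = kvmalloc(size, GFP_KERNEL);

	spin_lock(&example_lock);
	/* kvfree(buf) here would be wrong: it may sleep under a spinlock */
	spin_unlock(&example_lock);

	/* fine: preemptible task context, kvfree() is allowed to sleep */
	kvfree(buf);
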
Link: http://lkml.kernel.org/r/20180914130512.10394-1-aryabinin@virtuozzo.com Fixes: 04b8e946075d ("mm/util.c: improve kvfree() kerneldoc") Signed-off-by: Andrey Ryabinin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index f740754f5012..8bf08b5b5760 100644 --- a/mm/util.c +++ b/mm/util.c @@ -435,7 +435,7 @@ EXPORT_SYMBOL(kvmalloc_node); * It is slightly more efficient to use kfree() or vfree() if you are certain * that you know which one to use. * - * Context: Any context except NMI. + * Context: Either preemptible task context or not-NMI interrupt. */ void kvfree(const void *addr) { From 3ca4ea3a7a78a243ee9edf71a2736bc8fb26d70f Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 26 Oct 2018 15:07:03 -0700 Subject: [PATCH 072/132] mm/vmalloc.c: improve vfree() kerneldoc vfree() might sleep if called not in interrupt context. Explain that in the comment. Link: http://lkml.kernel.org/r/20180914130512.10394-2-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a728fc492557..d00d42d6bf79 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1577,6 +1577,8 @@ void vfree_atomic(const void *addr) * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-depenedent would be a really bad idea) * + * May sleep if called *not* from interrupt context. + * * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) From a8dda165ec34fac2b4119654330150e2c896e531 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 26 Oct 2018 15:07:07 -0700 Subject: [PATCH 073/132] vfree: add debug might_sleep() Add might_sleep() call to vfree() to catch potential sleep-in-atomic bugs earlier. [aryabinin@virtuozzo.com: drop might_sleep_if() from kvfree()] Link: http://lkml.kernel.org/r/7e19e4df-b1a6-29bd-9ae7-0266d50bef1d@virtuozzo.com Link: http://lkml.kernel.org/r/20180914130512.10394-3-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d00d42d6bf79..97d4b25d0373 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1587,6 +1587,8 @@ void vfree(const void *addr) kmemleak_free(addr); + might_sleep_if(!in_interrupt()); + if (!addr) return; if (unlikely(in_interrupt())) From dd2283f2605e3b3e9c61bcae844b34f2afa4813f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 26 Oct 2018 15:07:11 -0700 Subject: [PATCH 074/132] mm: mmap: zap pages with read mmap_sem in munmap Patch series "mm: zap pages with read mmap_sem in munmap for large mapping", v11. Background: Recently, when we ran some vm scalability tests on machines with large memory, we ran into a couple of mmap_sem scalability issues when unmapping large memory space, please refer to https://lkml.org/lkml/2017/12/14/733 and https://lkml.org/lkml/2018/2/20/576. History: Then akpm suggested to unmap large mapping section by section and drop mmap_sem at a time to mitigate it (see https://lkml.org/lkml/2018/3/6/784). V1 patch series was submitted to the mailing list per Andrew's suggestion (see https://lkml.org/lkml/2018/3/20/786). 
Then I received a lot great feedback and suggestions. Then this topic was discussed on LSFMM summit 2018. In the summit, Michal Hocko suggested (also in the v1 patches review) to try "two phases" approach. Zapping pages with read mmap_sem, then doing via cleanup with write mmap_sem (for discussion detail, see https://lwn.net/Articles/753269/) Approach: Zapping pages is the most time consuming part, according to the suggestion from Michal Hocko [1], zapping pages can be done with holding read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write mmap_sem to cleanup vmas. But, we can't call MADV_DONTNEED directly, since there are two major drawbacks: * The unexpected state from PF if it wins the race in the middle of munmap. It may return zero page, instead of the content or SIGSEGV. * Can't handle VM_LOCKED | VM_HUGETLB | VM_PFNMAP and uprobe mappings, which is a showstopper from akpm But, some part may need write mmap_sem, for example, vma splitting. So, the design is as follows: acquire write mmap_sem lookup vmas (find and split vmas) deal with special mappings detach vmas downgrade_write zap pages free page tables release mmap_sem The vm events with read mmap_sem may come in during page zapping, but since vmas have been detached before, they, i.e. page fault, gup, etc, will not be able to find valid vma, then just return SIGSEGV or -EFAULT as expected. If the vma has VM_HUGETLB | VM_PFNMAP, they are considered as special mappings. They will be handled by falling back to regular do_munmap() with exclusive mmap_sem held in this patch since they may update vm flags. But, with the "detach vmas first" approach, the vmas have been detached when vm flags are updated, so it sounds safe to update vm flags with read mmap_sem for this specific case. So, VM_HUGETLB and VM_PFNMAP will be handled by using the optimized path in the following separate patches for bisectable sake. Unmapping uprobe areas may need update mm flags (MMF_RECALC_UPROBES). However it is fine to have false-positive MMF_RECALC_UPROBES according to uprobes developer. So, uprobe unmap will not be handled by the regular path. With the "detach vmas first" approach we don't have to re-acquire mmap_sem again to clean up vmas to avoid race window which might get the address space changed since downgrade_write() doesn't release the lock to lead regression, which simply downgrades to read lock. And, since the lock acquire/release cost is managed to the minimum and almost as same as before, the optimization could be extended to any size of mapping without incurring significant penalty to small mappings. For the time being, just do this in munmap syscall path. Other vm_munmap() or do_munmap() call sites (i.e mmap, mremap, etc) remain intact due to some implementation difficulties since they acquire write mmap_sem from very beginning and hold it until the end, do_munmap() might be called in the middle. But, the optimized do_munmap would like to be called without mmap_sem held so that we can do the optimization. So, if we want to do the similar optimization for mmap/mremap path, I'm afraid we would have to redesign them. mremap might be called on very large area depending on the usecases, the optimization to it will be considered in the future. This patch (of 3): When running some mmap/munmap scalability tests with large memory (i.e. > 300GB), the below hung task issue may happen occasionally. INFO: task ps:14018 blocked for more than 120 seconds. 
Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ps D 0 14018 1 0x00000004 ffff885582f84000 ffff885e8682f000 ffff880972943000 ffff885ebf499bc0 ffff8828ee120000 ffffc900349bfca8 ffffffff817154d0 0000000000000040 00ffffff812f872a ffff885ebf499bc0 024000d000948300 ffff880972943000 Call Trace: [] ? __schedule+0x250/0x730 [] schedule+0x36/0x80 [] rwsem_down_read_failed+0xf0/0x150 [] call_rwsem_down_read_failed+0x18/0x30 [] down_read+0x20/0x40 [] proc_pid_cmdline_read+0xd9/0x4e0 [] ? do_filp_open+0xa5/0x100 [] __vfs_read+0x37/0x150 [] ? security_file_permission+0x9b/0xc0 [] vfs_read+0x96/0x130 [] SyS_read+0x55/0xc0 [] entry_SYSCALL_64_fastpath+0x1a/0xc5 It is because munmap holds mmap_sem exclusively from very beginning to all the way down to the end, and doesn't release it in the middle. When unmapping large mapping, it may take long time (take ~18 seconds to unmap 320GB mapping with every single page mapped on an idle machine). Zapping pages is the most time consuming part, according to the suggestion from Michal Hocko [1], zapping pages can be done with holding read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write mmap_sem to cleanup vmas. But, some part may need write mmap_sem, for example, vma splitting. So, the design is as follows: acquire write mmap_sem lookup vmas (find and split vmas) deal with special mappings detach vmas downgrade_write zap pages free page tables release mmap_sem The vm events with read mmap_sem may come in during page zapping, but since vmas have been detached before, they, i.e. page fault, gup, etc, will not be able to find valid vma, then just return SIGSEGV or -EFAULT as expected. If the vma has VM_HUGETLB | VM_PFNMAP, they are considered as special mappings. They will be handled by without downgrading mmap_sem in this patch since they may update vm flags. But, with the "detach vmas first" approach, the vmas have been detached when vm flags are updated, so it sounds safe to update vm flags with read mmap_sem for this specific case. So, VM_HUGETLB and VM_PFNMAP will be handled by using the optimized path in the following separate patches for bisectable sake. Unmapping uprobe areas may need update mm flags (MMF_RECALC_UPROBES). However it is fine to have false-positive MMF_RECALC_UPROBES according to uprobes developer. With the "detach vmas first" approach we don't have to re-acquire mmap_sem again to clean up vmas to avoid race window which might get the address space changed since downgrade_write() doesn't release the lock to lead regression, which simply downgrades to read lock. And, since the lock acquire/release cost is managed to the minimum and almost as same as before, the optimization could be extended to any size of mapping without incurring significant penalty to small mappings. For the time being, just do this in munmap syscall path. Other vm_munmap() or do_munmap() call sites (i.e mmap, mremap, etc) remain intact due to some implementation difficulties since they acquire write mmap_sem from very beginning and hold it until the end, do_munmap() might be called in the middle. But, the optimized do_munmap would like to be called without mmap_sem held so that we can do the optimization. So, if we want to do the similar optimization for mmap/mremap path, I'm afraid we would have to redesign them. mremap might be called on very large area depending on the usecases, the optimization to it will be considered in the future. 
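The locking flow described above can be sketched as follows (illustration only; the real code is in __do_munmap() in the hunks below):

	down_write(&mm->mmap_sem);
	/* find and split vmas, handle special mappings, detach vmas */
	arch_unmap(mm, vma, start, end);
	downgrade_write(&mm->mmap_sem);	/* exclusive -> shared, no unlock window */

	/* the expensive part now runs with only the read lock held */
	unmap_region(mm, vma, prev, start, end);
	remove_vma_list(mm, vma);

	up_read(&mm->mmap_sem);		/* done in the __vm_munmap() caller */
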
With the patches, exclusive mmap_sem hold time when munmap a 80GB address space on a machine with 32 cores of E5-2680 @ 2.70GHz dropped to us level from second. munmap_test-15002 [008] 594.380138: funcgraph_entry: | __vm_munmap() { munmap_test-15002 [008] 594.380146: funcgraph_entry: !2485684 us | unmap_region(); munmap_test-15002 [008] 596.865836: funcgraph_exit: !2485692 us | } Here the execution time of unmap_region() is used to evaluate the time of holding read mmap_sem, then the remaining time is used with holding exclusive lock. [1] https://lwn.net/Articles/753269/ Link: http://lkml.kernel.org/r/1537376621-51150-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: Michal Hocko Suggested-by: Kirill A. Shutemov Suggested-by: Matthew Wilcox Reviewed-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Laurent Dufour Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 59 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f7cd9cb966c0..330f12c17fa1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2687,8 +2687,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf) +static int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf, bool downgrade) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2770,25 +2770,47 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, mm->locked_vm -= vma_pages(tmp); munlock_vma_pages_all(tmp); } + + /* + * Unmapping vmas, which have VM_HUGETLB or VM_PFNMAP, + * need get done with write mmap_sem held since they may + * update vm_flags. + */ + if (downgrade && + (tmp->vm_flags & (VM_HUGETLB | VM_PFNMAP))) + downgrade = false; + tmp = tmp->vm_next; } } - /* - * Remove the vma's, and unmap the actual pages - */ + /* Detach vmas from rbtree */ detach_vmas_to_be_unmapped(mm, vma, prev, end); - unmap_region(mm, vma, prev, start, end); + /* + * mpx unmap needs to be called with mmap_sem held for write. + * It is safe to call it before unmap_region(). + */ arch_unmap(mm, vma, start, end); + if (downgrade) + downgrade_write(&mm->mmap_sem); + + unmap_region(mm, vma, prev, start, end); + /* Fix up all other VM information */ remove_vma_list(mm, vma); - return 0; + return downgrade ? 1 : 0; } -int vm_munmap(unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) +{ + return __do_munmap(mm, start, len, uf, false); +} + +static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; struct mm_struct *mm = current->mm; @@ -2797,17 +2819,32 @@ int vm_munmap(unsigned long start, size_t len) if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len, &uf); - up_write(&mm->mmap_sem); + ret = __do_munmap(mm, start, len, &uf, downgrade); + /* + * Returning 1 indicates mmap_sem is downgraded. + * But 1 is not legal return value of vm_munmap() and munmap(), reset + * it to 0 before return. 
+ */ + if (ret == 1) { + up_read(&mm->mmap_sem); + ret = 0; + } else + up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); return ret; } + +int vm_munmap(unsigned long start, size_t len) +{ + return __vm_munmap(start, len, false); +} EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { profile_munmap(addr); - return vm_munmap(addr, len); + return __vm_munmap(addr, len, true); } From b4cefb36051244bcb5651026d862c332a6cac7df Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 26 Oct 2018 15:07:15 -0700 Subject: [PATCH 075/132] mm: unmap VM_HUGETLB mappings with optimized path When unmapping VM_HUGETLB mappings, vm flags need to be updated. Since the vmas have been detached, so it sounds safe to update vm flags with read mmap_sem. Link: http://lkml.kernel.org/r/1537376621-51150-3-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 330f12c17fa1..ea3188bcc9b4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2777,7 +2777,7 @@ static int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * update vm_flags. */ if (downgrade && - (tmp->vm_flags & (VM_HUGETLB | VM_PFNMAP))) + (tmp->vm_flags & VM_PFNMAP)) downgrade = false; tmp = tmp->vm_next; From cb4922496ae40a775a1b17025eaa1060e8991253 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 26 Oct 2018 15:07:18 -0700 Subject: [PATCH 076/132] mm: unmap VM_PFNMAP mappings with optimized path When unmapping VM_PFNMAP mappings, vm flags need to be updated. Since the vmas have been detached, so it sounds safe to update vm flags with read mmap_sem. Link: http://lkml.kernel.org/r/1537376621-51150-4-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index ea3188bcc9b4..58e323c92c8e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2771,15 +2771,6 @@ static int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, munlock_vma_pages_all(tmp); } - /* - * Unmapping vmas, which have VM_HUGETLB or VM_PFNMAP, - * need get done with write mmap_sem held since they may - * update vm_flags. - */ - if (downgrade && - (tmp->vm_flags & VM_PFNMAP)) - downgrade = false; - tmp = tmp->vm_next; } } From 3cb7b121ff4dfbf202845b8ea27d9a437eae210f Mon Sep 17 00:00:00 2001 From: "haiqing.shq" Date: Fri, 26 Oct 2018 15:07:22 -0700 Subject: [PATCH 077/132] mm/filemap.c: Use existing variable Use the variable write_len instead of ov_iter_count(from). 
Link: http://lkml.kernel.org/r/1537375855-2088-1-git-send-email-leviathan0992@gmail.com Signed-off-by: haiqing.shq Reviewed-by: Andrew Morton Cc: Jan Kara Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 41586009fa42..0b8c6de6d995 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3027,7 +3027,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) { /* If there are pages to writeback, return */ if (filemap_range_has_page(inode->i_mapping, pos, - pos + iov_iter_count(from))) + pos + write_len)) return -EAGAIN; } else { written = filemap_write_and_wait_range(mapping, pos, From 83d83612d707c3709e030c745e3df8d4e17bbfa2 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 26 Oct 2018 15:07:25 -0700 Subject: [PATCH 078/132] mm/memory_hotplug.c: spare unnecessary calls to node_set_state In node_states_check_changes_online, we check if the node will have to be set for any of the N_*_MEMORY states after the pages have been onlined. Later on, we perform the activation in node_states_set_node. Currently, in node_states_set_node we set the node to N_MEMORY unconditionally. This means that we call node_set_state for N_MEMORY every time pages go online, but we only need to do it if the node has not yet been set for N_MEMORY. Fix this by checking status_change_nid. Link: http://lkml.kernel.org/r/20180919100819.25518-2-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Cc: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: Jonathan Cameron Cc: Cc: Mathieu Malaterre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 38d94b703e9d..63facfc57224 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -753,7 +753,8 @@ static void node_states_set_node(int node, struct memory_notify *arg) if (arg->status_change_nid_high >= 0) node_set_state(node, N_HIGH_MEMORY); - node_set_state(node, N_MEMORY); + if (arg->status_change_nid >= 0) + node_set_state(node, N_MEMORY); } static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, From cf01f6f5e398a74f00fa9ac490ec98c12e63e4b1 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 26 Oct 2018 15:07:28 -0700 Subject: [PATCH 079/132] mm/memory_hotplug.c: tidy up node_states_clear_node() node_states_clear has the following if statements: if ((N_MEMORY != N_NORMAL_MEMORY) && (arg->status_change_nid_high >= 0)) ... if ((N_MEMORY != N_HIGH_MEMORY) && (arg->status_change_nid >= 0)) ... N_MEMORY can never be equal to neither N_NORMAL_MEMORY nor N_HIGH_MEMORY. Similar problem was found in [1]. Since this is wrong, let us get rid of it. 
[1] https://patchwork.kernel.org/patch/10579155/ Link: http://lkml.kernel.org/r/20180919100819.25518-4-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Cc: Dan Williams Cc: David Hildenbrand Cc: Jonathan Cameron Cc: Mathieu Malaterre Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 63facfc57224..561c44761f95 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1582,12 +1582,10 @@ static void node_states_clear_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); - if ((N_MEMORY != N_NORMAL_MEMORY) && - (arg->status_change_nid_high >= 0)) + if (arg->status_change_nid_high >= 0) node_clear_state(node, N_HIGH_MEMORY); - if ((N_MEMORY != N_HIGH_MEMORY) && - (arg->status_change_nid >= 0)) + if (arg->status_change_nid >= 0) node_clear_state(node, N_MEMORY); } From 8efe33f40f3e69ac6069ed46666d2d527ddc2c04 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 26 Oct 2018 15:07:34 -0700 Subject: [PATCH 080/132] mm/memory_hotplug.c: simplify node_states_check_changes_online While looking at node_states_check_changes_online, I stumbled upon some confusing things. Right after entering the function, we find this: if (N_MEMORY == N_NORMAL_MEMORY) zone_last = ZONE_MOVABLE; This is wrong. N_MEMORY cannot really be equal to N_NORMAL_MEMORY. My guess is that this wanted to be something like: if (N_NORMAL_MEMORY == N_HIGH_MEMORY) to check if we have CONFIG_HIGHMEM. Later on, in the CONFIG_HIGHMEM block, we have: if (N_MEMORY == N_HIGH_MEMORY) zone_last = ZONE_MOVABLE; Again, this is wrong, and will never be evaluated to true. Besides removing these wrong if statements, I simplified the function a bit. [osalvador@suse.de: address feedback from Pavel] Link: http://lkml.kernel.org/r/20180921132634.10103-4-osalvador@techadventures.net Link: http://lkml.kernel.org/r/20180919100819.25518-5-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Cc: Dan Williams Cc: David Hildenbrand Cc: Jonathan Cameron Cc: Mathieu Malaterre Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 61 +++++++-------------------------------------- 1 file changed, 9 insertions(+), 52 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 561c44761f95..eadd149eb7bc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -687,62 +687,19 @@ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) { int nid = zone_to_nid(zone); - enum zone_type zone_last = ZONE_NORMAL; - /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. - * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 
- */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; + arg->status_change_nid = -1; + arg->status_change_nid_normal = -1; + arg->status_change_nid_high = -1; - /* - * if the memory to be online is in a zone of 0...zone_last, and - * the zones of 0...zone_last don't have memory before online, we will - * need to set the node to node_states[N_NORMAL_MEMORY] after - * the memory is online. - */ - if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) - arg->status_change_nid_normal = nid; - else - arg->status_change_nid_normal = -1; - -#ifdef CONFIG_HIGHMEM - /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. - */ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) - arg->status_change_nid_high = nid; - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; -#endif - - /* - * if the node don't have memory befor online, we will need to - * set the node to node_states[N_MEMORY] after the memory - * is online. - */ if (!node_state(nid, N_MEMORY)) arg->status_change_nid = nid; - else - arg->status_change_nid = -1; + if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) + arg->status_change_nid_normal = nid; +#ifdef CONFIG_HIGHMEM + if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) + arg->status_change_nid_high = nid; +#endif } static void node_states_set_node(int node, struct memory_notify *arg) From 86b27beae59685a42f81bcda9d502b5aebddfab8 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 26 Oct 2018 15:07:38 -0700 Subject: [PATCH 081/132] mm/memory_hotplug.c: clean up node_states_check_changes_offline() This patch, as the previous one, gets rid of the wrong if statements. While at it, I realized that the comments are sometimes very confusing, to say the least, and wrong. For example: ___ zone_last = ZONE_MOVABLE; /* * check whether node_states[N_HIGH_MEMORY] will be changed * If we try to offline the last present @nr_pages from the node, * we can determind we will need to clear the node from * node_states[N_HIGH_MEMORY]. */ for (; zt <= zone_last; zt++) present_pages += pgdat->node_zones[zt].present_pages; if (nr_pages >= present_pages) arg->status_change_nid = zone_to_nid(zone); else arg->status_change_nid = -1; ___ In case the node gets empry, it must be removed from N_MEMORY. We already check N_HIGH_MEMORY a bit above within the CONFIG_HIGHMEM ifdef code. Not to say that status_change_nid is for N_MEMORY, and not for N_HIGH_MEMORY. So I re-wrote some of the comments to what I think is better. 
[osalvador@suse.de: address feedback from Pavel] Link: http://lkml.kernel.org/r/20180921132634.10103-5-osalvador@techadventures.net Link: http://lkml.kernel.org/r/20180919100819.25518-6-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Cc: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: Jonathan Cameron Cc: Cc: Mathieu Malaterre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 82 +++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 52 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index eadd149eb7bc..7e6509a53d79 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1463,75 +1463,53 @@ static void node_states_check_changes_offline(unsigned long nr_pages, { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long present_pages = 0; - enum zone_type zt, zone_last = ZONE_NORMAL; + enum zone_type zt; + + arg->status_change_nid = -1; + arg->status_change_nid_normal = -1; + arg->status_change_nid_high = -1; /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. - * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + * Check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is within the range + * [0..ZONE_NORMAL], and it is the last present memory there, + * the zones in that range will become empty after the offlining, + * thus we can determine that we need to clear the node from + * node_states[N_NORMAL_MEMORY]. */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; - - /* - * check whether node_states[N_NORMAL_MEMORY] will be changed. - * If the memory to be offline is in a zone of 0...zone_last, - * and it is the last present memory, 0...zone_last will - * become empty after offline , thus we can determind we will - * need to clear the node from node_states[N_NORMAL_MEMORY]. - */ - for (zt = 0; zt <= zone_last; zt++) + for (zt = 0; zt <= ZONE_NORMAL; zt++) present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) arg->status_change_nid_normal = zone_to_nid(zone); - else - arg->status_change_nid_normal = -1; #ifdef CONFIG_HIGHMEM /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. + * node_states[N_HIGH_MEMORY] contains nodes which + * have normal memory or high memory. + * Here we add the present_pages belonging to ZONE_HIGHMEM. + * If the zone is within the range of [0..ZONE_HIGHMEM), and + * we determine that the zones in that range become empty, + * we need to clear the node for N_HIGH_MEMORY. 
*/ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages; + if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages) arg->status_change_nid_high = zone_to_nid(zone); - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; #endif /* - * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE + * We have accounted the pages from [0..ZONE_NORMAL), and + * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM + * as well. + * Here we count the possible pages from ZONE_MOVABLE. + * If after having accounted all the pages, we see that the nr_pages + * to be offlined is over or equal to the accounted pages, + * we know that the node will become empty, and so, we can clear + * it for N_MEMORY as well. */ - zone_last = ZONE_MOVABLE; + present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; - /* - * check whether node_states[N_HIGH_MEMORY] will be changed - * If we try to offline the last present @nr_pages from the node, - * we can determind we will need to clear the node from - * node_states[N_HIGH_MEMORY]. - */ - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; if (nr_pages >= present_pages) arg->status_change_nid = zone_to_nid(zone); - else - arg->status_change_nid = -1; } static void node_states_clear_node(int node, struct memory_notify *arg) From 85cfb245060e45640fa3447f8b0bad5e8bd3bdaf Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 26 Oct 2018 15:07:41 -0700 Subject: [PATCH 082/132] memcg: remove memcg_kmem_skip_account The flag memcg_kmem_skip_account was added during the era of opt-out kmem accounting. There is no need for such flag in the opt-in world as there aren't any __GFP_ACCOUNT allocations within memcg_create_cache_enqueue(). Link: http://lkml.kernel.org/r/20180919004501.178023-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 --- mm/memcontrol.c | 24 +----------------------- 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b8fcc6b3080c..8f8a5418b627 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -724,9 +724,6 @@ struct task_struct { #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; -#ifdef CONFIG_MEMCG_KMEM - unsigned memcg_kmem_skip_account:1; -#endif #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e9ede617b89..645ede7ad1b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2460,7 +2460,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. 
*/ -static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, struct kmem_cache *cachep) { struct memcg_kmem_cache_create_work *cw; @@ -2478,25 +2478,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, queue_work(memcg_kmem_cache_wq, &cw->work); } -static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, - struct kmem_cache *cachep) -{ - /* - * We need to stop accounting when we kmalloc, because if the - * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_kmem_cache_create will recurse. - * - * However, it is better to enclose the whole function. Depending on - * the debugging options enabled, INIT_WORK(), for instance, can - * trigger an allocation. This too, will make us recurse. Because at - * this point we can't allow ourselves back into memcg_kmem_get_cache, - * the safest choice is to do it like this, wrapping the whole function. - */ - current->memcg_kmem_skip_account = 1; - __memcg_schedule_kmem_cache_create(memcg, cachep); - current->memcg_kmem_skip_account = 0; -} - static inline bool memcg_kmem_bypass(void) { if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) @@ -2531,9 +2512,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (memcg_kmem_bypass()) return cachep; - if (current->memcg_kmem_skip_account) - return cachep; - memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) From f682a97a00591def7cefbb5003dc04045028e405 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 26 Oct 2018 15:07:45 -0700 Subject: [PATCH 083/132] mm: provide kernel parameter to allow disabling page init poisoning Patch series "Address issues slowing persistent memory initialization", v5. The main thing this patch set achieves is that it allows us to initialize each node worth of persistent memory independently. As a result we reduce page init time by about 2 minutes because instead of taking 30 to 40 seconds per node and going through each node one at a time, we process all 4 nodes in parallel in the case of a 12TB persistent memory setup spread evenly over 4 nodes. This patch (of 3): On systems with a large amount of memory it can take a significant amount of time to initialize all of the page structs with the PAGE_POISON_PATTERN value. I have seen it take over 2 minutes to initialize a system with over 12TB of RAM. In order to work around the issue I had to disable CONFIG_DEBUG_VM and then the boot time returned to something much more reasonable as the arch_add_memory call completed in milliseconds versus seconds. However in doing that I had to disable all of the other VM debugging on the system. In order to work around a kernel that might have CONFIG_DEBUG_VM enabled on a system that has a large amount of memory I have added a new kernel parameter named "vm_debug" that can be set to "-" in order to disable it. 
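Usage then comes down to the boot command line (mirroring the kernel-parameters.txt hunk below):

	vm_debug	all debugging options enabled (the default)
	vm_debug=P	enable page structure init time poisoning
	vm_debug=-	disable all of the above options
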
Link: http://lkml.kernel.org/r/20180925201921.3576.84239.stgit@localhost.localdomain Reviewed-by: Pavel Tatashin Signed-off-by: Alexander Duyck Cc: Dave Hansen Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 12 +++++ include/linux/page-flags.h | 8 ++++ mm/debug.c | 46 +++++++++++++++++++ mm/memblock.c | 5 +- mm/sparse.c | 4 +- 5 files changed, 69 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8022d902e770..dcd082576e79 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4839,6 +4839,18 @@ This is actually a boot loader parameter; the value is passed to the kernel using a special protocol. + vm_debug[=options] [KNL] Available with CONFIG_DEBUG_VM=y. + May slow down system boot speed, especially when + enabled on systems with a large amount of memory. + All options are enabled by default, and this + interface is meant to allow for selectively + enabling or disabling specific virtual memory + debugging features. + + Available options are: + P Enable page structure init time poisoning + - Disable all of the above options + vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact size of . This can be used to increase the minimum size (128MB on x86). It can also be used to diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4d99504f6496..934f91ef3f54 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -163,6 +163,14 @@ static inline int PagePoisoned(const struct page *page) return page->flags == PAGE_POISON_PATTERN; } +#ifdef CONFIG_DEBUG_VM +void page_init_poison(struct page *page, size_t size); +#else +static inline void page_init_poison(struct page *page, size_t size) +{ +} +#endif + /* * Page flags policies wrt compound pages * diff --git a/mm/debug.c b/mm/debug.c index bd10aad8539a..cdacba12e09a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" @@ -175,4 +176,49 @@ void dump_mm(const struct mm_struct *mm) ); } +static bool page_init_poisoning __read_mostly = true; + +static int __init setup_vm_debug(char *str) +{ + bool __page_init_poisoning = true; + + /* + * Calling vm_debug with no arguments is equivalent to requesting + * to enable all debugging options we can control. + */ + if (*str++ != '=' || !*str) + goto out; + + __page_init_poisoning = false; + if (*str == '-') + goto out; + + while (*str) { + switch (tolower(*str)) { + case'p': + __page_init_poisoning = true; + break; + default: + pr_err("vm_debug option '%c' unknown. 
skipped\n", + *str); + } + + str++; + } +out: + if (page_init_poisoning && !__page_init_poisoning) + pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n"); + + page_init_poisoning = __page_init_poisoning; + + return 1; +} +__setup("vm_debug", setup_vm_debug); + +void page_init_poison(struct page *page, size_t size) +{ + if (page_init_poisoning) + memset(page, PAGE_POISON_PATTERN, size); +} +EXPORT_SYMBOL_GPL(page_init_poison); #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/memblock.c b/mm/memblock.c index 237944479d25..a85315083b5a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1444,10 +1444,9 @@ void * __init memblock_virt_alloc_try_nid_raw( ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); -#ifdef CONFIG_DEBUG_VM if (ptr && size > 0) - memset(ptr, PAGE_POISON_PATTERN, size); -#endif + page_init_poison(ptr, size); + return ptr; } diff --git a/mm/sparse.c b/mm/sparse.c index 10b07eea9a6e..67ad061f7fb8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -696,13 +696,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, goto out; } -#ifdef CONFIG_DEBUG_VM /* * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); -#endif + page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); section_mark_present(ms); sparse_init_one_section(ms, section_nr, memmap, usemap); From d483da5bc78b86fe4200d2947f193a745f711713 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 26 Oct 2018 15:07:48 -0700 Subject: [PATCH 084/132] mm: create non-atomic version of SetPageReserved for init use It doesn't make much sense to use the atomic SetPageReserved at init time when we are using memset to clear the memory and manipulating the page flags via simple "&=" and "|=" operations in __init_single_page. This patch adds a non-atomic version __SetPageReserved that can be used during page init and shows about a 10% improvement in initialization times on the systems I have available for testing. On those systems I saw initialization times drop from around 35 seconds to around 32 seconds to initialize a 3TB block of persistent memory. I believe the main advantage of this is that it allows for more compiler optimization as the __set_bit operation can be reordered whereas the atomic version cannot. I tried adding a bit of documentation based on f1dd2cd13c4 ("mm, memory_hotplug: do not associate hotadded memory to zones until online"). Ideally the reserved flag should be set earlier since there is a brief window where the page is initialization via __init_single_page and we have not set the PG_Reserved flag. I'm leaving that for a future patch set as that will require a more significant refactor. 
Link: http://lkml.kernel.org/r/20180925202018.3576.11607.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 1 + mm/page_alloc.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 934f91ef3f54..50ce1bddaf56 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -303,6 +303,7 @@ PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) + __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eb6c50cc8880..cee1abf85d72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1232,7 +1232,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) /* Avoid false-positive PageTail() */ INIT_LIST_HEAD(&page->lru); - SetPageReserved(page); + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. + */ + __SetPageReserved(page); } } } @@ -5508,7 +5513,7 @@ not_early: page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); if (context == MEMMAP_HOTPLUG) - SetPageReserved(page); + __SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for From 966cf44f637e6aeea7e3d01ba004bf8b5beac78f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 26 Oct 2018 15:07:52 -0700 Subject: [PATCH 085/132] mm: defer ZONE_DEVICE page initialization to the point where we init pgmap The ZONE_DEVICE pages were being initialized in two locations. One was with the memory_hotplug lock held and another was outside of that lock. The problem with this is that it was nearly doubling the memory initialization time. Instead of doing this twice, once while holding a global lock and once without, I am opting to defer the initialization to the one outside of the lock. This allows us to avoid serializing the overhead for memory init and we can instead focus on per-node init times. One issue I encountered is that devm_memremap_pages and hmm_devmmem_pages_create were initializing only the pgmap field the same way. One wasn't initializing hmm_data, and the other was initializing it to a poison value. Since this is something that is exposed to the driver in the case of hmm I am opting for a third option and just initializing hmm_data to 0 since this is going to be exposed to unknown third party drivers. 
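The resulting flow, sketched for illustration (the real changes are in the devm_memremap_pages() and hmm_devmem_pages_create() hunks below; nid, align_start, align_size and pgmap are the values those functions already compute):

	mem_hotplug_begin();
	/* arch_add_memory()/add_pages() run with the hotplug lock held */
	mem_hotplug_done();

	/* struct page init for the ZONE_DEVICE range now happens once, unlocked */
	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
				align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, pgmap);
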
[alexander.h.duyck@linux.intel.com: fix reference count for pgmap in devm_memremap_pages] Link: http://lkml.kernel.org/r/20181008233404.1909.37302.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/20180925202053.3576.66039.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Tested-by: Dan Williams Cc: Dave Hansen Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 + kernel/memremap.c | 27 ++++++-------- mm/hmm.c | 12 +++--- mm/page_alloc.c | 92 ++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 109 insertions(+), 24 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 737279bb479c..33228a49d7d2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } +extern void memmap_init_zone_device(struct zone *, unsigned long, + unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { diff --git a/kernel/memremap.c b/kernel/memremap.c index 5b8600d39931..620fc4d2559a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; struct resource *res = &pgmap->res; - unsigned long pfn, pgoff, order; - pgprot_t pgprot = PAGE_KERNEL; - int error, nid, is_ram; struct dev_pagemap *conflict_pgmap; + pgprot_t pgprot = PAGE_KERNEL; + unsigned long pgoff, order; + int error, nid, is_ram; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -256,19 +256,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (error) goto err_add_memory; - for_each_device_pfn(pfn, pgmap) { - struct page *page = pfn_to_page(pfn); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer. It is a bug if a ZONE_DEVICE page is ever - * freed or placed on a driver-private list. Seed the - * storage with LIST_POISON* values. - */ - list_del(&page->lru); - page->pgmap = pgmap; - percpu_ref_get(pgmap->ref); - } + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, pgmap); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); devm_add_action(dev, devm_memremap_pages_release, pgmap); diff --git a/mm/hmm.c b/mm/hmm.c index c968e49f7a0c..774d684fa2b4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) resource_size_t key, align_start, align_size, align_end; struct device *device = devmem->device; int ret, nid, is_ram; - unsigned long pfn; align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); align_size = ALIGN(devmem->resource->start + @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) align_size >> PAGE_SHIFT, NULL); mem_hotplug_done(); - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { - struct page *page = pfn_to_page(pfn); + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. 
+ */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, &devmem->pagemap); - page->pgmap = &devmem->pagemap; - } return 0; error_add_memory: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cee1abf85d72..d73ff2188d72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5465,12 +5465,23 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; +#ifdef CONFIG_ZONE_DEVICE /* * Honor reservation requested by the driver for this ZONE_DEVICE - * memory + * memory. We limit the total number of pages to initialize to just + * those that might contain the memory mapping. We will defer the + * ZONE_DEVICE page initialization until after we have released + * the hotplug lock. */ - if (altmap && start_pfn == altmap->base_pfn) - start_pfn += altmap->reserve; + if (zone == ZONE_DEVICE) { + if (!altmap) + return; + + if (start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + } +#endif for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* @@ -5537,6 +5548,81 @@ not_early: } } +#ifdef CONFIG_ZONE_DEVICE +void __ref memmap_init_zone_device(struct zone *zone, + unsigned long start_pfn, + unsigned long size, + struct dev_pagemap *pgmap) +{ + unsigned long pfn, end_pfn = start_pfn + size; + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long zone_idx = zone_idx(zone); + unsigned long start = jiffies; + int nid = pgdat->node_id; + + if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone))) + return; + + /* + * The call to memmap_init_zone should have already taken care + * of the pages reserved for the memmap, so we can just jump to + * the end of that region and start processing the device pages. + */ + if (pgmap->altmap_valid) { + struct vmem_altmap *altmap = &pgmap->altmap; + + start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + size = end_pfn - start_pfn; + } + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer and hmm_data. It is a bug if a ZONE_DEVICE + * page is ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->hmm_data = 0; + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. 
+ * + * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * because this is done early in sparse_add_one_section + */ + if (!(pfn & (pageblock_nr_pages - 1))) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } + } + + pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), + size, jiffies_to_msecs(jiffies - start)); +} + +#endif static void __meminit zone_init_free_lists(struct zone *zone) { unsigned int order, t; From d4faa40259b8524a09c1b0fcd38a446db0c0b770 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 26 Oct 2018 15:07:55 -0700 Subject: [PATCH 086/132] mm: remove unnecessary local variable addr in __get_user_pages_fast() The local variable `addr' in __get_user_pages_fast() is just a shadow of `start'. Since `start' never changes after being assigned to `addr', it is fine to drop `addr' and use `start' directly. Also the meaning of [start, end] is more obvious than [addr, end] when passed to gup_pgd_range(). Link: http://lkml.kernel.org/r/20180925021448.20265-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 1abc8b4afff6..08eb350e0f35 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1780,12 +1780,11 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write) int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - unsigned long addr, len, end; + unsigned long len, end; unsigned long flags; int nr = 0; start &= PAGE_MASK; - addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; @@ -1807,7 +1806,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages, write)) { local_irq_save(flags); - gup_pgd_range(addr, end, write, pages, &nr); + gup_pgd_range(start, end, write, pages, &nr); local_irq_restore(flags); } From d018498ccc92775dd593432ba6f3ef92d4d2a782 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:07:59 -0700 Subject: [PATCH 087/132] hugetlb: harmonize hugetlb.h arch specific defines with pgtable.h In order to reduce copy/paste of functions across architectures and then make the riscv hugetlb port (and future ports) simpler and smaller, this patchset intends to factorize the numerous hugetlb primitives that are defined across all the architectures. Except for prepare_hugepage_range, this patchset moves the versions that are just pass-through to standard pte primitives into asm-generic/hugetlb.h by using the same #ifdef semantics that can be found in asm-generic/pgtable.h, i.e. __HAVE_ARCH_***. The s390 architecture has not been tackled in this series since it does not use asm-generic/hugetlb.h at all. This patchset has been compiled on all addressed architectures with success (except for parisc, but the problem does not come from this series). This patch (of 11): asm-generic/hugetlb.h proposes generic implementations of hugetlb related functions: use __HAVE_ARCH_HUGE* defines in order to make arch specific implementations of hugetlb functions consistent with the pgtable.h scheme. Link: http://lkml.kernel.org/r/20180920060358.16606-2-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Acked-by: Catalin Marinas [arm64] Cc: Russell King Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: James E.J.
Bottomley Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Yoshinori Sato Cc: Rich Felker Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Arnd Bergmann Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ingo Molnar [x86] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/hugetlb.h | 2 +- include/asm-generic/hugetlb.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index e73f68569624..3fcf14663dfa 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -81,9 +81,9 @@ extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTE_CLEAR extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz); -#define huge_pte_clear huge_pte_clear extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); #define set_huge_swap_pte_at set_huge_swap_pte_at diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 9d0cde8ab716..3da7cff52360 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -32,7 +32,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } -#ifndef huge_pte_clear +#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { From 1e5f50fc9d0a653c910df2291f245a8fa7beed11 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:03 -0700 Subject: [PATCH 088/132] hugetlb: introduce generic version of hugetlb_free_pgd_range arm, arm64, mips, parisc, sh, x86 architectures use the same version of hugetlb_free_pgd_range, so move this generic implementation into asm-generic/hugetlb.h. Link: http://lkml.kernel.org/r/20180920060358.16606-3-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. 
Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb.h | 9 --------- arch/arm64/include/asm/hugetlb.h | 10 ---------- arch/ia64/include/asm/hugetlb.h | 5 +++-- arch/mips/include/asm/hugetlb.h | 13 ++----------- arch/parisc/include/asm/hugetlb.h | 12 ++---------- arch/powerpc/include/asm/hugetlb.h | 4 +++- arch/sh/include/asm/hugetlb.h | 12 ++---------- arch/sparc/include/asm/hugetlb.h | 4 +++- arch/x86/include/asm/hugetlb.h | 8 -------- include/asm-generic/hugetlb.h | 11 +++++++++++ 10 files changed, 26 insertions(+), 62 deletions(-) diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 7d26f6c4f0f5..537660891f9f 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -27,15 +27,6 @@ #include -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - - static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 3fcf14663dfa..4af1a800a900 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -25,16 +25,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return READ_ONCE(*ptep); } - - -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 74d2a5540aaf..afe9fa4d969b 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -3,9 +3,8 @@ #define _ASM_IA64_HUGETLB_H #include -#include - +#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); @@ -70,4 +69,6 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#include + #endif /* _ASM_IA64_HUGETLB_H */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 982bc0685330..53764050243e 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -10,8 +10,6 @@ #define __ASM_HUGETLB_H #include -#include - static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, @@ -38,15 +36,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, - unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -114,4 +103,6 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#include + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 58e0f4620426..28c23b68d38d 100644 --- a/arch/parisc/include/asm/hugetlb.h 
+++ b/arch/parisc/include/asm/hugetlb.h @@ -3,8 +3,6 @@ #define _ASM_PARISC64_HUGETLB_H #include -#include - void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); @@ -32,14 +30,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -71,4 +61,6 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#include + #endif /* _ASM_PARISC64_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 2d00cc530083..2ab028b73a43 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -4,7 +4,6 @@ #ifdef CONFIG_HUGETLB_PAGE #include -#include extern struct kmem_cache *hugepte_cache; @@ -110,6 +109,7 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); #endif +#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); @@ -176,6 +176,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#include + #else /* ! CONFIG_HUGETLB_PAGE */ static inline void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 735939c0f513..f6a51b609409 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -4,8 +4,6 @@ #include #include -#include - static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, @@ -27,14 +25,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -85,4 +75,6 @@ static inline void arch_clear_hugepage_flags(struct page *page) clear_bit(PG_dcache_clean, &page->flags); } +#include + #endif /* _ASM_SH_HUGETLB_H */ diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 300557c66698..59d89b52ccb7 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -3,7 +3,6 @@ #define _ASM_SPARC64_HUGETLB_H #include -#include #ifdef CONFIG_HUGETLB_PAGE struct pud_huge_patch_entry { @@ -84,8 +83,11 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); +#include + #endif /* _ASM_SPARC64_HUGETLB_H */ diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 5ed826da5e07..398da3b3414c 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -28,14 +28,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned 
long floor, - unsigned long ceiling) -{ - free_pgd_range(tlb, addr, end, floor, ceiling); -} - static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 3da7cff52360..c697ca9dda18 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -40,4 +40,15 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, } #endif +#ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + free_pgd_range(tlb, addr, end, floor, ceiling); +} + + +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ From cea685d556330b0265ca62c0d820f687cc9a38d6 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:07 -0700 Subject: [PATCH 089/132] hugetlb: introduce generic version of set_huge_pte_at() arm, ia64, mips, powerpc, sh, x86 architectures use the same version of set_huge_pte_at, so move this generic implementation into asm-generic/hugetlb.h. Link: http://lkml.kernel.org/r/20180920060358.16606-4-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb-3level.h | 6 ------ arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 6 ------ arch/mips/include/asm/hugetlb.h | 6 ------ arch/parisc/include/asm/hugetlb.h | 1 + arch/powerpc/include/asm/hugetlb.h | 6 ------ arch/sh/include/asm/hugetlb.h | 6 ------ arch/sparc/include/asm/hugetlb.h | 1 + arch/x86/include/asm/hugetlb.h | 6 ------ include/asm-generic/hugetlb.h | 8 +++++++- 10 files changed, 10 insertions(+), 37 deletions(-) diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index d4014fbe5ea3..398fb06e8207 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -37,12 +37,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return retval; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 4af1a800a900..874661a1dff1 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -60,6 +60,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable); #define arch_make_huge_pte arch_make_huge_pte +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, diff --git 
a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index afe9fa4d969b..a235d6f60fb3 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -20,12 +20,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 53764050243e..8ea439041d5d 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -36,12 +36,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 28c23b68d38d..77c8adbac7c3 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -4,6 +4,7 @@ #include +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 2ab028b73a43..33b899624922 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -129,12 +129,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index f6a51b609409..bc552e37c1c9 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -25,12 +25,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 59d89b52ccb7..16b0c53ea6c9 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -12,6 +12,7 @@ struct pud_huge_patch_entry { extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end; #endif +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 398da3b3414c..8db9a761964d 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -28,12 +28,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git 
a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index c697ca9dda18..ee010b756246 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -47,8 +47,14 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, { free_pgd_range(tlb, addr, end, floor, ceiling); } +#endif - +#ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT +static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + set_pte_at(mm, addr, ptep, pte); +} #endif #endif /* _ASM_GENERIC_HUGETLB_H */ From a4d838536c6e5c1807a863c860d9e767d55ba681 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:12 -0700 Subject: [PATCH 090/132] hugetlb: introduce generic version of huge_ptep_get_and_clear() arm, ia64, sh, x86 architectures use the same version of huge_ptep_get_and_clear, so move this generic implementation into asm-generic/hugetlb.h. Link: http://lkml.kernel.org/r/20180920060358.16606-5-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb-3level.h | 6 ------ arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 6 ------ arch/mips/include/asm/hugetlb.h | 1 + arch/parisc/include/asm/hugetlb.h | 1 + arch/powerpc/include/asm/hugetlb.h | 1 + arch/sh/include/asm/hugetlb.h | 6 ------ arch/sparc/include/asm/hugetlb.h | 1 + arch/x86/include/asm/hugetlb.h | 6 ------ include/asm-generic/hugetlb.h | 8 ++++++++ 10 files changed, 13 insertions(+), 24 deletions(-) diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index 398fb06e8207..ad36e84b819a 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -49,12 +49,6 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, ptep_set_wrprotect(mm, addr, ptep); } -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 874661a1dff1..6ae0bcafe162 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -66,6 +66,7 @@ extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); extern void huge_ptep_set_wrprotect(struct mm_struct *mm, diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index a235d6f60fb3..6719c74da0de 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -20,12 +20,6 @@ static inline int 
is_hugepage_only_range(struct mm_struct *mm, REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 8ea439041d5d..0959cc5a41fa 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -36,6 +36,7 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 77c8adbac7c3..6e281e1bb336 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -8,6 +8,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 33b899624922..91bdc84b76ce 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -129,6 +129,7 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index bc552e37c1c9..08ee6c00b5e9 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -25,12 +25,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 16b0c53ea6c9..944e3a4bfaff 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -16,6 +16,7 @@ extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end; void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 8db9a761964d..e9e7fef867ad 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -28,12 +28,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index ee010b756246..0f6f151780dd 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -57,4 +57,12 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } #endif +#ifndef 
__HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + return ptep_get_and_clear(mm, addr, ptep); +} +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ From fe632225bdbd49f5620e7ccfa03268fec9a3e370 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:17 -0700 Subject: [PATCH 091/132] hugetlb: introduce generic version of huge_ptep_clear_flush arm, x86 architectures use the same version of huge_ptep_clear_flush, so move this generic implementation into asm-generic/hugetlb.h. Link: http://lkml.kernel.org/r/20180920060358.16606-6-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb-3level.h | 6 ------ arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 1 + arch/mips/include/asm/hugetlb.h | 1 + arch/parisc/include/asm/hugetlb.h | 1 + arch/powerpc/include/asm/hugetlb.h | 1 + arch/sh/include/asm/hugetlb.h | 1 + arch/sparc/include/asm/hugetlb.h | 1 + arch/x86/include/asm/hugetlb.h | 6 ------ include/asm-generic/hugetlb.h | 8 ++++++++ 10 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index ad36e84b819a..b897541520ef 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -37,12 +37,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return retval; } -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 6ae0bcafe162..4c8dd488554d 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -71,6 +71,7 @@ extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTE_CLEAR diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 6719c74da0de..41b5f6adeee4 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -20,6 +20,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 0959cc5a41fa..7df1f116a3cc 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -48,6 +48,7 @@ 
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return pte; } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 6e281e1bb336..9afff26747a1 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -32,6 +32,7 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 91bdc84b76ce..1eb3e131cab4 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -140,6 +140,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, #endif } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 08ee6c00b5e9..9abf9c86b769 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -25,6 +25,7 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 944e3a4bfaff..651a9593fcee 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -42,6 +42,7 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index e9e7fef867ad..fd59673e7a0a 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -28,12 +28,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - static inline int huge_pte_none(pte_t pte) { return pte_none(pte); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 0f6f151780dd..ffa63fd8388d 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -65,4 +65,12 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, } #endif +#ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH +static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + ptep_clear_flush(vma, addr, ptep); +} +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ From cae72abc1af0c1ef4cb39e7187c5af44a7ce17a4 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:22 -0700 Subject: [PATCH 092/132] hugetlb: introduce generic version of huge_pte_none() arm, arm64, ia64, mips, parisc, powerpc, sh, sparc, x86 architectures use the same version of huge_pte_none, so move this generic implementation into asm-generic/hugetlb.h. 
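The #ifndef __HAVE_ARCH_* arrangement used throughout this series can be summarized with a small stand-alone sketch. The two "headers", the __HAVE_ARCH_DO_THING guard and do_thing() are made-up illustrations of the mechanism, not kernel code.

#include <stdio.h>

/*
 * "Arch" side: an architecture that needs its own behaviour defines the
 * guard macro and provides its override before the generic header is seen.
 */
#define __HAVE_ARCH_DO_THING
static inline int do_thing(int v)
{
	return v * 2;			/* arch-specific version */
}

/*
 * "asm-generic" side: the fallback is compiled only when no architecture
 * override was declared, mirroring huge_pte_none() in asm-generic/hugetlb.h.
 */
#ifndef __HAVE_ARCH_DO_THING
static inline int do_thing(int v)
{
	return v + 1;			/* generic pass-through version */
}
#endif

int main(void)
{
	printf("%d\n", do_thing(21));	/* prints 42: the override wins */
	return 0;
}

In this patch, mips keeps its own huge_pte_none() by defining __HAVE_ARCH_HUGE_PTE_NONE first, while every other architecture silently picks up the pte_none() pass-through from the generic header.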
Link: http://lkml.kernel.org/r/20180920060358.16606-7-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb.h | 5 ----- arch/arm64/include/asm/hugetlb.h | 5 ----- arch/ia64/include/asm/hugetlb.h | 5 ----- arch/mips/include/asm/hugetlb.h | 1 + arch/parisc/include/asm/hugetlb.h | 5 ----- arch/powerpc/include/asm/hugetlb.h | 5 ----- arch/sh/include/asm/hugetlb.h | 5 ----- arch/sparc/include/asm/hugetlb.h | 5 ----- arch/x86/include/asm/hugetlb.h | 5 ----- include/asm-generic/hugetlb.h | 7 +++++++ 10 files changed, 8 insertions(+), 40 deletions(-) diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 537660891f9f..c821b550d6a4 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -44,11 +44,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 4c8dd488554d..49247c6f94db 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -42,11 +42,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 41b5f6adeee4..bf573500b3c4 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -26,11 +26,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 7df1f116a3cc..1c9c4531376c 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -55,6 +55,7 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, flush_tlb_page(vma, addr & huge_page_mask(hstate_vma(vma))); } +#define __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { unsigned long val = pte_val(pte) & ~_PAGE_GLOBAL; diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 9afff26747a1..c09d8c74553c 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -38,11 +38,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 1eb3e131cab4..6a534353c8eb 100644 --- 
a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -149,11 +149,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, flush_hugetlb_page(vma, addr); } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 9abf9c86b769..a9f8266f33cf 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -31,11 +31,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 651a9593fcee..11115bbd712e 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -48,11 +48,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index fd59673e7a0a..42d872054791 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -28,11 +28,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index ffa63fd8388d..2fc3d68424e9 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -73,4 +73,11 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, } #endif +#ifndef __HAVE_ARCH_HUGE_PTE_NONE +static inline int huge_pte_none(pte_t pte) +{ + return pte_none(pte); +} +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ From c4916a008665a6650834a736f80559a7f90e5857 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:26 -0700 Subject: [PATCH 093/132] hugetlb: introduce generic version of huge_pte_wrprotect arm, arm64, ia64, mips, parisc, powerpc, sh, sparc, x86 architectures use the same version of huge_pte_wrprotect, so move this generic implementation into asm-generic/hugetlb.h. Link: http://lkml.kernel.org/r/20180920060358.16606-8-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. 
Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb.h | 5 ----- arch/arm64/include/asm/hugetlb.h | 5 ----- arch/ia64/include/asm/hugetlb.h | 5 ----- arch/mips/include/asm/hugetlb.h | 5 ----- arch/parisc/include/asm/hugetlb.h | 5 ----- arch/powerpc/include/asm/hugetlb.h | 5 ----- arch/sh/include/asm/hugetlb.h | 5 ----- arch/sparc/include/asm/hugetlb.h | 5 ----- arch/x86/include/asm/hugetlb.h | 5 ----- include/asm-generic/hugetlb.h | 7 +++++++ 10 files changed, 7 insertions(+), 45 deletions(-) diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index c821b550d6a4..9ca14227eeb7 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -44,11 +44,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 49247c6f94db..1fd64ebf0cd7 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -42,11 +42,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index bf573500b3c4..82fe3d7a38d9 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -26,11 +26,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 1c9c4531376c..b3d6bb53ee6e 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -62,11 +62,6 @@ static inline int huge_pte_none(pte_t pte) return !val || (val == (unsigned long)invalid_pte_table); } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index c09d8c74553c..5a102d7251e4 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -38,11 +38,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 6a534353c8eb..b5b57b309564 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -149,11 +149,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, flush_hugetlb_page(vma, addr); } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return 
pte_wrprotect(pte); -} - extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index a9f8266f33cf..54f65094efe6 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -31,11 +31,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 11115bbd712e..f661362376e0 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -48,11 +48,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 42d872054791..3cd3a2c9840e 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -28,11 +28,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 2fc3d68424e9..cd9697672b79 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -80,4 +80,11 @@ static inline int huge_pte_none(pte_t pte) } #endif +#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT +static inline pte_t huge_pte_wrprotect(pte_t pte) +{ + return pte_wrprotect(pte); +} +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ From 78d6e4e8ea8700fb3973661eb3774f632bab5842 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:31 -0700 Subject: [PATCH 094/132] hugetlb: introduce generic version of prepare_hugepage_range arm, arm64, powerpc, sparc, x86 architectures use the same version of prepare_hugepage_range, so move this generic implementation into asm-generic/hugetlb.h. Link: http://lkml.kernel.org/r/20180920060358.16606-9-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. 
Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb.h | 11 ----------- arch/arm64/include/asm/hugetlb.h | 11 ----------- arch/ia64/include/asm/hugetlb.h | 1 + arch/mips/include/asm/hugetlb.h | 1 + arch/parisc/include/asm/hugetlb.h | 1 + arch/powerpc/include/asm/hugetlb.h | 15 --------------- arch/sh/include/asm/hugetlb.h | 1 + arch/sparc/include/asm/hugetlb.h | 16 ---------------- arch/x86/include/asm/hugetlb.h | 15 --------------- include/asm-generic/hugetlb.h | 15 +++++++++++++++ 10 files changed, 19 insertions(+), 68 deletions(-) diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 9ca14227eeb7..3fcef21ff2c2 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -33,17 +33,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 1fd64ebf0cd7..3e7f6e69b28d 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -31,17 +31,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 82fe3d7a38d9..cbe296271030 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -9,6 +9,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len); diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index b3d6bb53ee6e..6ff2531cfb1d 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -18,6 +18,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 5a102d7251e4..fb7e0fd858a3 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -22,6 +22,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
*/ +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index b5b57b309564..2a90f387880a 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -114,21 +114,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 54f65094efe6..f1bbd255ee43 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -15,6 +15,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index f661362376e0..2101ea217f33 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -26,22 +26,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 3cd3a2c9840e..59c056adb3c9 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -13,21 +13,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. 
- */ -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index cd9697672b79..6c0c8b0c71e0 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -87,4 +87,19 @@ static inline pte_t huge_pte_wrprotect(pte_t pte) } #endif +#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) +{ + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (addr & ~huge_page_mask(h)) + return -EINVAL; + + return 0; +} +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ From 8e581d433bf796738181e865878a130b4e98f1b9 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:35 -0700 Subject: [PATCH 095/132] hugetlb: introduce generic version of huge_ptep_set_wrprotect() arm, ia64, mips, powerpc, sh, x86 architectures use the same version of huge_ptep_set_wrprotect, so move this generic implementation into asm-generic/hugetlb.h. Link: http://lkml.kernel.org/r/20180920060358.16606-10-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. 
Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb-3level.h | 6 ------ arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 6 ------ arch/mips/include/asm/hugetlb.h | 6 ------ arch/parisc/include/asm/hugetlb.h | 1 + arch/powerpc/include/asm/book3s/32/pgtable.h | 6 ------ arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/nohash/32/pgtable.h | 6 ------ arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/sh/include/asm/hugetlb.h | 6 ------ arch/sparc/include/asm/hugetlb.h | 1 + arch/x86/include/asm/hugetlb.h | 6 ------ include/asm-generic/hugetlb.h | 8 ++++++++ 13 files changed, 13 insertions(+), 42 deletions(-) diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index b897541520ef..8247cd6a2ac6 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -37,12 +37,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return retval; } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 3e7f6e69b28d..f4f69ae5466e 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -48,6 +48,7 @@ extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index cbe296271030..49d1f7949f3a 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -27,12 +27,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 6ff2531cfb1d..3dcf5debf8c4 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -63,12 +63,6 @@ static inline int huge_pte_none(pte_t pte) return !val || (val == (unsigned long)invalid_pte_table); } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index fb7e0fd858a3..9c3950ca2974 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -39,6 +39,7 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } +#define 
__HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 751cf931bb3f..796d026da37e 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -221,12 +221,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, { pte_update(ptep, (_PAGE_RW | _PAGE_HWWRITE), _PAGE_RO); } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 2a2486526d1f..fc69eda5d588 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -461,6 +461,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0); } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index a507a65b0866..6c82b9660c55 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -246,12 +246,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, { pte_update(ptep, (_PAGE_RW | _PAGE_HWWRITE), _PAGE_RO); } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 7cd6809f4d33..68283e632f04 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -239,6 +239,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index f1bbd255ee43..8df4004977b9 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -32,12 +32,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 2101ea217f33..c41754a113f3 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -32,6 +32,7 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/x86/include/asm/hugetlb.h 
b/arch/x86/include/asm/hugetlb.h index 59c056adb3c9..a3f781f7a264 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -13,12 +13,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 6c0c8b0c71e0..9b9039845278 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -102,4 +102,12 @@ static inline int prepare_hugepage_range(struct file *file, } #endif +#ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT +static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + ptep_set_wrprotect(mm, addr, ptep); +} +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ From facf6d5b8b6c4212519acfefbf3a0e0aeeb5b77e Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:39 -0700 Subject: [PATCH 096/132] hugetlb: introduce generic version of huge_ptep_set_access_flags() arm, ia64, sh, x86 architectures use the same version of huge_ptep_set_access_flags, so move this generic implementation into asm-generic/hugetlb.h. Link: http://lkml.kernel.org/r/20180920060358.16606-11-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. 
Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb-3level.h | 7 ------- arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 7 ------- arch/mips/include/asm/hugetlb.h | 1 + arch/parisc/include/asm/hugetlb.h | 1 + arch/powerpc/include/asm/hugetlb.h | 1 + arch/sh/include/asm/hugetlb.h | 7 ------- arch/sparc/include/asm/hugetlb.h | 1 + arch/x86/include/asm/hugetlb.h | 7 ------- include/asm-generic/hugetlb.h | 9 +++++++++ 10 files changed, 14 insertions(+), 28 deletions(-) diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index 8247cd6a2ac6..54e4b097b1f5 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -37,11 +37,4 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return retval; } -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} - #endif /* _ASM_ARM_HUGETLB_3LEVEL_H */ diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index f4f69ae5466e..80887abcef7f 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -42,6 +42,7 @@ extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 49d1f7949f3a..e9b42750fdf5 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -27,13 +27,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} - static inline pte_t huge_ptep_get(pte_t *ptep) { return *ptep; diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 3dcf5debf8c4..120adc3b2ffd 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -63,6 +63,7 @@ static inline int huge_pte_none(pte_t pte) return !val || (val == (unsigned long)invalid_pte_table); } +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 9c3950ca2974..165b4e5a6f32 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -43,6 +43,7 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 
2a90f387880a..d4d9cf6cb846 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -134,6 +134,7 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, flush_hugetlb_page(vma, addr); } +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 8df4004977b9..c87195ae0cfa 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -32,13 +32,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} - static inline pte_t huge_ptep_get(pte_t *ptep) { return *ptep; diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index c41754a113f3..028a1465fbe7 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -40,6 +40,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); } +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index a3f781f7a264..574d42eb081e 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -13,13 +13,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} - static inline pte_t huge_ptep_get(pte_t *ptep) { return *ptep; diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 9b9039845278..f3c99a03ee83 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -110,4 +110,13 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, } #endif +#ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS +static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); +} +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ From 544db7597ad0fd1ceebf656e2808a209d46ccd9e Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 26 Oct 2018 15:08:43 -0700 Subject: [PATCH 097/132] hugetlb: introduce generic version of huge_ptep_get ia64, mips, parisc, powerpc, sh, sparc, x86 architectures use the same version of huge_ptep_get, so move this generic implementation into asm-generic/hugetlb.h. [arnd@arndb.de: fix ARM 3level page tables] Link: http://lkml.kernel.org/r/20181005161722.904274-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20180920060358.16606-12-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Luiz Capitulino Reviewed-by: Mike Kravetz Tested-by: Helge Deller [parisc] Acked-by: Catalin Marinas [arm64] Acked-by: Paul Burton [MIPS] Acked-by: Ingo Molnar [x86] Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: H. Peter Anvin Cc: Ingo Molnar Cc: James E.J. 
Bottomley Cc: James Hogan Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb-3level.h | 1 + arch/arm/include/asm/hugetlb.h | 3 +-- arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 5 ----- arch/mips/include/asm/hugetlb.h | 5 ----- arch/parisc/include/asm/hugetlb.h | 5 ----- arch/powerpc/include/asm/hugetlb.h | 5 ----- arch/sh/include/asm/hugetlb.h | 5 ----- arch/sparc/include/asm/hugetlb.h | 5 ----- arch/x86/include/asm/hugetlb.h | 5 ----- include/asm-generic/hugetlb.h | 7 +++++++ 11 files changed, 10 insertions(+), 37 deletions(-) diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index 54e4b097b1f5..0d9f3918fa7e 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -29,6 +29,7 @@ * ptes. * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes). */ +#define __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(pte_t *ptep) { pte_t retval = *ptep; diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 3fcef21ff2c2..b67256c22b08 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -23,9 +23,8 @@ #define _ASM_ARM_HUGETLB_H #include -#include - #include +#include static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 80887abcef7f..fb6609875455 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -20,6 +20,7 @@ #include +#define __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index e9b42750fdf5..36cc0396b214 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -27,11 +27,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 120adc3b2ffd..425bb6fc3bda 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -82,11 +82,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 165b4e5a6f32..7cb595dcb7d7 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -48,11 +48,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index d4d9cf6cb846..383da1ab9e23 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -139,11 +139,6 @@ extern int huge_ptep_set_access_flags(struct 
vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index c87195ae0cfa..6f025fe18146 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -32,11 +32,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 028a1465fbe7..3963f80d1cb3 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -53,11 +53,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 574d42eb081e..7469d321f072 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -13,11 +13,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index f3c99a03ee83..71d7b77eea50 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -119,4 +119,11 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, } #endif +#ifndef __HAVE_ARCH_HUGE_PTEP_GET +static inline pte_t huge_ptep_get(pte_t *ptep) +{ + return *ptep; +} +#endif + #endif /* _ASM_GENERIC_HUGETLB_H */ From 3c0513243a4a07ebad2d59f3d972bef483818ec6 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Fri, 26 Oct 2018 15:08:47 -0700 Subject: [PATCH 098/132] mm/filemap.c: use vmf_error() These codes can be replaced with new inline vmf_error(). Link: http://lkml.kernel.org/r/20180927171411.GA23331@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 0b8c6de6d995..3968da1f7f5a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2596,9 +2596,7 @@ no_cached_page: * system is low on memory, or a problem occurs while trying * to schedule I/O. */ - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; + return vmf_error(error); page_not_uptodate: /* From 85a06835f6f1ba79f0f00838ccd5ad840dd1eafb Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 26 Oct 2018 15:08:50 -0700 Subject: [PATCH 099/132] mm: mremap: downgrade mmap_sem to read when shrinking Other than munmap, mremap might be used to shrink memory mapping too. So, it may hold write mmap_sem for long time when shrinking large mapping, as what commit ("mm: mmap: zap pages with read mmap_sem in munmap") described. The mremap() will not manipulate vmas anymore after __do_munmap() call for the mapping shrink use case, so it is safe to downgrade to read mmap_sem. 
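As a rough sketch of the resulting locking flow, using the names from the diff below (heavily simplified, error handling trimmed):

	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;
	retval = __do_munmap(mm, addr + new_len, old_len - new_len,
			     &uf_unmap, true);
	if (retval == 1)	/* mmap_sem was downgraded; the page zap ran under the read lock */
		downgraded = true;
	...
	if (downgraded)
		up_read(&mm->mmap_sem);
	else
		up_write(&mm->mmap_sem);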
So, the same optimization, which downgrades mmap_sem to read for zapping pages, is also feasible and reasonable to this case. The period of holding exclusive mmap_sem for shrinking large mapping would be reduced significantly with this optimization. MREMAP_FIXED and MREMAP_MAYMOVE are more complicated to adopt this optimization since they need manipulate vmas after do_munmap(), downgrading mmap_sem may create race window. Simple mapping shrink is the low hanging fruit, and it may cover the most cases of unmap with munmap together. [akpm@linux-foundation.org: tweak comment] [yang.shi@linux.alibaba.com: fix unsigned compare against 0 issue] Link: http://lkml.kernel.org/r/1538687672-17795-2-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1538067582-60038-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Michal Hocko Cc: Matthew Wilcox Cc: Laurent Dufour Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/mmap.c | 4 ++-- mm/mremap.c | 20 ++++++++++++++++---- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 33228a49d7d2..a023c5ce71fa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2306,6 +2306,8 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); +extern int __do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); diff --git a/mm/mmap.c b/mm/mmap.c index 58e323c92c8e..1bfd12032664 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2687,8 +2687,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge */ -static int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf, bool downgrade) +int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf, bool downgrade) { unsigned long end; struct vm_area_struct *vma, *prev, *last; diff --git a/mm/mremap.c b/mm/mremap.c index a9617e72e6b7..7f9f9180e401 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -521,6 +521,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); @@ -557,12 +558,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_munmap does all the needed commit accounting + * __do_munmap does all the needed commit accounting, and + * downgrades mmap_sem to read if so directed. */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); - if (ret && old_len != new_len) + int retval; + + retval = __do_munmap(mm, addr+new_len, old_len - new_len, + &uf_unmap, true); + if (retval < 0 && old_len != new_len) { + ret = retval; goto out; + /* Returning 1 indicates mmap_sem is downgraded to read. 
*/ + } else if (retval == 1) + downgraded = true; ret = addr; goto out; } @@ -627,7 +636,10 @@ out: vm_unacct_memory(charged); locked = 0; } - up_write(¤t->mm->mmap_sem); + if (downgraded) + up_read(¤t->mm->mmap_sem); + else + up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); userfaultfd_unmap_complete(mm, &uf_unmap_early); From 9bc8039e715da3b53dbac89525323a9f2f69b7b5 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 26 Oct 2018 15:08:54 -0700 Subject: [PATCH 100/132] mm: brk: downgrade mmap_sem to read when shrinking brk might be used to shrink memory mapping too other than munmap(). So, it may hold write mmap_sem for long time when shrinking large mapping, as what commit ("mm: mmap: zap pages with read mmap_sem in munmap") described. The brk() will not manipulate vmas anymore after __do_munmap() call for the mapping shrink use case. But, it may set mm->brk after __do_munmap(), which needs hold write mmap_sem. However, a simple trick can workaround this by setting mm->brk before __do_munmap(). Then restore the original value if __do_munmap() fails. With this trick, it is safe to downgrade to read mmap_sem. So, the same optimization, which downgrades mmap_sem to read for zapping pages, is also feasible and reasonable to this case. The period of holding exclusive mmap_sem for shrinking large mapping would be reduced significantly with this optimization. [akpm@linux-foundation.org: tweak comment] [yang.shi@linux.alibaba.com: fix unsigned compare against 0 issue] Link: http://lkml.kernel.org/r/1538687672-17795-1-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1538067582-60038-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Michal Hocko Cc: Matthew Wilcox Cc: Laurent Dufour Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1bfd12032664..6c04292e16a7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -191,16 +191,19 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long retval; - unsigned long newbrk, oldbrk; + unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *next; unsigned long min_brk; bool populate; + bool downgraded = false; LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; + origbrk = mm->brk; + #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting @@ -229,14 +232,32 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; + if (oldbrk == newbrk) { + mm->brk = brk; + goto success; + } - /* Always allow shrinking brk. */ + /* + * Always allow shrinking brk. + * __do_munmap() may downgrade mmap_sem to read. + */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) - goto set_brk; - goto out; + int ret; + + /* + * mm->brk must to be protected by write mmap_sem so update it + * before downgrading mmap_sem. When __do_munmap() fails, + * mm->brk will be restored from origbrk. 
+ */ + mm->brk = brk; + ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); + if (ret < 0) { + mm->brk = origbrk; + goto out; + } else if (ret == 1) { + downgraded = true; + } + goto success; } /* Check against existing mmap mappings. */ @@ -247,18 +268,21 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Ok, looks good - let it rip. */ if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) goto out; - -set_brk: mm->brk = brk; + +success: populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; - up_write(&mm->mmap_sem); + if (downgraded) + up_read(&mm->mmap_sem); + else + up_write(&mm->mmap_sem); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: - retval = mm->brk; + retval = origbrk; up_write(&mm->mmap_sem); return retval; } From cc4b8c794f476076c9ce19f43eb4d98dc4b5e155 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 26 Oct 2018 15:08:57 -0700 Subject: [PATCH 101/132] mm: dax: add comment for PFN_SPECIAL The comment for PFN_SPECIAL is missed in pfn_t.h. Add comment to get consistent with other pfn flags. Link: http://lkml.kernel.org/r/1538086549-100536-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: Dan Williams Reviewed-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn_t.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 673546ba7342..7bb77850c65a 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -9,6 +9,8 @@ * PFN_SG_LAST - pfn references a page and is the last scatterlist entry * PFN_DEV - pfn is not covered by system memmap by default * PFN_MAP - pfn has a dynamic page mapping established by a device driver + * PFN_SPECIAL - for CONFIG_FS_DAX_LIMITED builds to allow XIP, but not + * get_user_pages */ #define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) From ff09d7ec9786be4ad7589aa987d7dc66e2dd9160 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 26 Oct 2018 15:09:01 -0700 Subject: [PATCH 102/132] mm/memory.c: recheck page table entry with page table lock held We clear the pte temporarily during read/modify/write update of the pte. If we take a page fault while the pte is cleared, the application can get SIGBUS. One such case is with remap_pfn_range without a backing vm_ops->fault callback. do_fault will return SIGBUS in that case. cpu 0 cpu1 mprotect() ptep_modify_prot_start()/pte cleared. . . page fault. . . prep_modify_prot_commit() Fix this by taking page table lock and rechecking for pte_none. [aneesh.kumar@linux.ibm.com: fix crash observed with syzkaller run] Link: http://lkml.kernel.org/r/87va6bwlfg.fsf@linux.ibm.com Link: http://lkml.kernel.org/r/20180926031858.9692-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. 
Shutemov Cc: Willem de Bruijn Cc: Eric Dumazet Cc: Ido Schimmel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 6abc74f41bc0..072139579d89 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3496,10 +3496,36 @@ static vm_fault_t do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ - if (!vma->vm_ops->fault) - ret = VM_FAULT_SIGBUS; - else if (!(vmf->flags & FAULT_FLAG_WRITE)) + /* + * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND + */ + if (!vma->vm_ops->fault) { + /* + * If we find a migration pmd entry or a none pmd entry, which + * should never happen, return SIGBUS + */ + if (unlikely(!pmd_present(*vmf->pmd))) + ret = VM_FAULT_SIGBUS; + else { + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, + vmf->pmd, + vmf->address, + &vmf->ptl); + /* + * Make sure this is not a temporary clearing of pte + * by holding ptl and checking again. A R/M/W update + * of pte involves: take ptl, clearing the pte so that + * we don't have concurrent modification by hardware + * followed by an update. + */ + if (unlikely(pte_none(*vmf->pte))) + ret = VM_FAULT_SIGBUS; + else + ret = VM_FAULT_NOPAGE; + + pte_unmap_unlock(vmf->pte, vmf->ptl); + } + } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); From f0ecf25a093fc0589f0a6bc4c1ea068bbb67d220 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 26 Oct 2018 15:09:05 -0700 Subject: [PATCH 103/132] mm/vmstat.c: assert that vmstat_text is in sync with stat_items_size Having two gigantic arrays that must manually be kept in sync, including ifdefs, isn't exactly robust. To make it easier to catch such issues in the future, add a BUILD_BUG_ON(). Link: http://lkml.kernel.org/r/20181001143138.95119-3-jannh@google.com Signed-off-by: Jann Horn Reviewed-by: Kees Cook Reviewed-by: Andrew Morton Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Christoph Lameter Cc: Kemi Wang Cc: Andy Lutomirski Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index dab53430f63c..6038ce593ce3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1665,6 +1665,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) stat_items_size += sizeof(struct vm_event_state); #endif + BUILD_BUG_ON(stat_items_size != + ARRAY_SIZE(vmstat_text) * sizeof(unsigned long)); v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) From 439de0d7443789c688428429874b8e27f693c00e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 26 Oct 2018 15:09:09 -0700 Subject: [PATCH 104/132] userfaultfd: selftest: cleanup help messages Firstly, the help in the comment region is obsolete, now we support three parameters. Since at it, change it and move it into the help message of the program. Also, the help messages dumped here and there is obsolete too. Use a single usage() helper. Link: http://lkml.kernel.org/r/20180930074259.18229-2-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Mike Rapoport Cc: Shuah Khan Cc: Mike Kravetz Cc: Jerome Glisse Cc: Zi Yan Cc: "Kirill A . Shutemov" Cc: Shaohua Li Cc: Andrea Arcangeli Cc: "Dr . 
David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 46 ++++++++++++++---------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7b8171e3128a..5ff3a4f9173e 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -34,18 +34,6 @@ * per-CPU threads 1 by triggering userfaults inside * pthread_mutex_lock will also verify the atomicity of the memory * transfer (UFFDIO_COPY). - * - * The program takes two parameters: the amounts of physical memory in - * megabytes (MiB) of the area and the number of bounces to execute. - * - * # 100MiB 99999 bounces - * ./userfaultfd 100 99999 - * - * # 1GiB 99 bounces - * ./userfaultfd 1000 99 - * - * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers - * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done */ #define _GNU_SOURCE @@ -115,6 +103,30 @@ pthread_attr_t attr; ~(unsigned long)(sizeof(unsigned long long) \ - 1))) +const char *examples = + "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" + "./userfaultfd anon 100 99999\n\n" + "# Run share memory test on 1GiB region with 99 bounces:\n" + "./userfaultfd shmem 1000 99\n\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n" + "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n" + "# Run the same hugetlb test but using shmem:\n" + "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n" + "# 10MiB-~6GiB 999 bounces anonymous test, " + "continue forever unless an error triggers\n" + "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./userfaultfd " + "[hugetlbfs_file]\n\n"); + fprintf(stderr, "Supported : anon, hugetlb, " + "hugetlb_shared, shmem\n\n"); + fprintf(stderr, "Examples:\n\n"); + fprintf(stderr, examples); + exit(1); +} + static int anon_release_pages(char *rel_area) { int ret = 0; @@ -1272,8 +1284,7 @@ static void sigalrm(int sig) int main(int argc, char **argv) { if (argc < 4) - fprintf(stderr, "Usage: [hugetlbfs_file]\n"), - exit(1); + usage(); if (signal(SIGALRM, sigalrm) == SIG_ERR) fprintf(stderr, "failed to arm SIGALRM"), exit(1); @@ -1286,20 +1297,19 @@ int main(int argc, char **argv) nr_cpus; if (!nr_pages_per_cpu) { fprintf(stderr, "invalid MiB\n"); - fprintf(stderr, "Usage: \n"), exit(1); + usage(); } bounces = atoi(argv[3]); if (bounces <= 0) { fprintf(stderr, "invalid bounces\n"); - fprintf(stderr, "Usage: \n"), exit(1); + usage(); } nr_pages = nr_pages_per_cpu * nr_cpus; if (test_type == TEST_HUGETLB) { if (argc < 5) - fprintf(stderr, "Usage: hugetlb \n"), - exit(1); + usage(); huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); if (huge_fd < 0) { fprintf(stderr, "Open of %s failed", argv[3]); From 04d877319e2aa3895cc7998adb32de0967d3927b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 26 Oct 2018 15:09:13 -0700 Subject: [PATCH 105/132] userfaultfd: selftest: generalize read and poll We do very similar things in read and poll modes, but we're copying the codes around. Share the codes properly on reading the message and handling the page fault to make the code cleaner. 
Meanwhile this solves previous mismatch of behaviors between the two modes on that the old code: - did not check EAGAIN case in read() mode - ignored BOUNCE_VERIFY check in read() mode Link: http://lkml.kernel.org/r/20180930074259.18229-3-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Mike Rapoport Cc: Shuah Khan Cc: Mike Kravetz Cc: Jerome Glisse Cc: Zi Yan Cc: "Kirill A . Shutemov" Cc: Shaohua Li Cc: Andrea Arcangeli Cc: "Dr . David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 77 +++++++++++++----------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 5ff3a4f9173e..7a8c6937cc67 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -451,6 +451,43 @@ static int copy_page(int ufd, unsigned long offset) return __copy_page(ufd, offset, false); } +static int uffd_read_msg(int ufd, struct uffd_msg *msg) +{ + int ret = read(uffd, msg, sizeof(*msg)); + + if (ret != sizeof(*msg)) { + if (ret < 0) { + if (errno == EAGAIN) + return 1; + else + perror("blocking read error"), exit(1); + } else { + fprintf(stderr, "short read\n"), exit(1); + } + } + + return 0; +} + +/* Return 1 if page fault handled by us; otherwise 0 */ +static int uffd_handle_page_fault(struct uffd_msg *msg) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + fprintf(stderr, "unexpected msg event %u\n", + msg->event), exit(1); + + if (bounces & BOUNCE_VERIFY && + msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + return copy_page(uffd, offset); +} + static void *uffd_poll_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -458,7 +495,6 @@ static void *uffd_poll_thread(void *arg) struct uffd_msg msg; struct uffdio_register uffd_reg; int ret; - unsigned long offset; char tmp_chr; unsigned long userfaults = 0; @@ -482,25 +518,15 @@ static void *uffd_poll_thread(void *arg) if (!(pollfd[0].revents & POLLIN)) fprintf(stderr, "pollfd[0].revents %d\n", pollfd[0].revents), exit(1); - ret = read(uffd, &msg, sizeof(msg)); - if (ret < 0) { - if (errno == EAGAIN) - continue; - perror("nonblocking read error"), exit(1); - } + if (uffd_read_msg(uffd, &msg)) + continue; switch (msg.event) { default: fprintf(stderr, "unexpected msg event %u\n", msg.event), exit(1); break; case UFFD_EVENT_PAGEFAULT: - if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - fprintf(stderr, "unexpected write fault\n"), exit(1); - offset = (char *)(unsigned long)msg.arg.pagefault.address - - area_dst; - offset &= ~(page_size-1); - if (copy_page(uffd, offset)) - userfaults++; + userfaults += uffd_handle_page_fault(&msg); break; case UFFD_EVENT_FORK: close(uffd); @@ -528,8 +554,6 @@ static void *uffd_read_thread(void *arg) { unsigned long *this_cpu_userfaults; struct uffd_msg msg; - unsigned long offset; - int ret; this_cpu_userfaults = (unsigned long *) arg; *this_cpu_userfaults = 0; @@ -538,24 +562,9 @@ static void *uffd_read_thread(void *arg) /* from here cancellation is ok */ for (;;) { - ret = read(uffd, &msg, sizeof(msg)); - if (ret != sizeof(msg)) { - if (ret < 0) - perror("blocking read error"), exit(1); - else - fprintf(stderr, "short read\n"), exit(1); - } - if (msg.event != UFFD_EVENT_PAGEFAULT) - fprintf(stderr, "unexpected msg 
event %u\n", - msg.event), exit(1); - if (bounces & BOUNCE_VERIFY && - msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - fprintf(stderr, "unexpected write fault\n"), exit(1); - offset = (char *)(unsigned long)msg.arg.pagefault.address - - area_dst; - offset &= ~(page_size-1); - if (copy_page(uffd, offset)) - (*this_cpu_userfaults)++; + if (uffd_read_msg(uffd, &msg)) + continue; + (*this_cpu_userfaults) += uffd_handle_page_fault(&msg); } return (void *)NULL; } From 7eaa8c969efa77127de9a05856eef9e5d22cf487 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 26 Oct 2018 15:09:17 -0700 Subject: [PATCH 106/132] userfaultfd: selftest: recycle lock threads first Now we recycle the uffd servicing threads earlier than the lock threads. It might happen that when the lock thread is still blocked at a pthread mutex lock while the servicing thread has already quitted for the cpu so the lock thread will be blocked forever and hang the test program. To fix the possible race, recycle the lock threads first. This never happens with current missing-only tests, but when I start to run the write-protection tests (the feature is not yet posted upstream) it happens every time of the run possibly because in that new test we'll need to service two page faults for each lock operation. Link: http://lkml.kernel.org/r/20180930074259.18229-4-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Mike Rapoport Cc: Shuah Khan Cc: Mike Kravetz Cc: Jerome Glisse Cc: Zi Yan Cc: "Kirill A . Shutemov" Cc: Shaohua Li Cc: Andrea Arcangeli Cc: "Dr . David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7a8c6937cc67..5d1db824f73a 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -626,6 +626,12 @@ static int stress(unsigned long *userfaults) if (uffd_test_ops->release_pages(area_src)) return 1; + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + for (cpu = 0; cpu < nr_cpus; cpu++) { char c; if (bounces & BOUNCE_POLL) { @@ -643,11 +649,6 @@ static int stress(unsigned long *userfaults) } } - finished = 1; - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(locking_threads[cpu], NULL)) - return 1; - return 0; } From 61855f021c3ae2ccc244111e2a321690f9786aed Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 26 Oct 2018 15:09:20 -0700 Subject: [PATCH 107/132] mm/zsmalloc.c: fix fall-through annotation Replace "fallthru" with a proper "fall through" annotation. This fix is part of the ongoing efforts to enabling -Wimplicit-fallthrough Link: http://lkml.kernel.org/r/20181003105114.GA24423@embeddedor.com Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9da65552e7ca..0787d33b80d8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -418,7 +418,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle, case ZPOOL_MM_WO: zs_mm = ZS_MM_WO; break; - case ZPOOL_MM_RW: /* fallthru */ + case ZPOOL_MM_RW: /* fall through */ default: zs_mm = ZS_MM_RW; break; From 4a222127f3631e6cdde228e3bb6b599f52b96d14 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 26 Oct 2018 15:09:24 -0700 Subject: [PATCH 108/132] mm/page_alloc.c: initialize num_movable in move_freepages() If move_freepages_block() returns 0 because !zone_spans_pfn(), *num_movable can hold the value from the stack because it does not get initialized in move_freepages(). Move the initialization to move_freepages_block() to guarantee the value actually makes sense. This currently doesn't affect its only caller where num_movable != NULL, so no bug fix, but just more robust. Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1810051355490.212229@chino.kir.corp.google.com Signed-off-by: David Rientjes Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d73ff2188d72..bdb7eb25acf8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2021,10 +2021,6 @@ static int move_freepages(struct zone *zone, pfn_valid(page_to_pfn(end_page)) && page_zone(start_page) != page_zone(end_page)); #endif - - if (num_movable) - *num_movable = 0; - for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -2064,6 +2060,9 @@ int move_freepages_block(struct zone *zone, struct page *page, unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; + if (num_movable) + *num_movable = 0; + start_pfn = page_to_pfn(page); start_pfn = start_pfn & ~(pageblock_nr_pages-1); start_page = pfn_to_page(start_pfn); From 1c2d479a119b84feacbe4de782016f1bf1ad16dc Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 26 Oct 2018 15:09:28 -0700 Subject: [PATCH 109/132] mm/memcontrol.c: convert mem_cgroup_id::ref to refcount_t type This will allow to use generic refcount_t interfaces to check counters overflow instead of currently existing VM_BUG_ON(). The only difference after the patch is VM_BUG_ON() may cause BUG(), while refcount_t fires with WARN(). But this seems not to be significant here, since such the problems are usually caught by syzbot with panic-on-warn enabled. 
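For illustration, the get side of the conversion (condensed from the diff below) is simply:

	/* before: open-coded atomic_t plus VM_BUG_ON() on misuse */
	VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
	atomic_add(n, &memcg->id.ref);

	/* after: refcount_t saturates and WARN()s on overflow instead of BUG()ing */
	refcount_add(n, &memcg->id.ref);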
Link: http://lkml.kernel.org/r/153910718919.7006.13400779039257185427.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Andrea Parri Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- mm/memcontrol.c | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4399cc3f00e4..7ab2120155a4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -78,7 +78,7 @@ struct mem_cgroup_reclaim_cookie { struct mem_cgroup_id { int id; - atomic_t ref; + refcount_t ref; }; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 645ede7ad1b2..92d38c88250f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4299,14 +4299,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg) static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); - atomic_add(n, &memcg->id.ref); + refcount_add(n, &memcg->id.ref); } static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) < n); - if (atomic_sub_and_test(n, &memcg->id.ref)) { + if (refcount_sub_and_test(n, &memcg->id.ref)) { mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ @@ -4523,7 +4521,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) } /* Online state pins memcg ID, memcg ID pins CSS */ - atomic_set(&memcg->id.ref, 1); + refcount_set(&memcg->id.ref, 1); css_get(css); return 0; } @@ -6357,7 +6355,7 @@ subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { - while (!atomic_inc_not_zero(&memcg->id.ref)) { + while (!refcount_inc_not_zero(&memcg->id.ref)) { /* * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. From dfb3ccd00a06d71171961022019bb0f210d2cdc1 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 26 Oct 2018 15:09:32 -0700 Subject: [PATCH 110/132] mm: make memmap_init a proper function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memmap_init is sometimes a macro sometimes a function based on __HAVE_ARCH_MEMMAP_INIT. It is only a function on ia64. Make memmap_init a weak function instead, and let ia64 redefine it. Link: http://lkml.kernel.org/r/20180724235520.10200-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Andrew Morton Reviewed-by: Oscar Salvador Cc: Steven Sistare Cc: Daniel Jordan Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Dan Williams Cc: Jan Kara Cc: Jérôme Glisse Cc: Souptick Joarder Cc: Baoquan He Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Cc: Wei Yang Cc: Dave Hansen Cc: David Rientjes Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Abdul Haleem Cc: Michael Ellerman Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/pgtable.h | 1 - mm/page_alloc.c | 9 +++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 165827774bea..b1e7468eb65a 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -544,7 +544,6 @@ extern struct page *zero_page_memmap_ptr; # ifdef CONFIG_VIRTUAL_MEM_MAP /* arch mem_map init routine is needed due to holes in a virtual mem_map */ -# define __HAVE_ARCH_MEMMAP_INIT extern void memmap_init (unsigned long size, int nid, unsigned long zone, unsigned long start_pfn); # endif /* CONFIG_VIRTUAL_MEM_MAP */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdb7eb25acf8..94725aea672f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5631,10 +5631,11 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) -#endif +void __meminit __weak memmap_init(unsigned long size, int nid, + unsigned long zone, unsigned long start_pfn) +{ + memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); +} static int zone_batchsize(struct zone *zone) { From d3035be4ce2345d98633a45f93a74e526e94b802 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 26 Oct 2018 15:09:37 -0700 Subject: [PATCH 111/132] mm: calculate deferred pages after skipping mirrored memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update_defer_init() should be called only when struct page is about to be initialized. Because it counts number of initialized struct pages, but there we may skip struct pages if there is some mirrored memory. So move, update_defer_init() after checking for mirrored memory. Also, rename update_defer_init() to defer_init() and reverse the return boolean to emphasize that this is a boolean function, that tells that the reset of memmap initialization should be deferred. Make this function self-contained: do not pass number of already initialized pages in this zone by using static counters. I found this bug by reading the code. The effect is that fewer than expected struct pages are initialized early in boot, and it is possible that in some corner cases we may fail to boot when mirrored pages are used. The deferred on demand code should somewhat mitigate this. But this still brings some inconsistencies compared to when booting without mirrored pages, so it is better to fix. [pasha.tatashin@oracle.com: add comment about defer_init's lack of locking] Link: http://lkml.kernel.org/r/20180726193509.3326-3-pasha.tatashin@oracle.com [akpm@linux-foundation.org: make defer_init non-inline, __meminit] Link: http://lkml.kernel.org/r/20180724235520.10200-3-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Oscar Salvador Cc: Abdul Haleem Cc: Baoquan He Cc: Daniel Jordan Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jan Kara Cc: Jérôme Glisse Cc: Kirill A. 
Shutemov Cc: Michael Ellerman Cc: Michal Hocko Cc: Souptick Joarder Cc: Steven Sistare Cc: Vlastimil Babka Cc: Wei Yang Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94725aea672f..db1ff4ac0cc6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -307,24 +307,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn) } /* - * Returns false when the remaining initialisation should be deferred until + * Returns true when the remaining initialisation should be deferred until * later in the boot cycle when it can be parallelised. */ -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static bool __meminit +defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { - /* Always populate low zones for address-constrained allocations */ - if (zone_end < pgdat_end_pfn(pgdat)) - return true; - (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_pgcnt) && - (pfn & (PAGES_PER_SECTION - 1)) == 0) { - pgdat->first_deferred_pfn = pfn; - return false; + static unsigned long prev_end_pfn, nr_initialised; + + /* + * prev_end_pfn static that contains the end of previous zone + * No need to protect because called very early in boot before smp_init. + */ + if (prev_end_pfn != end_pfn) { + prev_end_pfn = end_pfn; + nr_initialised = 0; } - return true; + /* Always populate low zones for address-constrained allocations */ + if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) + return false; + nr_initialised++; + if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + NODE_DATA(nid)->first_deferred_pfn = pfn; + return true; + } + return false; } #else static inline bool early_page_uninitialised(unsigned long pfn) @@ -332,11 +341,9 @@ static inline bool early_page_uninitialised(unsigned long pfn) return false; } -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { - return true; + return false; } #endif @@ -5453,9 +5460,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, struct vmem_altmap *altmap) { unsigned long end_pfn = start_pfn + size; - pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; - unsigned long nr_initialised = 0; struct page *page; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP struct memblock_region *r = NULL, *tmp; @@ -5494,8 +5499,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; if (!early_pfn_in_nid(pfn, nid)) continue; - if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) - break; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* @@ -5518,6 +5521,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, } } #endif + if (defer_init(nid, pfn, end_pfn)) + break; not_early: page = pfn_to_page(pfn); From a9a9e77fbf2784ae2694696eb772419c0a96159e Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 26 Oct 2018 15:09:40 -0700 Subject: [PATCH 112/132] mm: move mirrored memory specific code outside of memmap_init_zone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memmap_init_zone, is getting complex, because it is called from 
different contexts: hotplug, and during boot, and also because it must handle some architecture quirks. One of them is mirrored memory. Move the code that decides whether to skip mirrored memory outside of memmap_init_zone, into a separate function. [pasha.tatashin@oracle.com: uninline overlap_memmap_init()] Link: http://lkml.kernel.org/r/20180726193509.3326-4-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20180724235520.10200-4-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Oscar Salvador Cc: Pasha Tatashin Cc: Abdul Haleem Cc: Baoquan He Cc: Daniel Jordan Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jan Kara Cc: Jérôme Glisse Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Michal Hocko Cc: Souptick Joarder Cc: Steven Sistare Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 71 +++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index db1ff4ac0cc6..c26d3152f9ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5450,6 +5450,30 @@ void __ref build_all_zonelists(pg_data_t *pgdat) #endif } +/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ +static bool __meminit +overlap_memmap_init(unsigned long zone, unsigned long *pfn) +{ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + static struct memblock_region *r; + + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, r) { + if (*pfn < memblock_region_memory_end_pfn(r)) + break; + } + } + if (*pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + *pfn = memblock_region_memory_end_pfn(r); + return true; + } + } +#endif + return false; +} + /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -5459,12 +5483,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context, struct vmem_altmap *altmap) { - unsigned long end_pfn = start_pfn + size; - unsigned long pfn; + unsigned long pfn, end_pfn = start_pfn + size; struct page *page; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - struct memblock_region *r = NULL, *tmp; -#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; @@ -5492,39 +5512,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context != MEMMAP_EARLY) - goto not_early; - - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; - -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Check given memblock attribute by firmware which can affect - * kernel memory layout. If zone==ZONE_MOVABLE but memory is - * mirrored, it's an overlapped memmap init. skip it. 
- */ - if (mirrored_kernelcore && zone == ZONE_MOVABLE) { - if (!r || pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, tmp) - if (pfn < memblock_region_memory_end_pfn(tmp)) - break; - r = tmp; - } - if (pfn >= memblock_region_memory_base_pfn(r) && - memblock_is_mirror(r)) { - /* already initialized as NORMAL */ - pfn = memblock_region_memory_end_pfn(r); + if (context == MEMMAP_EARLY) { + if (!early_pfn_valid(pfn)) continue; - } + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (overlap_memmap_init(zone, &pfn)) + continue; + if (defer_init(nid, pfn, end_pfn)) + break; } -#endif - if (defer_init(nid, pfn, end_pfn)) - break; -not_early: page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); if (context == MEMMAP_HOTPLUG) @@ -5541,9 +5539,6 @@ not_early: * can be created for invalid pages (for alignment) * check here not to call set_pageblock_migratetype() against * pfn out of zone. - * - * Please note that MEMMAP_HOTPLUG path doesn't clear memmap - * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); From 64081362e8ff4587b4554087f3cfc73d3e0a4cd7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 26 Oct 2018 15:09:45 -0700 Subject: [PATCH 113/132] mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock We've recently seen a workload on XFS filesystems with a repeatable deadlock between background writeback and a multi-process application doing concurrent writes and fsyncs to a small range of a file. range_cyclic writeback Process 1 Process 2 xfs_vm_writepages write_cache_pages writeback_index = 2 cycled = 0 .... find page 2 dirty lock Page 2 ->writepage page 2 writeback page 2 clean page 2 added to bio no more pages write() locks page 1 dirties page 1 locks page 2 dirties page 1 fsync() .... xfs_vm_writepages write_cache_pages start index 0 find page 1 towrite lock Page 1 ->writepage page 1 writeback page 1 clean page 1 added to bio find page 2 towrite lock Page 2 page 2 is writeback write() locks page 1 dirties page 1 fsync() .... xfs_vm_writepages write_cache_pages start index 0 !done && !cycled sets index to 0, restarts lookup find page 1 dirty find page 1 towrite lock Page 1 page 1 is writeback lock Page 1 DEADLOCK because: - process 1 needs page 2 writeback to complete to make enough progress to issue IO pending for page 1 - writeback needs page 1 writeback to complete so process 2 can progress and unlock the page it is blocked on, then it can issue the IO pending for page 2 - process 2 can't make progress until process 1 issues IO for page 1 The underlying cause of the problem here is that range_cyclic writeback is processing pages in descending index order as we hold higher index pages in a structure controlled from above write_cache_pages(). The write_cache_pages() caller needs to be able to submit these pages for IO before write_cache_pages restarts writeback at mapping index 0 to avoid wcp inverting the page lock/writeback wait order. generic_writepages() is not susceptible to this bug as it has no private context held across write_cache_pages() - filesystems using this infrastructure always submit pages in ->writepage immediately and so there is no problem with range_cyclic going back to mapping index 0. 
However: mpage_writepages() has a private bio context, exofs_writepages() has page_collect fuse_writepages() has fuse_fill_wb_data nfs_writepages() has nfs_pageio_descriptor xfs_vm_writepages() has xfs_writepage_ctx All of these ->writepages implementations can hold pages under writeback in their private structures until write_cache_pages() returns, and hence they are all susceptible to this deadlock. Also worth noting is that ext4 has it's own bastardised version of write_cache_pages() and so it /may/ have an equivalent deadlock. I looked at the code long enough to understand that it has a similar retry loop for range_cyclic writeback reaching the end of the file and then promptly ran away before my eyes bled too much. I'll leave it for the ext4 developers to determine if their code is actually has this deadlock and how to fix it if it has. There's a few ways I can see avoid this deadlock. There's probably more, but these are the first I've though of: 1. get rid of range_cyclic altogether 2. range_cyclic always stops at EOF, and we start again from writeback index 0 on the next call into write_cache_pages() 2a. wcp also returns EAGAIN to ->writepages implementations to indicate range cyclic has hit EOF. writepages implementations can then flush the current context and call wpc again to continue. i.e. lift the retry into the ->writepages implementation 3. range_cyclic uses trylock_page() rather than lock_page(), and it skips pages it can't lock without blocking. It will already do this for pages under writeback, so this seems like a no-brainer 3a. all non-WB_SYNC_ALL writeback uses trylock_page() to avoid blocking as per pages under writeback. I don't think #1 is an option - range_cyclic prevents frequently dirtied lower file offset from starving background writeback of rarely touched higher file offsets. #2 is simple, and I don't think it will have any impact on performance as going back to the start of the file implies an immediate seek. We'll have exactly the same number of seeks if we switch writeback to another inode, and then come back to this one later and restart from index 0. #2a is pretty much "status quo without the deadlock". Moving the retry loop up into the wcp caller means we can issue IO on the pending pages before calling wcp again, and so avoid locking or waiting on pages in the wrong order. I'm not convinced we need to do this given that we get the same thing from #2 on the next writeback call from the writeback infrastructure. #3 is really just a band-aid - it doesn't fix the access/wait inversion problem, just prevents it from becoming a deadlock situation. I'd prefer we fix the inversion, not sweep it under the carpet like this. #3a is really an optimisation that just so happens to include the band-aid fix of #3. So it seems that the simplest way to fix this issue is to implement solution #2 Link: http://lkml.kernel.org/r/20181005054526.21507-1-david@fromorbit.com Signed-off-by: Dave Chinner Reviewed-by: Jan Kara Cc: Nicholas Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 84ae9bf5858a..439a304a6c92 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2149,6 +2149,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback); * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). 
The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). + * + * To avoid deadlocks between range_cyclic writeback and callers that hold + * pages in PageWriteback to aggregate IO until write_cache_pages() returns, + * we do not loop back to the start of the file. Doing so causes a page + * lock/page writeback access order inversion - we should only ever lock + * multiple pages in ascending page->index order, and looping back to the start + * of the file violates that rule and causes deadlocks. */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -2162,7 +2169,6 @@ int write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int cycled; int range_whole = 0; int tag; @@ -2170,23 +2176,17 @@ int write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; -retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; @@ -2272,17 +2272,14 @@ continue_unlock: pagevec_release(&pvec); cond_resched(); } - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; - goto retry; - } + + /* + * If we hit the last page and there is more work to be done: wrap + * back the index back to the start of the file for the next + * time we are called. + */ + if (wbc->range_cyclic && !done) + done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; From 7a1adfddaf0d11a39fdcaf6e82a88e9c0586e08b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Oct 2018 15:09:48 -0700 Subject: [PATCH 114/132] mm: don't raise MEMCG_OOM event due to failed high-order allocation It was reported that on some of our machines containers were restarted with OOM symptoms without an obvious reason. Despite there were almost no memory pressure and plenty of page cache, MEMCG_OOM event was raised occasionally, causing the container management software to think, that OOM has happened. However, no tasks have been killed. The following investigation showed that the problem is caused by a failing attempt to charge a high-order page. In such case, the OOM killer is never invoked. As shown below, it can happen under conditions, which are very far from a real OOM: e.g. there is plenty of clean page cache and no memory pressure. There is no sense in raising an OOM event in this case, as it might confuse a user and lead to wrong and excessive actions (e.g. restart the workload, as in my case). Let's look at the charging path in try_charge(). If the memory usage is about memory.max, which is absolutely natural for most memory cgroups, we try to reclaim some pages. 
Even if we were able to reclaim enough memory for the allocation, the following check can fail due to a race with another concurrent allocation: if (mem_cgroup_margin(mem_over_limit) >= nr_pages) goto retry; For regular pages the following condition will save us from triggering the OOM: if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) goto retry; But for high-order allocation this condition will intentionally fail. The reason behind is that we'll likely fall to regular pages anyway, so it's ok and even preferred to return ENOMEM. In this case the idea of raising MEMCG_OOM looks dubious. Fix this by moving MEMCG_OOM raising to mem_cgroup_oom() after allocation order check, so that the event won't be raised for high order allocations. This change doesn't affect regular pages allocation and charging. Link: http://lkml.kernel.org/r/20181004214050.7417-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: David Rientjes Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 4 ++++ mm/memcontrol.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8389d6f72a77..8384c681a4b2 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1133,6 +1133,10 @@ PAGE_SIZE multiple when read back. disk readahead. For now OOM in memory cgroup kills tasks iff shortage has happened inside page fault. + This event is not raised if the OOM killer is not + considered as an option, e.g. for failed high-order + allocations. + oom_kill The number of processes belonging to this cgroup killed by any kind of OOM killer. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92d38c88250f..10a9b554d69f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1669,6 +1669,8 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int if (order > PAGE_ALLOC_COSTLY_ORDER) return OOM_SKIPPED; + memcg_memory_event(memcg, MEMCG_OOM); + /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack @@ -2250,8 +2252,6 @@ retry: if (fatal_signal_pending(current)) goto force; - memcg_memory_event(mem_over_limit, MEMCG_OOM); - /* * keep retrying as long as the memcg oom killer is able to make * a forward progress or bypass the charge if the oom killer From 26db3d09d9e1963c9db77cb275bd2d36e56ef57a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 26 Oct 2018 15:09:52 -0700 Subject: [PATCH 115/132] mm/gup_benchmark.c: time put_page() We'd like to measure time to unpin user pages, so this adds a second benchmark timer on put_page, separate from get_page. Adding the field breaks this ioctl ABI, but should be okay since this an in-tree kernel selftest. [akpm@linux-foundation.org: add expansion to struct gup_benchmark for future use] Link: http://lkml.kernel.org/r/20181010195605.10689-1-keith.busch@intel.com Signed-off-by: Keith Busch Acked-by: Kirill A. 
Shutemov Reviewed-by: Andrew Morton Cc: Dave Hansen Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup_benchmark.c | 9 +++++++-- tools/testing/selftests/vm/gup_benchmark.c | 6 ++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 7405c9d89d65..e6aff102373b 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -8,11 +8,13 @@ #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) struct gup_benchmark { - __u64 delta_usec; + __u64 get_delta_usec; + __u64 put_delta_usec; __u64 addr; __u64 size; __u32 nr_pages_per_call; __u32 flags; + __u64 expansion[10]; /* For future use */ }; static int __gup_benchmark_ioctl(unsigned int cmd, @@ -48,14 +50,17 @@ static int __gup_benchmark_ioctl(unsigned int cmd, } end_time = ktime_get(); - gup->delta_usec = ktime_us_delta(end_time, start_time); + gup->get_delta_usec = ktime_us_delta(end_time, start_time); gup->size = addr - gup->addr; + start_time = ktime_get(); for (i = 0; i < nr_pages; i++) { if (!pages[i]) break; put_page(pages[i]); } + end_time = ktime_get(); + gup->put_delta_usec = ktime_us_delta(end_time, start_time); kvfree(pages); return 0; diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index 36df55132036..bdcb97acd0ac 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -17,7 +17,8 @@ #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) struct gup_benchmark { - __u64 delta_usec; + __u64 get_delta_usec; + __u64 put_delta_usec; __u64 addr; __u64 size; __u32 nr_pages_per_call; @@ -81,7 +82,8 @@ int main(int argc, char **argv) if (ioctl(fd, GUP_FAST_BENCHMARK, &gup)) perror("ioctl"), exit(1); - printf("Time: %lld us", gup.delta_usec); + printf("Time: get:%lld put:%lld us", gup.get_delta_usec, + gup.put_delta_usec); if (gup.size != size) printf(", truncated (size: %lld)", gup.size); printf("\n"); From 714a3a1ebafe6e23af55a5c694c308f4044a7e00 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 26 Oct 2018 15:09:56 -0700 Subject: [PATCH 116/132] mm/gup_benchmark.c: add additional pinning methods Provide new gup benchmark ioctl commands to run different user page pinning methods, get_user_pages_longterm() and get_user_pages(), in addition to the existing get_user_pages_fast(). Link: http://lkml.kernel.org/r/20181010195605.10689-2-keith.busch@intel.com Signed-off-by: Keith Busch Acked-by: Kirill A. 
Shutemov Reviewed-by: Andrew Morton Cc: Dave Hansen Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup_benchmark.c | 28 ++++++++++++++++++++-- tools/testing/selftests/vm/gup_benchmark.c | 13 ++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index e6aff102373b..debf11388a60 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -6,6 +6,8 @@ #include #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) +#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) +#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) struct gup_benchmark { __u64 get_delta_usec; @@ -43,7 +45,23 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } - nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); + switch (cmd) { + case GUP_FAST_BENCHMARK: + nr = get_user_pages_fast(addr, nr, gup->flags & 1, + pages + i); + break; + case GUP_LONGTERM_BENCHMARK: + nr = get_user_pages_longterm(addr, nr, gup->flags & 1, + pages + i, NULL); + break; + case GUP_BENCHMARK: + nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, + NULL); + break; + default: + return -1; + } + if (nr <= 0) break; i += nr; @@ -72,8 +90,14 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, struct gup_benchmark gup; int ret; - if (cmd != GUP_FAST_BENCHMARK) + switch (cmd) { + case GUP_FAST_BENCHMARK: + case GUP_LONGTERM_BENCHMARK: + case GUP_BENCHMARK: + break; + default: return -EINVAL; + } if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) return -EFAULT; diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index bdcb97acd0ac..c2f785ded9b9 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -15,6 +15,8 @@ #define PAGE_SIZE sysconf(_SC_PAGESIZE) #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) +#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) +#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) struct gup_benchmark { __u64 get_delta_usec; @@ -30,9 +32,10 @@ int main(int argc, char **argv) struct gup_benchmark gup; unsigned long size = 128 * MB; int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + int cmd = GUP_FAST_BENCHMARK; char *p; - while ((opt = getopt(argc, argv, "m:r:n:tT")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:tTLU")) != -1) { switch (opt) { case 'm': size = atoi(optarg) * MB; @@ -49,6 +52,12 @@ int main(int argc, char **argv) case 'T': thp = 0; break; + case 'L': + cmd = GUP_LONGTERM_BENCHMARK; + break; + case 'U': + cmd = GUP_BENCHMARK; + break; case 'w': write = 1; default: @@ -79,7 +88,7 @@ int main(int argc, char **argv) for (i = 0; i < repeats; i++) { gup.size = size; - if (ioctl(fd, GUP_FAST_BENCHMARK, &gup)) + if (ioctl(fd, cmd, &gup)) perror("ioctl"), exit(1); printf("Time: get:%lld put:%lld us", gup.get_delta_usec, From 319e0bec1aecb36c5ac6d23812af487ff2c8f47f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 26 Oct 2018 15:09:59 -0700 Subject: [PATCH 117/132] tools/testing/selftests/vm/gup_benchmark.c: fix 'write' flag usage If the '-w' parameter was provided, the benchmark would exit due to a mssing 'break'. Link: http://lkml.kernel.org/r/20181010195605.10689-3-keith.busch@intel.com Signed-off-by: Keith Busch Acked-by: Kirill A. 
Shutemov Reviewed-by: Andrew Morton Cc: Dave Hansen Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/gup_benchmark.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index c2f785ded9b9..b2082df8beb4 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -60,6 +60,7 @@ int main(int argc, char **argv) break; case 'w': write = 1; + break; default: return -1; } From aeb85ed4f41a8c0f5c4606d69f5da75e2348d984 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 26 Oct 2018 15:10:02 -0700 Subject: [PATCH 118/132] tools/testing/selftests/vm/gup_benchmark.c: allow user specified file Allow a user to specify a file to map by adding a new option, '-f', providing a means to test various file backings. If not specified, the benchmark will use a private mapping of /dev/zero, which produces an anonymous mapping as before. [akpm@linux-foundation.org: avoid using comma operator] Link: http://lkml.kernel.org/r/20181010195605.10689-4-keith.busch@intel.com Signed-off-by: Keith Busch Reviewed-by: Andrew Morton Cc: Kirill Shutemov Cc: Dave Hansen Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/gup_benchmark.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index b2082df8beb4..43a2d60e0e93 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -31,11 +31,12 @@ int main(int argc, char **argv) { struct gup_benchmark gup; unsigned long size = 128 * MB; - int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; int cmd = GUP_FAST_BENCHMARK; + char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:tTLU")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:f:tTLU")) != -1) { switch (opt) { case 'm': size = atoi(optarg) * MB; @@ -61,11 +62,20 @@ int main(int argc, char **argv) case 'w': write = 1; break; + case 'f': + file = optarg; + break; default: return -1; } } + filed = open(file, O_RDWR|O_CREAT); + if (filed < 0) { + perror("open"); + exit(filed); + } + gup.nr_pages_per_call = nr_pages; gup.flags = write; @@ -73,8 +83,7 @@ int main(int argc, char **argv) if (fd == -1) perror("open"), exit(1); - p = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, filed, 0); if (p == MAP_FAILED) perror("mmap"), exit(1); gup.addr = (unsigned long)p; From 0dd8666afb99c71d124e0c2abf1ad7d934a242a0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 26 Oct 2018 15:10:08 -0700 Subject: [PATCH 119/132] tools/testing/selftests/vm/gup_benchmark.c: add MAP_SHARED option Add a new benchmark option, -S, to request MAP_SHARED. This can be used to compare with MAP_PRIVATE, or for files that require this option, like dax. 
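
As an illustrative aside (not part of the patch): the effect of -S is simply which flag ends up in the mmap() call, MAP_PRIVATE by default or MAP_SHARED when requested. A minimal, self-contained userspace sketch of that mapping step follows; /dev/zero, the 128MB size and the "any argument means shared" convention are arbitrary example choices here, not the selftest's real option handling.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	size_t size = 128UL << 20;			/* example size only */
	long page = sysconf(_SC_PAGESIZE);
	int flags = argc > 1 ? MAP_SHARED : MAP_PRIVATE;	/* "-S" analogue */
	int fd = open("/dev/zero", O_RDWR);
	char *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	for (size_t off = 0; off < size; off += page)
		p[off] = 0;				/* fault the pages in */
	printf("%s mapping populated\n",
	       flags == MAP_SHARED ? "MAP_SHARED" : "MAP_PRIVATE");
	munmap(p, size);
	close(fd);
	return 0;
}

In the selftest itself the same choice is driven by getopt, and -f additionally replaces /dev/zero with a user-supplied file (for example a DAX file, which the changelog notes may require MAP_SHARED).
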
Link: http://lkml.kernel.org/r/20181010195605.10689-5-keith.busch@intel.com Signed-off-by: Keith Busch Reviewed-by: Andrew Morton Cc: Kirill Shutemov Cc: Dave Hansen Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/gup_benchmark.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index 43a2d60e0e93..5c8e4cb1441a 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -32,11 +32,11 @@ int main(int argc, char **argv) struct gup_benchmark gup; unsigned long size = 128 * MB; int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; - int cmd = GUP_FAST_BENCHMARK; + int cmd = GUP_FAST_BENCHMARK, flags = MAP_PRIVATE; char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:f:tTLU")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:f:tTLUS")) != -1) { switch (opt) { case 'm': size = atoi(optarg) * MB; @@ -65,6 +65,10 @@ int main(int argc, char **argv) case 'f': file = optarg; break; + case 'S': + flags &= ~MAP_PRIVATE; + flags |= MAP_SHARED; + break; default: return -1; } @@ -83,7 +87,7 @@ int main(int argc, char **argv) if (fd == -1) perror("open"), exit(1); - p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, filed, 0); + p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); if (p == MAP_FAILED) perror("mmap"), exit(1); gup.addr = (unsigned long)p; From 3821b76c3cdb5f2c5ef1b082de79829e8ff50a7d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 26 Oct 2018 15:10:12 -0700 Subject: [PATCH 120/132] tools/testing/selftests/vm/gup_benchmark.c: add MAP_HUGETLB option Add a new option '-H' to the gup benchmark to help understand how hugetlb mapping pages compare with the default. Link: http://lkml.kernel.org/r/20181010195605.10689-6-keith.busch@intel.com Signed-off-by: Keith Busch Reviewed-by: Andrew Morton Cc: Kirill Shutemov Cc: Dave Hansen Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/gup_benchmark.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index 5c8e4cb1441a..880b96fc80d4 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -36,7 +36,7 @@ int main(int argc, char **argv) char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:f:tTLUS")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:f:tTLUSH")) != -1) { switch (opt) { case 'm': size = atoi(optarg) * MB; @@ -69,6 +69,9 @@ int main(int argc, char **argv) flags &= ~MAP_PRIVATE; flags |= MAP_SHARED; break; + case 'H': + flags |= MAP_HUGETLB; + break; default: return -1; } From 907ec5fca3dc38d37737de826f06f25b063aa08e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 26 Oct 2018 15:10:15 -0700 Subject: [PATCH 121/132] mm: zero remaining unavailable struct pages Patch series "mm: Fix for movable_node boot option", v3. This patch series contains a fix for the movable_node boot option issue which was introduced by commit 124049decbb1 ("x86/e820: put !E820_TYPE_RAM regions into memblock.reserved"). The commit breaks the option because it changed the memory gap range to reserved memblock. So, the node is marked as Normal zone even if the SRAT has Hot pluggable affinity. 
First and second patch fix the original issue which the commit tried to fix, then revert the commit. This patch (of 3): There is a kernel panic that is triggered when reading /proc/kpageflags on the kernel booted with kernel parameter 'memmap=nn[KMG]!ss[KMG]': BUG: unable to handle kernel paging request at fffffffffffffffe PGD 9b20e067 P4D 9b20e067 PUD 9b210067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 2 PID: 1728 Comm: page-types Not tainted 4.17.0-rc6-mm1-v4.17-rc6-180605-0816-00236-g2dfb086ef02c+ #160 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.fc28 04/01/2014 RIP: 0010:stable_page_flags+0x27/0x3c0 Code: 00 00 00 0f 1f 44 00 00 48 85 ff 0f 84 a0 03 00 00 41 54 55 49 89 fc 53 48 8b 57 08 48 8b 2f 48 8d 42 ff 83 e2 01 48 0f 44 c7 <48> 8b 00 f6 c4 01 0f 84 10 03 00 00 31 db 49 8b 54 24 08 4c 89 e7 RSP: 0018:ffffbbd44111fde0 EFLAGS: 00010202 RAX: fffffffffffffffe RBX: 00007fffffffeff9 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000202 RDI: ffffed1182fff5c0 RBP: ffffffffffffffff R08: 0000000000000001 R09: 0000000000000001 R10: ffffbbd44111fed8 R11: 0000000000000000 R12: ffffed1182fff5c0 R13: 00000000000bffd7 R14: 0000000002fff5c0 R15: ffffbbd44111ff10 FS: 00007efc4335a500(0000) GS:ffff93a5bfc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffffffffffffe CR3: 00000000b2a58000 CR4: 00000000001406e0 Call Trace: kpageflags_read+0xc7/0x120 proc_reg_read+0x3c/0x60 __vfs_read+0x36/0x170 vfs_read+0x89/0x130 ksys_pread64+0x71/0x90 do_syscall_64+0x5b/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7efc42e75e23 Code: 09 00 ba 9f 01 00 00 e8 ab 81 f4 ff 66 2e 0f 1f 84 00 00 00 00 00 90 83 3d 29 0a 2d 00 00 75 13 49 89 ca b8 11 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 db d3 01 00 48 89 04 24 According to kernel bisection, this problem became visible due to commit f7f99100d8d9 which changes how struct pages are initialized. Memblock layout affects the pfn ranges covered by node/zone. Consider that we have a VM with 2 NUMA nodes and each node has 4GB memory, and the default (no memmap= given) memblock layout is like below: MEMBLOCK configuration: memory size = 0x00000001fff75c00 reserved size = 0x000000000300c000 memory.cnt = 0x4 memory[0x0] [0x0000000000001000-0x000000000009efff], 0x000000000009e000 bytes on node 0 flags: 0x0 memory[0x1] [0x0000000000100000-0x00000000bffd6fff], 0x00000000bfed7000 bytes on node 0 flags: 0x0 memory[0x2] [0x0000000100000000-0x000000013fffffff], 0x0000000040000000 bytes on node 0 flags: 0x0 memory[0x3] [0x0000000140000000-0x000000023fffffff], 0x0000000100000000 bytes on node 1 flags: 0x0 ... If you give memmap=1G!4G (so it just covers memory[0x2]), the range [0x100000000-0x13fffffff] is gone: MEMBLOCK configuration: memory size = 0x00000001bff75c00 reserved size = 0x000000000300c000 memory.cnt = 0x3 memory[0x0] [0x0000000000001000-0x000000000009efff], 0x000000000009e000 bytes on node 0 flags: 0x0 memory[0x1] [0x0000000000100000-0x00000000bffd6fff], 0x00000000bfed7000 bytes on node 0 flags: 0x0 memory[0x2] [0x0000000140000000-0x000000023fffffff], 0x0000000100000000 bytes on node 1 flags: 0x0 ... This causes shrinking node 0's pfn range because it is calculated by the address range of memblock.memory. So some of struct pages in the gap range are left uninitialized. We have a function zero_resv_unavail() which does zeroing the struct pages outside memblock.memory, but currently it covers only the reserved unavailable range (i.e. memblock.memory && !memblock.reserved). 
This patch extends it to cover all unavailable range, which fixes the reported issue. Link: http://lkml.kernel.org/r/20181002143821.5112-2-msys.mizuma@gmail.com Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap") Signed-off-by: Naoya Horiguchi Signed-off-by-by: Masayoshi Mizuma Tested-by: Oscar Salvador Tested-by: Masayoshi Mizuma Reviewed-by: Pavel Tatashin Cc: Ingo Molnar Cc: Michal Hocko Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 15 --------------- mm/page_alloc.c | 36 +++++++++++++++++++++++++----------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 516920549378..2acdd046df2d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -265,21 +265,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) -/** - * for_each_resv_unavail_range - iterate through reserved and unavailable memory - * @i: u64 used as loop variable - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * - * Walks over unavailable but reserved (reserved && !memory) areas of memblock. - * Available as soon as memblock is initialized. - * Note: because this memory does not belong to any physical node, flags and - * nid arguments do not make sense and thus not exported as arguments. - */ -#define for_each_resv_unavail_range(i, p_start, p_end) \ - for_each_mem_range(i, &memblock.reserved, &memblock.memory, \ - NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL) - static inline void memblock_set_region_flags(struct memblock_region *r, enum memblock_flags flags) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c26d3152f9ba..6d863c5afa08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6515,29 +6515,42 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, * struct pages which are reserved in memblock allocator and their fields * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. + * + * This function also addresses a similar issue where struct pages are left + * uninitialized because the physical address range is not covered by + * memblock.memory or memblock.reserved. That could happen when memblock + * layout is manually configured via memmap=. */ void __init zero_resv_unavail(void) { phys_addr_t start, end; unsigned long pfn; u64 i, pgcnt; + phys_addr_t next = 0; /* - * Loop through ranges that are reserved, but do not have reported - * physical memory backing. + * Loop through unavailable ranges not covered by memblock.memory. 
*/ pgcnt = 0; - for_each_resv_unavail_range(i, &start, &end) { - for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; - continue; + for_each_mem_range(i, &memblock.memory, NULL, + NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { + if (next < start) { + for (pfn = PFN_DOWN(next); pfn < PFN_UP(start); pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) + continue; + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; } - mm_zero_struct_page(pfn_to_page(pfn)); - pgcnt++; } + next = end; } + for (pfn = PFN_DOWN(next); pfn < max_pfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) + continue; + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + /* * Struct pages that do not have backing memory. This could be because @@ -6547,7 +6560,8 @@ void __init zero_resv_unavail(void) * this code can be removed. */ if (pgcnt) - pr_info("Reserved but unavailable: %lld pages", pgcnt); + pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); + } #endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ From ec393a0f014eaf688a3dbe8c8a4cbb52d7f535f9 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 26 Oct 2018 15:10:21 -0700 Subject: [PATCH 122/132] mm: return zero_resv_unavail optimization When checking for valid pfns in zero_resv_unavail(), it is not necessary to verify that pfns within pageblock_nr_pages ranges are valid, only the first one needs to be checked. This is because memory for pages are allocated in contiguous chunks that contain pageblock_nr_pages struct pages. Link: http://lkml.kernel.org/r/20181002143821.5112-3-msys.mizuma@gmail.com Signed-off-by: Pavel Tatashin Signed-off-by: Masayoshi Mizuma Reviewed-by: Masayoshi Mizuma Acked-by: Naoya Horiguchi Reviewed-by: Oscar Salvador Cc: Ingo Molnar Cc: Michal Hocko Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d863c5afa08..863d46da6586 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6509,6 +6509,29 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, } #if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) + +/* + * Zero all valid struct pages in range [spfn, epfn), return number of struct + * pages zeroed + */ +static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) +{ + unsigned long pfn; + u64 pgcnt = 0; + + for (pfn = spfn; pfn < epfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; + continue; + } + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + + return pgcnt; +} + /* * Only struct pages that are backed by physical memory are zeroed and * initialized by going through __init_single_page(). 
But, there are some @@ -6524,7 +6547,6 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, void __init zero_resv_unavail(void) { phys_addr_t start, end; - unsigned long pfn; u64 i, pgcnt; phys_addr_t next = 0; @@ -6534,34 +6556,18 @@ void __init zero_resv_unavail(void) pgcnt = 0; for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { - if (next < start) { - for (pfn = PFN_DOWN(next); pfn < PFN_UP(start); pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) - continue; - mm_zero_struct_page(pfn_to_page(pfn)); - pgcnt++; - } - } + if (next < start) + pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); next = end; } - for (pfn = PFN_DOWN(next); pfn < max_pfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) - continue; - mm_zero_struct_page(pfn_to_page(pfn)); - pgcnt++; - } - + pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn); /* * Struct pages that do not have backing memory. This could be because * firmware is using some of this memory, or for some other reasons. - * Once memblock is changed so such behaviour is not allowed: i.e. - * list of "reserved" memory must be a subset of list of "memory", then - * this code can be removed. */ if (pgcnt) pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); - } #endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ From 9fd61bc95130d4971568b89c9548b5e0a4e18e0e Mon Sep 17 00:00:00 2001 From: Masayoshi Mizuma Date: Fri, 26 Oct 2018 15:10:24 -0700 Subject: [PATCH 123/132] Revert "x86/e820: put !E820_TYPE_RAM regions into memblock.reserved" commit 124049decbb1 ("x86/e820: put !E820_TYPE_RAM regions into memblock.reserved") breaks movable_node kernel option because it changed the memory gap range to reserved memblock. So, the node is marked as Normal zone even if the SRAT has Hot pluggable affinity. ===================================================================== kernel: BIOS-e820: [mem 0x0000180000000000-0x0000180fffffffff] usable kernel: BIOS-e820: [mem 0x00001c0000000000-0x00001c0fffffffff] usable ... kernel: reserved[0x12]#011[0x0000181000000000-0x00001bffffffffff], 0x000003f000000000 bytes flags: 0x0 ... kernel: ACPI: SRAT: Node 2 PXM 6 [mem 0x180000000000-0x1bffffffffff] hotplug kernel: ACPI: SRAT: Node 3 PXM 7 [mem 0x1c0000000000-0x1fffffffffff] hotplug ... kernel: Movable zone start for each node kernel: Node 3: 0x00001c0000000000 kernel: Early memory node ranges ... ===================================================================== The original issue is fixed by the former patches, so let's revert commit 124049decbb1 ("x86/e820: put !E820_TYPE_RAM regions into memblock.reserved"). 
Link: http://lkml.kernel.org/r/20181002143821.5112-4-msys.mizuma@gmail.com Signed-off-by: Masayoshi Mizuma Reviewed-by: Pavel Tatashin Acked-by: Ingo Molnar Cc: Naoya Horiguchi Cc: Michal Hocko Cc: Thomas Gleixner Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/e820.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c88c23c658c1..d1f25c831447 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1248,7 +1248,6 @@ void __init e820__memblock_setup(void) { int i; u64 end; - u64 addr = 0; /* * The bootstrap memblock region count maximum is 128 entries @@ -1265,21 +1264,13 @@ void __init e820__memblock_setup(void) struct e820_entry *entry = &e820_table->entries[i]; end = entry->addr + entry->size; - if (addr < entry->addr) - memblock_reserve(addr, entry->addr - addr); - addr = end; if (end != (resource_size_t)end) continue; - /* - * all !E820_TYPE_RAM ranges (including gap ranges) are put - * into memblock.reserved to make sure that struct pages in - * such regions are not left uninitialized after bootup. - */ if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) - memblock_reserve(entry->addr, entry->size); - else - memblock_add(entry->addr, entry->size); + continue; + + memblock_add(entry->addr, entry->size); } /* Throw away partial pages: */ From df06b37ffe5a442503b7095b77b0a970df515459 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 26 Oct 2018 15:10:28 -0700 Subject: [PATCH 124/132] mm/gup: cache dev_pagemap while pinning pages Getting pages from ZONE_DEVICE memory needs to check the backing device's live-ness, which is tracked in the device's dev_pagemap metadata. This metadata is stored in a radix tree and looking it up adds measurable software overhead. This patch avoids repeating this relatively costly operation when dev_pagemap is used by caching the last dev_pagemap while getting user pages. The gup_benchmark kernel self test reports this reduces time to get user pages to as low as 1/3 of the previous time. Link: http://lkml.kernel.org/r/20181012173040.15669-1-keith.busch@intel.com Signed-off-by: Keith Busch Reviewed-by: Dan Williams Acked-by: Kirill A. 
Shutemov Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 +-- include/linux/mm.h | 12 +---- mm/gup.c | 110 ++++++++++++++++++++++++---------------- mm/huge_memory.c | 16 +++--- mm/nommu.c | 6 +-- 5 files changed, 79 insertions(+), 73 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fdcb45999b26..4663ee96cf59 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -213,9 +213,9 @@ static inline int hpage_nr_pages(struct page *page) } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags); + pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags); + pud_t *pud, int flags, struct dev_pagemap **pgmap); extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); @@ -344,13 +344,13 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm) } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags) + unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, - unsigned long addr, pud_t *pud, int flags) + unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { return NULL; } diff --git a/include/linux/mm.h b/include/linux/mm.h index a023c5ce71fa..1e52b8fd1685 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2536,16 +2536,8 @@ static inline vm_fault_t vmf_error(int err) return VM_FAULT_SIGBUS; } -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags, - unsigned int *page_mask); - -static inline struct page *follow_page(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags) -{ - unsigned int unused_page_mask; - return follow_page_mask(vma, address, foll_flags, &unused_page_mask); -} +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags); #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ diff --git a/mm/gup.c b/mm/gup.c index 08eb350e0f35..841d7ef53591 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -20,6 +20,11 @@ #include "internal.h" +struct follow_page_context { + struct dev_pagemap *pgmap; + unsigned int page_mask; +}; + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -71,10 +76,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) } static struct page *follow_page_pte(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags) + unsigned long address, pmd_t *pmd, unsigned int flags, + struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -116,8 +121,8 @@ retry: * Only return device mapping pages in the FOLL_GET case since * they are only valid while holding the pgmap reference. 
*/ - pgmap = get_dev_pagemap(pte_pfn(pte), NULL); - if (pgmap) + *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); + if (*pgmap) page = pte_page(pte); else goto no_page; @@ -152,15 +157,8 @@ retry: goto retry; } - if (flags & FOLL_GET) { + if (flags & FOLL_GET) get_page(page); - - /* drop the pgmap reference now that we hold the page */ - if (pgmap) { - put_dev_pagemap(pgmap); - pgmap = NULL; - } - } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -210,7 +208,8 @@ no_page: static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pmd_t *pmd, pmdval; spinlock_t *ptl; @@ -258,13 +257,13 @@ retry: } if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); - page = follow_devmap_pmd(vma, address, pmd, flags); + page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; } if (likely(!pmd_trans_huge(pmdval))) - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); @@ -284,7 +283,7 @@ retry_locked: } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (flags & FOLL_SPLIT) { int ret; @@ -307,18 +306,18 @@ retry_locked: } return ret ? ERR_PTR(ret) : - follow_page_pte(vma, address, pmd, flags); + follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; + ctx->page_mask = HPAGE_PMD_NR - 1; return page; } - static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pud_t *pud; spinlock_t *ptl; @@ -344,7 +343,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); + page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; @@ -352,13 +351,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); - return follow_pmd_mask(vma, address, pud, flags, page_mask); + return follow_pmd_mask(vma, address, pud, flags, ctx); } - static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { p4d_t *p4d; struct page *page; @@ -378,7 +377,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - return follow_pud_mask(vma, address, p4d, flags, page_mask); + return follow_pud_mask(vma, address, p4d, flags, ctx); } /** @@ -396,13 +395,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, */ struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, - unsigned int *page_mask) + struct follow_page_context *ctx) { pgd_t *pgd; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; + ctx->page_mask = 0; /* make this handle hugepd */ page = follow_huge_addr(mm, address, 
flags & FOLL_WRITE); @@ -431,7 +430,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return no_page_table(vma, flags); } - return follow_p4d_mask(vma, address, pgd, flags, page_mask); + return follow_p4d_mask(vma, address, pgd, flags, ctx); +} + +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) +{ + struct follow_page_context ctx = { NULL }; + struct page *page; + + page = follow_page_mask(vma, address, foll_flags, &ctx); + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, @@ -659,9 +670,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) { - long i = 0; - unsigned int page_mask; + long ret = 0, i = 0; struct vm_area_struct *vma = NULL; + struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; @@ -691,12 +702,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pages ? &pages[i] : NULL); if (ret) return i ? : ret; - page_mask = 0; + ctx.page_mask = 0; goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) - return i ? : -EFAULT; + if (!vma || check_vma_flags(vma, gup_flags)) { + ret = -EFAULT; + goto out; + } if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -709,23 +722,26 @@ retry: * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; + if (unlikely(fatal_signal_pending(current))) { + ret = -ERESTARTSYS; + goto out; + } cond_resched(); - page = follow_page_mask(vma, start, foll_flags, &page_mask); + + page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page) { - int ret; ret = faultin_page(tsk, vma, start, &foll_flags, nonblocking); switch (ret) { case 0: goto retry; + case -EBUSY: + ret = 0; + /* FALLTHRU */ case -EFAULT: case -ENOMEM: case -EHWPOISON: - return i ? i : ret; - case -EBUSY: - return i; + goto out; case -ENOENT: goto next_page; } @@ -737,27 +753,31 @@ retry: */ goto next_page; } else if (IS_ERR(page)) { - return i ? i : PTR_ERR(page); + ret = PTR_ERR(page); + goto out; } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); - page_mask = 0; + ctx.page_mask = 0; } next_page: if (vmas) { vmas[i] = vma; - page_mask = 0; + ctx.page_mask = 0; } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); - return i; +out: + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return i ? 
i : ret; } static bool vma_permits_fault(struct vm_area_struct *vma, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8ea1b36bd452..25c7d7509cf4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -852,11 +852,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags) + pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -886,12 +885,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@ -1000,11 +998,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags) + pud_t *pud, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pud_lockptr(mm, pud)); @@ -1028,12 +1025,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } diff --git a/mm/nommu.c b/mm/nommu.c index e4aac33216ae..749276beb109 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return ret; } -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) { - *page_mask = 0; return NULL; } From 026d1eaf5ef1a5d6258b46e4e411cd9f5ab8c41d Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Fri, 26 Oct 2018 15:10:32 -0700 Subject: [PATCH 125/132] mm/kasan/quarantine.c: make quarantine_lock a raw_spinlock_t The static lock quarantine_lock is used in quarantine.c to protect the quarantine queue datastructures. It is taken inside quarantine queue manipulation routines (quarantine_put(), quarantine_reduce() and quarantine_remove_cache()), with IRQs disabled. This is not a problem on a stock kernel but is problematic on an RT kernel where spin locks are sleeping spinlocks, which can sleep and can not be acquired with disabled interrupts. Convert the quarantine_lock to a raw spinlock_t. The usage of quarantine_lock is confined to quarantine.c and the work performed while the lock is held is used for debug purpose. 
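
For readers unfamiliar with the PREEMPT_RT distinction, a minimal sketch of the conversion pattern applied here (the lock and function names below are hypothetical, not taken from quarantine.c): a lock that must be taken with interrupts disabled is declared with the raw_ API, which remains a true spinning lock even on an RT kernel.

#include <linux/spinlock.h>

/* Hypothetical example of the spinlock_t -> raw_spinlock_t pattern. */
static DEFINE_RAW_SPINLOCK(example_lock);
static unsigned long example_counter;

static void example_update(unsigned long delta)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	example_counter += delta;	/* short, non-sleeping critical section */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}

On a non-RT build raw_spin_lock_irqsave() behaves the same as spin_lock_irqsave(), so for stock kernels the conversion is effectively a no-op.
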
[bigeasy@linutronix.de: slightly altered the commit message] Link: http://lkml.kernel.org/r/20181010214945.5owshc3mlrh74z4b@linutronix.de Signed-off-by: Clark Williams Signed-off-by: Sebastian Andrzej Siewior Acked-by: Sebastian Andrzej Siewior Acked-by: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/quarantine.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 3a8ddf8baf7d..b209dbaefde8 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -103,7 +103,7 @@ static int quarantine_head; static int quarantine_tail; /* Total size of all objects in global_quarantine across all batches. */ static unsigned long quarantine_size; -static DEFINE_SPINLOCK(quarantine_lock); +static DEFINE_RAW_SPINLOCK(quarantine_lock); DEFINE_STATIC_SRCU(remove_cache_srcu); /* Maximum size of the global queue. */ @@ -190,7 +190,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { qlist_move_all(q, &temp); - spin_lock(&quarantine_lock); + raw_spin_lock(&quarantine_lock); WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); qlist_move_all(&temp, &global_quarantine[quarantine_tail]); if (global_quarantine[quarantine_tail].bytes >= @@ -203,7 +203,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (new_tail != quarantine_head) quarantine_tail = new_tail; } - spin_unlock(&quarantine_lock); + raw_spin_unlock(&quarantine_lock); } local_irq_restore(flags); @@ -230,7 +230,7 @@ void quarantine_reduce(void) * expected case). */ srcu_idx = srcu_read_lock(&remove_cache_srcu); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); /* * Update quarantine size in case of hotplug. Allocate a fraction of @@ -254,7 +254,7 @@ void quarantine_reduce(void) quarantine_head = 0; } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, NULL); srcu_read_unlock(&remove_cache_srcu, srcu_idx); @@ -310,17 +310,17 @@ void quarantine_remove_cache(struct kmem_cache *cache) */ on_each_cpu(per_cpu_remove_cache, cache, 1); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); for (i = 0; i < QUARANTINE_BATCHES; i++) { if (qlist_empty(&global_quarantine[i])) continue; qlist_move_cache(&global_quarantine[i], &to_free, cache); /* Scanning whole quarantine can take a while. */ - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); cond_resched(); - spin_lock_irqsave(&quarantine_lock, flags); + raw_spin_lock_irqsave(&quarantine_lock, flags); } - spin_unlock_irqrestore(&quarantine_lock, flags); + raw_spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); From d7c3393413fe7e7dc54498ea200ea94742d61e18 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 26 Oct 2018 15:10:36 -0700 Subject: [PATCH 126/132] mm: thp: fix MADV_DONTNEED vs migrate_misplaced_transhuge_page race condition Patch series "migrate_misplaced_transhuge_page race conditions". Aaron found a new instance of the THP MADV_DONTNEED race against pmdp_clear_flush* variants, that was apparently left unfixed. While looking into the race found by Aaron, I may have found two more issues in migrate_misplaced_transhuge_page. 
These race conditions would not cause kernel instability, but they'd corrupt userland data or leave data non zero after MADV_DONTNEED. I did only minor testing, and I don't expect to be able to reproduce this (especially the lack of ->invalidate_range before migrate_page_copy, requires the latest iommu hardware or infiniband to reproduce). The last patch is noop for x86 and it needs further review from maintainers of archs that implement flush_cache_range() (not in CC yet). To avoid confusion, it's not the first patch that introduces the bug fixed in the second patch, even before removing the pmdp_huge_clear_flush_notify, that _notify suffix was called after migrate_page_copy already run. This patch (of 3): This is a corollary of ced108037c2aa ("thp: fix MADV_DONTNEED vs. numa balancing race"), 58ceeb6bec8 ("thp: fix MADV_DONTNEED vs. MADV_FREE race") and 5b7abeae3af8c ("thp: fix MADV_DONTNEED vs clear soft dirty race). When the above three fixes where posted Dave asked https://lkml.kernel.org/r/929b3844-aec2-0111-fef7-8002f9d4e2b9@intel.com but apparently this was missed. The pmdp_clear_flush* in migrate_misplaced_transhuge_page() was introduced in a54a407fbf7 ("mm: Close races between THP migration and PMD numa clearing"). The important part of such commit is only the part where the page lock is not released until the first do_huge_pmd_numa_page() finished disarming the pagenuma/protnone. The addition of pmdp_clear_flush() wasn't beneficial to such commit and there's no commentary about such an addition either. I guess the pmdp_clear_flush() in such commit was added just in case for safety, but it ended up introducing the MADV_DONTNEED race condition found by Aaron. At that point in time nobody thought of such kind of MADV_DONTNEED race conditions yet (they were fixed later) so the code may have looked more robust by adding the pmdp_clear_flush(). This specific race condition won't destabilize the kernel, but it can confuse userland because after MADV_DONTNEED the memory won't be zeroed out. This also optimizes the code and removes a superfluous TLB flush. [akpm@linux-foundation.org: reflow comment to 80 cols, fix grammar and typo (beacuse)] Link: http://lkml.kernel.org/r/20181013002430.698-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Aaron Tomlin Acked-by: Mel Gorman Acked-by: Kirill A. Shutemov Cc: Jerome Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1ea27b343ccd..93d9a1e25aad 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2031,15 +2031,26 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); /* - * Clear the old entry under pagetable lock and establish the new PTE. - * Any parallel GUP will either observe the old page blocking on the - * page lock, block on the page table lock or observe the new page. - * The SetPageUptodate on the new page and page_add_new_anon_rmap - * guarantee the copy is visible before the pagetable update. + * Overwrite the old entry under pagetable lock and establish + * the new PTE. Any parallel GUP will either observe the old + * page blocking on the page lock, block on the page table + * lock or observe the new page. The SetPageUptodate on the + * new page and page_add_new_anon_rmap guarantee the copy is + * visible before the pagetable update. 
*/ flush_cache_range(vma, mmun_start, mmun_end); page_add_anon_rmap(new_page, vma, mmun_start, true); - pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); + /* + * At this point the pmd is numa/protnone (i.e. non present) and the TLB + * has already been flushed globally. So no TLB can be currently + * caching this non present pmd mapping. There's no need to clear the + * pmd before doing set_pmd_at(), nor to flush the TLB after + * set_pmd_at(). Clearing the pmd here would introduce a race + * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the + * mmap_sem for reading. If the pmd is set to NULL at any given time, + * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this + * pmd. + */ set_pmd_at(mm, mmun_start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); @@ -2053,7 +2064,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * No need to double call mmu_notifier->invalidate_range() callback as * the above pmdp_huge_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. */ get_page(new_page); From 7066f0f933a1fd707bb38781866657769cff7efc Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 26 Oct 2018 15:10:40 -0700 Subject: [PATCH 127/132] mm: thp: fix mmu_notifier in migrate_misplaced_transhuge_page() change_huge_pmd() after arming the numa/protnone pmd doesn't flush the TLB right away. do_huge_pmd_numa_page() flushes the TLB before calling migrate_misplaced_transhuge_page(). By the time do_huge_pmd_numa_page() runs some CPU could still access the page through the TLB. change_huge_pmd() before arming the numa/protnone transhuge pmd calls mmu_notifier_invalidate_range_start(). So there's no need of mmu_notifier_invalidate_range_start()/mmu_notifier_invalidate_range_only_end() sequence in migrate_misplaced_transhuge_page() too, because by the time migrate_misplaced_transhuge_page() runs, the pmd mapping has already been invalidated in the secondary MMUs. It has to or if a secondary MMU can still write to the page, the migrate_page_copy() would lose data. However an explicit mmu_notifier_invalidate_range() is needed before migrate_misplaced_transhuge_page() starts copying the data of the transhuge page or the below can happen for MMU notifier users sharing the primary MMU pagetables and only implementing ->invalidate_range: CPU0 CPU1 GPU sharing linux pagetables using only ->invalidate_range ----------- ------------ --------- GPU secondary MMU writes to the page mapped by the transhuge pmd change_pmd_range() mmu..._range_start() ->invalidate_range_start() noop change_huge_pmd() set_pmd_at(numa/protnone) pmd_unlock() do_huge_pmd_numa_page() CPU TLB flush globally (1) CPU cannot write to page migrate_misplaced_transhuge_page() GPU writes to the page... migrate_page_copy() ...GPU stops writing to the page CPU TLB flush (2) mmu..._range_end() (3) ->invalidate_range_stop() noop ->invalidate_range() GPU secondary MMU is invalidated and cannot write to the page anymore (too late) Just like we need a CPU TLB flush (1) because the TLB flush (2) arrives too late, we also need a mmu_notifier_invalidate_range() before calling migrate_misplaced_transhuge_page(), because the ->invalidate_range() in (3) also arrives too late. 
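The users affected are those that mirror the CPU page tables and implement only ->invalidate_range(), relying on the primary MMU's flush points for their own shootdowns. As a rough sketch (hypothetical device driver, for illustration only; the callback signature is the 4.19-era mmu_notifier_ops one):

    static void demo_invalidate_range(struct mmu_notifier *mn,
                                      struct mm_struct *mm,
                                      unsigned long start, unsigned long end)
    {
            /* drop the device TLB entries for [start, end); afterwards the
             * device faults and re-walks the shared page tables */
    }

    static const struct mmu_notifier_ops demo_mmu_ops = {
            .invalidate_range = demo_invalidate_range,
    };

Such a device keeps writing through its stale translation until ->invalidate_range() runs, which is why the shootdown has to happen before migrate_page_copy(), not only at mmu_notifier_invalidate_range_end() time.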
This requirement is the result of the lazy optimization in change_huge_pmd() that releases the pmd_lock without first flushing the TLB and without first calling mmu_notifier_invalidate_range(). Even converting the removed mmu_notifier_invalidate_range_only_end() into a mmu_notifier_invalidate_range_end() would not have been enough to fix this, because it run after migrate_page_copy(). After the hugepage data copy is done migrate_misplaced_transhuge_page() can proceed and call set_pmd_at without having to flush the TLB nor any secondary MMUs because the secondary MMU invalidate, just like the CPU TLB flush, has to happen before the migrate_page_copy() is called or it would be a bug in the first place (and it was for drivers using ->invalidate_range()). KVM is unaffected because it doesn't implement ->invalidate_range(). The standard PAGE_SIZEd migrate_misplaced_page is less accelerated and uses the generic migrate_pages which transitions the pte from numa/protnone to a migration entry in try_to_unmap_one() and flushes TLBs and all mmu notifiers there before copying the page. Link: http://lkml.kernel.org/r/20181013002430.698-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Acked-by: Kirill A. Shutemov Reviewed-by: Aaron Tomlin Cc: Jerome Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 14 +++++++++++++- mm/migrate.c | 19 ++++++------------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 25c7d7509cf4..25ef59b7ee34 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1558,8 +1558,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) * We are not sure a pending tlb flush here is for a huge page * mapping or not. Hence use the tlb range variant */ - if (mm_tlb_flush_pending(vma->vm_mm)) + if (mm_tlb_flush_pending(vma->vm_mm)) { flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + /* + * change_huge_pmd() released the pmd lock before + * invalidating the secondary MMUs sharing the primary + * MMU pagetables (with ->invalidate_range()). The + * mmu_notifier_invalidate_range_end() (which + * internally calls ->invalidate_range()) in + * change_pmd_range() will run after us, so we can't + * rely on it here and we need an explicit invalidate. 
+ */ + mmu_notifier_invalidate_range(vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + } /* * Migrate the THP to the requested node, returns with page unlocked diff --git a/mm/migrate.c b/mm/migrate.c index 93d9a1e25aad..905c2264c788 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1975,8 +1975,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int isolated = 0; struct page *new_page = NULL; int page_lru = page_is_file_cache(page); - unsigned long mmun_start = address & HPAGE_PMD_MASK; - unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; + unsigned long start = address & HPAGE_PMD_MASK; + unsigned long end = start + HPAGE_PMD_SIZE; new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), @@ -2003,11 +2003,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -2038,8 +2036,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * new page and page_add_new_anon_rmap guarantee the copy is * visible before the pagetable update. */ - flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start, true); + flush_cache_range(vma, start, end); + page_add_anon_rmap(new_page, vma, start, true); /* * At this point the pmd is numa/protnone (i.e. non present) and the TLB * has already been flushed globally. So no TLB can be currently @@ -2051,7 +2049,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this * pmd. */ - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); page_ref_unfreeze(page, 2); @@ -2060,11 +2058,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. */ get_page(new_page); @@ -2088,7 +2081,7 @@ out_fail: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } spin_unlock(ptl); From 7eef5f97c1f94c7b72520b42d372037e97a81b95 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 26 Oct 2018 15:10:43 -0700 Subject: [PATCH 128/132] mm: thp: relocate flush_cache_range() in migrate_misplaced_transhuge_page() There should be no cache left by the time we overwrite the old transhuge pmd with the new one. It's already too late to flush through the virtual address because we already copied the page data to the new physical address. So flush the cache before the data copy. Also delete the "end" variable to shutoff a "unused variable" warning on x86 where flush_cache_range() is a noop. Link: http://lkml.kernel.org/r/20181015202311.7209-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Kirill A. 
Shutemov Cc: Aaron Tomlin Cc: Jerome Glisse Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 905c2264c788..b6700f2962f3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1976,7 +1976,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct page *new_page = NULL; int page_lru = page_is_file_cache(page); unsigned long start = address & HPAGE_PMD_MASK; - unsigned long end = start + HPAGE_PMD_SIZE; new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), @@ -1999,6 +1998,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; new_page->index = page->index; + /* flush the cache before copying using the kernel virtual address */ + flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); migrate_page_copy(new_page, page); WARN_ON(PageLRU(new_page)); @@ -2036,7 +2037,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * new page and page_add_new_anon_rmap guarantee the copy is * visible before the pagetable update. */ - flush_cache_range(vma, start, end); page_add_anon_rmap(new_page, vma, start, true); /* * At this point the pmd is numa/protnone (i.e. non present) and the TLB From 91cbacc34512e37c9bb89125ca4b224ca6459245 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 26 Oct 2018 15:10:48 -0700 Subject: [PATCH 129/132] tools/testing/selftests/vm/map_fixed_noreplace.c: add test for MAP_FIXED_NOREPLACE Add a test for MAP_FIXED_NOREPLACE, based on some code originally by Jann Horn. This would have caught the overlap bug reported by Daniel Micay. I originally suggested to Michal that we create MAP_FIXED_NOREPLACE, but instead of writing a selftest I spent my time bike-shedding whether it should be called MAP_FIXED_SAFE/NOCLOBBER/WEAK/NEW .. mea culpa. 
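The semantics being tested, in miniature (a usage sketch, not part of the selftest itself): with MAP_FIXED_NOREPLACE the kernel honours the fixed address only if the range is free, and otherwise fails with EEXIST instead of silently unmapping whatever was already there, as plain MAP_FIXED would.

    void *hint = (void *)0x10000000UL;      /* arbitrary fixed address for the example */
    void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);

    if (p == MAP_FAILED && errno == EEXIST) {
            /* something already occupies [hint, hint + 4096); it was left intact */
    } else if (p == hint) {
            /* the range was free and we got exactly the address we asked for */
    }

The test below walks through the interesting overlap cases (exact overlap, contained, overlapping either end, and merely adjacent) and expects EEXIST for the overlaps only.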
Link: http://lkml.kernel.org/r/20181013133929.28653-1-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Reviewed-by: Kees Cook Reviewed-by: Khalid Aziz Acked-by: Michal Hocko Cc: Jann Horn Cc: Andrea Arcangeli Cc: Florian Weimer Cc: John Hubbard Cc: Matthew Wilcox Cc: Abdul Haleem Cc: Joel Stanley Cc: Jason Evans Cc: David Goldblatt Cc: Daniel Micay Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + .../selftests/vm/map_fixed_noreplace.c | 206 ++++++++++++++++++ 3 files changed, 208 insertions(+) create mode 100644 tools/testing/selftests/vm/map_fixed_noreplace.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index af5ff83f6d7f..31b3c98b6d34 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -13,3 +13,4 @@ mlock-random-test virtual_address_range gup_benchmark va_128TBswitch +map_fixed_noreplace diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index e94b7b14bcb2..6e67e726e5a5 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -12,6 +12,7 @@ TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c new file mode 100644 index 000000000000..d91bde511268 --- /dev/null +++ b/tools/testing/selftests/vm/map_fixed_noreplace.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test that MAP_FIXED_NOREPLACE works. + * + * Copyright 2018, Jann Horn + * Copyright 2018, Michael Ellerman, IBM Corporation. 
+ */ + +#include +#include +#include +#include +#include + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + +#define BASE_ADDRESS (256ul * 1024 * 1024) + + +static void dump_maps(void) +{ + char cmd[32]; + + snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); + system(cmd); +} + +int main(void) +{ + unsigned long flags, addr, size, page_size; + char *p; + + page_size = sysconf(_SC_PAGE_SIZE); + + flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; + + // Check we can map all the areas we need below + errno = 0; + addr = BASE_ADDRESS; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: couldn't map the space we need for the test\n"); + return 1; + } + + errno = 0; + if (munmap((void *)addr, 5 * page_size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + errno = 0; + addr = BASE_ADDRESS + page_size; + size = 3 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: first mmap() failed unexpectedly\n"); + return 1; + } + + /* + * Exact same mapping again: + * base | free | new + * +1 | mapped | new + * +2 | mapped | new + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = BASE_ADDRESS; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:1: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Second mapping contained within first: + * + * base | free | + * +1 | mapped | + * +2 | mapped | new + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = BASE_ADDRESS + (2 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:2: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = BASE_ADDRESS + (3 * page_size); + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:3: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap start of existing mapping: + * base | free | new + * +1 | mapped | new + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = BASE_ADDRESS; + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:4: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to start of existing mapping: + * base | free | new + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = BASE_ADDRESS; + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + 
+ if (p == MAP_FAILED) { + dump_maps(); + printf("Error:5: mmap() failed when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | new + */ + errno = 0; + addr = BASE_ADDRESS + (4 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error:6: mmap() failed when it shouldn't have\n"); + return 1; + } + + addr = BASE_ADDRESS; + size = 5 * page_size; + if (munmap((void *)addr, size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + printf("OK\n"); + return 0; +} From bc4ae27d817a4e92071ef67cb6368120cfabe7ec Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 26 Oct 2018 15:10:51 -0700 Subject: [PATCH 130/132] mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS The SWP_FILE flag serves two purposes: to make swap_{read,write}page() go through the filesystem, and to make swapoff() call ->swap_deactivate(). For Btrfs, we want the latter but not the former, so split this flag into two. This makes us always call ->swap_deactivate() if ->swap_activate() succeeded, not just if it didn't add any swap extents itself. This also resolves the issue of the very misleading name of SWP_FILE, which is only used for swap files over NFS. Link: http://lkml.kernel.org/r/6d63d8668c4287a4f6d203d65696e96f80abdfc7.1536704650.git.osandov@fb.com Signed-off-by: Omar Sandoval Reviewed-by: Nikolay Borisov Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 13 +++++++------ mm/page_io.c | 6 +++--- mm/swapfile.c | 13 ++++++++----- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index b93740d72e78..38195f5c96b1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -167,13 +167,14 @@ enum { SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ - SWP_FILE = (1 << 7), /* set after swap_activate success */ - SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ - SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ - SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ - SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ + SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ + SWP_FS = (1 << 8), /* swap file goes through fs */ + SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... 
*/ - SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL diff --git a/mm/page_io.c b/mm/page_io.c index 573d3663d846..a451ffa9491c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -423,7 +423,7 @@ int swap_set_page_dirty(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct address_space *mapping = sis->swap_file->f_mapping; VM_BUG_ON_PAGE(!PageSwapCache(page), page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2681e50592c5..f0c7e4c11bab 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1003,7 +1003,7 @@ start_over: goto nextsi; } if (size == SWAPFILE_CLUSTER) { - if (!(si->flags & SWP_FILE)) + if (!(si->flags & SWP_FS)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, @@ -2299,12 +2299,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis) kfree(se); } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; - sis->flags &= ~SWP_FILE; - mapping->a_ops->swap_deactivate(swap_file); + sis->flags &= ~SWP_ACTIVATED; + if (mapping->a_ops->swap_deactivate) + mapping->a_ops->swap_deactivate(swap_file); } } @@ -2400,8 +2401,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); + if (ret >= 0) + sis->flags |= SWP_ACTIVATED; if (!ret) { - sis->flags |= SWP_FILE; + sis->flags |= SWP_FS; ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; } From aa8aa8a331d27fcef3e60dabb918eb8c5c9a2ad9 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 26 Oct 2018 15:10:55 -0700 Subject: [PATCH 131/132] mm: export add_swap_extent() Btrfs currently does not support swap files because swap's use of bmap does not work with copy-on-write and multiple devices. See 35054394c4b3 ("Btrfs: stop providing a bmap operation to avoid swapfile corruptions"). However, the swap code has a mechanism for the filesystem to manually add swap extents using add_swap_extent() from the ->swap_activate() aop. iomap has done this since 67482129cdab ("iomap: add a swapfile activation function"). Btrfs will do the same in a later patch, so export add_swap_extent(). 
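In outline, a filesystem that maps the swapfile itself can now do something like the following from its ->swap_activate() aop (a sketch only, with a made-up demo_first_block() helper standing in for the filesystem's block lookup; this is not the Btrfs or iomap code): describe the file as one or more page-to-block extents and return a positive extent count, so that setup_swap_extents() sets SWP_ACTIVATED without also setting SWP_FS or falling back to the generic single-extent path.

    static int demo_swap_activate(struct swap_info_struct *sis,
                                  struct file *file, sector_t *span)
    {
            int ret = add_swap_extent(sis, 0, sis->max,
                                      demo_first_block(file));

            if (ret < 0)
                    return ret;
            *span = sis->pages;
            return 1;               /* number of extents added by the filesystem */
    }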
Link: http://lkml.kernel.org/r/bb1208575e02829aae51b538709476964f97b1ea.1536704650.git.osandov@fb.com Signed-off-by: Omar Sandoval Reviewed-by: Andrew Morton Cc: David Sterba Cc: Johannes Weiner Cc: Nikolay Borisov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index f0c7e4c11bab..644f746e167a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2354,6 +2354,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, list_add_tail(&new_se->list, &sis->first_swap_extent.list); return 1; } +EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages From 22146c3ce98962436e401f7b7016a6f664c9ffb5 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 26 Oct 2018 15:10:58 -0700 Subject: [PATCH 132/132] hugetlbfs: dirty pages as they are added to pagecache Some test systems were experiencing negative huge page reserve counts and incorrect file block counts. This was traced to /proc/sys/vm/drop_caches removing clean pages from hugetlbfs file pagecaches. When non-hugetlbfs explicit code removes the pages, the appropriate accounting is not performed. This can be recreated as follows: fallocate -l 2M /dev/hugepages/foo echo 1 > /proc/sys/vm/drop_caches fallocate -l 2M /dev/hugepages/foo grep -i huge /proc/meminfo AnonHugePages: 0 kB ShmemHugePages: 0 kB HugePages_Total: 2048 HugePages_Free: 2047 HugePages_Rsvd: 18446744073709551615 HugePages_Surp: 0 Hugepagesize: 2048 kB Hugetlb: 4194304 kB ls -lsh /dev/hugepages/foo 4.0M -rw-r--r--. 1 root root 2.0M Oct 17 20:05 /dev/hugepages/foo To address this issue, dirty pages as they are added to pagecache. This can easily be reproduced with fallocate as shown above. Read faulted pages will eventually end up being marked dirty. But there is a window where they are clean and could be impacted by code such as drop_caches. So, just dirty them all as they are added to the pagecache. Link: http://lkml.kernel.org/r/b5be45b8-5afe-56cd-9482-28384699a049@oracle.com Fixes: 6bda666a03f0 ("hugepages: fold find_or_alloc_pages into huge_no_page()") Signed-off-by: Mike Kravetz Acked-by: Mihcla Hocko Reviewed-by: Khalid Aziz Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: "Aneesh Kumar K . V" Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Davidlohr Bueso Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5c390f5a5207..7b5c0ad9a6bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3690,6 +3690,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return err; ClearPagePrivate(page); + /* + * set page dirty so that it will not be removed from cache/file + * by non-hugetlbfs specific code paths. + */ + set_page_dirty(page); + spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock);
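For reference, the generic invalidation path the new set_page_dirty() defends against looks roughly like this (a simplified outline of the 4.19-era code paths, not a verbatim listing):

    drop_caches_sysctl_handler()
      iterate_supers(drop_pagecache_sb, NULL)
        invalidate_mapping_pages(mapping, 0, -1)
          invalidate_inode_page(page)   /* bails out, leaving the page alone,
                                           if PageDirty() or PageWriteback() */

Because invalidate_inode_page() refuses dirty pages, dirtying hugetlbfs pages as soon as they enter the pagecache keeps drop_caches and similar generic invalidation from removing them behind the back of the hugetlbfs-specific removal code that adjusts the reserve and block counts.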