From 5211613978cb7353a3237e4372958c0e7514683f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 22 Oct 2015 13:32:08 -0700
Subject: [PATCH 1/9] kmod: don't run async usermode helper as a child of
 kworker thread

call_usermodehelper_exec_sync() does fork() + wait() with "unignored"
SIGCHLD. What we have missed is that this worker thread can have other
children previously forked by call_usermodehelper_exec_work() without
UMH_WAIT_PROC. If such a child exits in between, it becomes a zombie
because auto-reaping only works if SIGCHLD is ignored, and nobody can
reap it (unless/until this worker thread exits too).

Change the !UMH_WAIT_PROC case to use CLONE_PARENT.

Note: this is only the first step. All PF_KTHREAD tasks, even those
created by kernel_thread(), should have ->parent == kthreadd by default.

Fixes: bb304a5c6fc63d8506c ("kmod: handle UMH_WAIT_PROC from system unbound workqueue")
Signed-off-by: Oleg Nesterov
Acked-by: Frederic Weisbecker
Cc: Rik van Riel
Cc: Christoph Lameter
Cc: Tejun Heo
Cc: Rusty Russell
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/kmod.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index da98d0593de2..0277d1216f80 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -327,9 +327,13 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
 		call_usermodehelper_exec_sync(sub_info);
 	} else {
 		pid_t pid;
-
+		/*
+		 * Use CLONE_PARENT to reparent it to kthreadd; we do not
+		 * want to pollute current->children, and we need a parent
+		 * that always ignores SIGCHLD to ensure auto-reaping.
+		 */
 		pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
-				    SIGCHLD);
+				    CLONE_PARENT | SIGCHLD);
 		if (pid < 0) {
 			sub_info->retval = pid;
 			umh_complete(sub_info);
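A userspace sketch of the auto-reaping rule the patch above relies on
(illustrative only; in the kernel it is the reparenting to kthreadd that
provides the SIGCHLD-ignoring parent):

	#include <signal.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		char cmd[64];

		/* With SIGCHLD set to SIG_IGN, exited children are reaped
		 * automatically; comment this out and the child below stays
		 * a zombie ("Z" state) until this process exits. */
		signal(SIGCHLD, SIG_IGN);

		if (fork() == 0)
			_exit(0);		/* child exits immediately */

		sleep(1);			/* let the child exit */
		snprintf(cmd, sizeof(cmd), "ps -o pid,stat --ppid %d", getpid());
		system(cmd);			/* nothing listed when auto-reaped */
		return 0;
	}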
From 67a2e213e7e937c41c52ab5bc46bf3f4de469f6e Mon Sep 17 00:00:00 2001
From: Rohit Vaswani
Date: Thu, 22 Oct 2015 13:32:11 -0700
Subject: [PATCH 2/9] mm: cma: fix incorrect type conversion for size during
 dma allocation

This was found during a userspace fuzzing test, when a large-size DMA
CMA allocation was made by a driver (like ion) through userspace. The
allocation path took the page count as a plain int, so a large enough
count was truncated in the conversion; KASAN caught the resulting
out-of-bounds memset():

  show_stack+0x10/0x1c
  dump_stack+0x74/0xc8
  kasan_report_error+0x2b0/0x408
  kasan_report+0x34/0x40
  __asan_storeN+0x15c/0x168
  memset+0x20/0x44
  __dma_alloc_coherent+0x114/0x18c

Use size_t for the page count throughout.

Signed-off-by: Rohit Vaswani
Acked-by: Greg Kroah-Hartman
Cc: Marek Szyprowski
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/base/dma-contiguous.c  | 2 +-
 include/linux/cma.h            | 2 +-
 include/linux/dma-contiguous.h | 4 ++--
 mm/cma.c                       | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index 950fff9ce453..a12ff9863d7e 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -187,7 +187,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
  * global one. Requires architecture specific dev_get_cma_area() helper
  * function.
  */
-struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int align)
 {
 	if (align > CONFIG_CMA_ALIGNMENT)
diff --git a/include/linux/cma.h b/include/linux/cma.h
index f7ef093ec49a..29f9e774ab76 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -26,6 +26,6 @@ extern int __init cma_declare_contiguous(phys_addr_t base,
 extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 					unsigned int order_per_bit,
 					struct cma **res_cma);
-extern struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align);
+extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align);
 extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
 #endif
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 569bbd039896..fec734df1524 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -111,7 +111,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size,
 	return ret;
 }
 
-struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int order);
 bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 				 int count);
@@ -144,7 +144,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size,
 }
 
 static inline
-struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
 				       unsigned int order)
 {
 	return NULL;
diff --git a/mm/cma.c b/mm/cma.c
index e7d1db533025..4eb56badf37e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -361,7 +361,7 @@ err:
  * This function allocates part of contiguous memory on specific
  * contiguous memory area.
  */
-struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
+struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
 {
 	unsigned long mask, offset, pfn, start = 0;
 	unsigned long bitmap_maxno, bitmap_no, bitmap_count;
@@ -371,7 +371,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
 	if (!cma || !cma->count)
 		return NULL;
 
-	pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+	pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
 		 count, align);
 
 	if (!count)
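A standalone sketch of the int truncation the size_t change above guards
against (hypothetical numbers; 64-bit userland and PAGE_SHIFT == 12
assumed, not the fuzzer's actual input):

	#include <stddef.h>
	#include <stdio.h>

	/* stand-in for the old prototype, which took the page count as int */
	static long alloc_npages(int count)
	{
		return count;
	}

	int main(void)
	{
		size_t size = 1ULL << 43;	/* bogus 8TB request */
		size_t count = size >> 12;	/* 2^31 pages */

		/* the size_t -> int conversion wraps on common targets:
		 * prints "as size_t 2147483648, through int -2147483648" */
		printf("as size_t %zu, through int %ld\n",
		       count, alloc_npages(count));
		return 0;
	}

A count mangled this way makes the allocator reserve something other than
what the caller asked for, while the caller goes on to write the full
original size -- one way to end up at the out-of-bounds store KASAN
flagged.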
From 41192a2d6a7f4cd6af9fc2f8edbbf24b2694f2f6 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky
Date: Thu, 22 Oct 2015 13:32:13 -0700
Subject: [PATCH 3/9] MAINTAINERS: add Sergey as zsmalloc reviewer

Nominate myself as a zsmalloc reviewer.

Signed-off-by: Sergey Senozhatsky
Cc: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index fb7d2e4af200..d1116d9ea6ae 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11674,6 +11674,7 @@ F:	drivers/tty/serial/zs.*
 ZSMALLOC COMPRESSED SLAB MEMORY ALLOCATOR
 M:	Minchan Kim
 M:	Nitin Gupta
+R:	Sergey Senozhatsky
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	mm/zsmalloc.c

From b8fa0efa01109e294e9be610465c324f771cb5ba Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas
Date: Thu, 22 Oct 2015 13:32:16 -0700
Subject: [PATCH 4/9] mailmap: update Javier Martinez Canillas' email

The get_maintainer script still reports my old Collabora email based on
old commits, but that address no longer exists, so update mailmap to
report my current email and avoid people sending mail to the old
address.

Signed-off-by: Javier Martinez Canillas
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 .mailmap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index 4b31af54ccd5..b1e9a97653dc 100644
--- a/.mailmap
+++ b/.mailmap
@@ -59,6 +59,7 @@ James Bottomley
 James Bottomley
 James E Wilson
 James Ketrenos
+Javier Martinez Canillas <javier@osg.samsung.com> <javier.martinez@collabora.co.uk>
 Jean Tourrilhes
 Jeff Garzik
 Jens Axboe

From 47aee4d8e314384807e98b67ade07f6da476aa75 Mon Sep 17 00:00:00 2001
From: Minchan Kim
Date: Thu, 22 Oct 2015 13:32:19 -0700
Subject: [PATCH 5/9] thp: use is_zero_pfn() only after pte_present() check

Use is_zero_pfn() on pteval only after the pte_present() check on pteval
(it might be a better idea to introduce is_zero_pte(), which checks
pte_present() first).

Otherwise, if the pte is a swap or migration entry whose pte_pfn() result
happens to equal zero_pfn, we lose the user's data in
__collapse_huge_page_copy(). So if you're unlucky, the application
segfaults and you could finally see the message below on exit:

  BUG: Bad rss-counter state mm:ffff88007f099300 idx:2 val:3

Fixes: ca0984caa823 ("mm: incorporate zero pages into transparent huge pages")
Signed-off-by: Minchan Kim
Reviewed-by: Andrea Arcangeli
Acked-by: Kirill A. Shutemov
Cc: Mel Gorman
Acked-by: Vlastimil Babka
Cc: Hugh Dickins
Cc: Rik van Riel
Cc: <stable@vger.kernel.org> [4.1+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4b06b8db9df2..bbac913f96bc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2206,7 +2206,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
-		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+		if (pte_none(pteval) || (pte_present(pteval) &&
+				is_zero_pfn(pte_pfn(pteval)))) {
 			if (!userfaultfd_armed(vma) &&
 			    ++none_or_zero <= khugepaged_max_ptes_none)
 				continue;
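The is_zero_pte() helper floated in patch 5's message would bundle the
ordering into one place so callers cannot get it wrong -- a kernel-style
sketch only, not something this series adds:

	/*
	 * Hypothetical helper: a swap or migration entry is not present,
	 * so its bits must not be interpreted as a pfn.
	 */
	static inline bool is_zero_pte(pte_t pte)
	{
		return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
	}

With it, the condition in __collapse_huge_page_isolate() would read
"if (pte_none(pteval) || is_zero_pte(pteval))".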
From 296291cdd1629c308114504b850dc343eabc2782 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Thu, 22 Oct 2015 13:32:21 -0700
Subject: [PATCH 6/9] mm: make sendfile(2) killable

Currently the simple program below issues a sendfile(2) system call
which takes about 62 days to complete in my test KVM instance.

	int fd;
	off_t off = 0;

	fd = open("file", O_RDWR | O_TRUNC | O_SYNC | O_CREAT, 0644);
	ftruncate(fd, 2);
	lseek(fd, 0, SEEK_END);
	sendfile(fd, fd, &off, 0xfffffff);

Now you should not ask the kernel to do stupid stuff like copying 256MB
in 2-byte chunks and calling fsync(2) after each chunk, but if you do,
the sysadmin should have a way to stop you.

We actually do have a check for fatal_signal_pending() in
generic_perform_write() which triggers in this path. However, because we
always succeed in writing something before the check is done, we return a
value > 0 from generic_perform_write() and thus the information about the
signal gets lost.

Fix the problem by doing the signal check before writing anything. That
way generic_perform_write() returns -EINTR, the error gets propagated up,
and the sendfile loop terminates early.

Signed-off-by: Jan Kara
Reported-by: Dmitry Vyukov
Cc: Al Viro
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/filemap.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 1cc5467cf36c..327910c2400c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2488,6 +2488,11 @@ again:
 			break;
 		}
 
+		if (fatal_signal_pending(current)) {
+			status = -EINTR;
+			break;
+		}
+
 		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
 						&page, &fsdata);
 		if (unlikely(status < 0))
@@ -2525,10 +2530,6 @@ again:
 		written += copied;
 
 		balance_dirty_pages_ratelimited(mapping);
-		if (fatal_signal_pending(current)) {
-			status = -EINTR;
-			break;
-		}
 	} while (iov_iter_count(i));
 
 	return written ? written : status;

From 3f181b4d8652f7bcd7e9932c7307b8ecd4d87cf6 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin
Date: Thu, 22 Oct 2015 13:32:24 -0700
Subject: [PATCH 7/9] lib/Kconfig.debug: disable -Wframe-larger-than warnings
 with KASAN=y

When the kernel is compiled with KASAN=y, GCC adds redzones for each
variable on the stack. This enlarges functions' stack frames and causes:

	'warning: the frame size of X bytes is larger than Y bytes'

The worst case I've seen so far is the following:

	../net/wireless/nl80211.c: In function `nl80211_send_wiphy':
	../net/wireless/nl80211.c:1731:1: warning: the frame size of 5448 bytes is larger than 2048 bytes [-Wframe-larger-than=]

That kind of warning becomes useless with KASAN=y. It doesn't necessarily
indicate that there is some problem in the code, thus we should turn it
off. (The KASAN=y stack size is increased from 16k to 32k for this
reason.)

Signed-off-by: Andrey Ryabinin
Reported-by: Fengguang Wu
Acked-by: Abylay Ospan
Cc: Andi Kleen
Cc: Ingo Molnar
Cc: Mauro Carvalho Chehab
Cc: Kozlov Sergey
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/Kconfig.debug | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ab76b99adc85..1d1521c26302 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -197,6 +197,7 @@ config ENABLE_MUST_CHECK
 config FRAME_WARN
 	int "Warn for stack frames larger than (needs gcc 4.4)"
 	range 0 8192
+	default 0 if KASAN
 	default 1024 if !64BIT
 	default 2048 if 64BIT
 	help
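For reference, the warning that patch 7's "default 0" silences is GCC's
-Wframe-larger-than=. A minimal standalone file that trips it at the old
64-bit threshold even without KASAN's redzones (hypothetical example, not
kernel code):

	/* gcc -O2 -Wframe-larger-than=2048 -c frame.c
	 * frame.c: warning: the frame size of X bytes is larger
	 * than 2048 bytes [-Wframe-larger-than=]
	 */
	void fill(char *buf);

	int big_frame(void)
	{
		char buf[4096];	/* 4k of locals blows the 2k budget */

		fill(buf);	/* keeps buf from being optimized away */
		return buf[0];
	}

With KASAN=y every such variable is padded with redzones, so frames grow
past any fixed threshold; a FRAME_WARN of 0 turns the warning off
entirely.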
From bb387002693ed28b2bb0408c5dec65521b71e5f1 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 22 Oct 2015 13:32:27 -0700
Subject: [PATCH 8/9] fault-inject: fix inverted interval/probability values
 in printk

In the printk, "interval" displays the probability and vice versa.

Fixes: 6adc4a22f20bb ("fault-inject: add ratelimit option")
Signed-off-by: Florian Westphal
Acked-by: Akinobu Mita
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/fault-inject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index f1cdeb024d17..6a823a53e357 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -44,7 +44,7 @@ static void fail_dump(struct fault_attr *attr)
 		printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure.\n"
 		       "name %pd, interval %lu, probability %lu, "
 		       "space %d, times %d\n", attr->dname,
-		       attr->probability, attr->interval,
+		       attr->interval, attr->probability,
 		       atomic_read(&attr->space),
 		       atomic_read(&attr->times));
 		if (attr->verbose > 1)

From b67de018b37a97548645a879c627d4188518e907 Mon Sep 17 00:00:00 2001
From: Joseph Qi
Date: Thu, 22 Oct 2015 13:32:29 -0700
Subject: [PATCH 9/9] ocfs2/dlm: unlock lockres spinlock before dlm_lockres_put

dlm_lockres_put() calls dlm_lockres_release() when it drops the last
reference, which may in turn call dlm_print_one_lock_resource() and take
the lockres spinlock. So unlock the lockres spinlock before
dlm_lockres_put() to avoid a deadlock.

Signed-off-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmmaster.c   | 3 ++-
 fs/ocfs2/dlm/dlmrecovery.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ee5aa4daaea0..ce38b4ccc9ab 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1658,12 +1658,13 @@ send_response:
 		if (ret < 0) {
 			mlog(ML_ERROR, "failed to dispatch assert master work\n");
 			response = DLM_MASTER_RESP_ERROR;
+			spin_unlock(&res->spinlock);
 			dlm_lockres_put(res);
 		} else {
 			dispatched = 1;
 			__dlm_lockres_grab_inflight_worker(dlm, res);
+			spin_unlock(&res->spinlock);
 		}
-		spin_unlock(&res->spinlock);
 	} else {
 		if (res)
 			dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 3d90ad7ff91f..58eaa5c0d387 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1723,8 +1723,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 		} else {
 			dispatched = 1;
 			__dlm_lockres_grab_inflight_worker(dlm, res);
+			spin_unlock(&res->spinlock);
 		}
-		spin_unlock(&res->spinlock);
 	} else {
 		/* put.. incase we are not the master */
 		spin_unlock(&res->spinlock);
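The deadlock shape patch 9 removes, reduced to its essentials (a sketch;
in the real code the recursion into the lock only happens on the final
reference, via dlm_lockres_release() -> dlm_print_one_lock_resource()):

	/* broken ordering: AA self-deadlock on the final put */
	spin_lock(&res->spinlock);
	/* ... */
	dlm_lockres_put(res);		/* last ref -> release path may take
					 * res->spinlock again: deadlock */
	spin_unlock(&res->spinlock);	/* never reached */

	/* fixed ordering: drop the lock, then drop the reference */
	spin_lock(&res->spinlock);
	/* ... */
	spin_unlock(&res->spinlock);
	dlm_lockres_put(res);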