From 823dd3224a07f618d652a7743c9603222d019de3 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 5 Feb 2016 15:36:05 -0800 Subject: [PATCH 01/22] signals: avoid random wakeups in sigsuspend() A random wakeup can get us out of sigsuspend() without TIF_SIGPENDING being set. Avoid that by making sure we were signaled, like sys_pause() does. Signed-off-by: Sasha Levin Acked-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Cc: Dmitry Vyukov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a972fd..0508544c8ced 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!signal_pending(current)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } set_restore_sigmask(); return -ERESTARTNOHAND; } From 9c5a05bc350c29285a89d6e1f3b68c8c7f308207 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 5 Feb 2016 15:36:08 -0800 Subject: [PATCH 02/22] block: fix pfn_mkwrite() DAX fault handler Previously the pfn_mkwrite() fault handler for raw block devices called bldev_dax_fault() -> __dax_fault() to do a full DAX page fault. Really what the pfn_mkwrite() fault handler needs to do is call dax_pfn_mkwrite() to make sure that the radix tree entry for the given PTE is marked as dirty so that a follow-up fsync or msync call will flush it durably to media. Fixes: 5a023cdba50c ("block: enable dax for raw block devices") Signed-off-by: Ross Zwisler Cc: Alexander Viro Cc: Dan Williams Cc: Dave Chinner Reviewed-by: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index afb437484362..39b3a174a425 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1730,6 +1730,12 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return __dax_fault(vma, vmf, blkdev_get_block, NULL); } +static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + return dax_pfn_mkwrite(vma, vmf); +} + static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { @@ -1739,7 +1745,7 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, static const struct vm_operations_struct blkdev_dax_vm_ops = { .fault = blkdev_dax_fault, .pmd_fault = blkdev_dax_pmd_fault, - .pfn_mkwrite = blkdev_dax_fault, + .pfn_mkwrite = blkdev_dax_pfn_mkwrite, }; static const struct vm_operations_struct blkdev_default_vm_ops = { From af1ddcb5c63cfb9744c29e88db0bab6bb25ad76b Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Fri, 5 Feb 2016 15:36:10 -0800 Subject: [PATCH 03/22] m32r: fix build failure due to SMP and MMU One of the randconfig build failed with the error: arch/m32r/kernel/smp.c: In function 'smp_flush_tlb_mm': arch/m32r/kernel/smp.c:283:20: error: subscripted value is neither array nor pointer nor vector mmc = &mm->context[cpu_id]; ^ arch/m32r/kernel/smp.c: In function 'smp_flush_tlb_page': arch/m32r/kernel/smp.c:353:20: error: subscripted value is neither array nor pointer nor vector mmc = &mm->context[cpu_id]; ^ arch/m32r/kernel/smp.c: In function 'smp_invalidate_interrupt': arch/m32r/kernel/smp.c:479:41: error: subscripted value is neither array nor pointer nor vector unsigned long *mmc = &flush_mm->context[cpu_id]; It turned out that CONFIG_SMP was defined but CONFIG_MMU was not defined. But arch/m32r/include/asm/mmu.h only defines mm_context_t as an array when both CONFIG_SMP and CONFIG_MMU are defined. And arch/m32r/kernel/smp.c is always using context as an array. So without MMU SMP can not work. Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 836ac5a963c8..2841c0a3fd3b 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -276,6 +276,7 @@ source "kernel/Kconfig.preempt" config SMP bool "Symmetric multi-processing support" + depends on MMU ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, say N. If you have a system with more From acf128d048c76aeaa99646bce3488d73be215f70 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 5 Feb 2016 15:36:13 -0800 Subject: [PATCH 04/22] mm: validate_mm browse_rb SMP race condition The mmap_sem for reading in validate_mm called from expand_stack is not enough to prevent the argumented rbtree rb_subtree_gap information to change from under us because expand_stack may be running from other threads concurrently which will hold the mmap_sem for reading too. The argumented rbtree is updated with vma_gap_update under the page_table_lock so use it in browse_rb() too to avoid false positives. Signed-off-by: Andrea Arcangeli Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index cfc0cdca421e..918c9ec5043f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -390,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) } #ifdef CONFIG_DEBUG_VM_RB -static int browse_rb(struct rb_root *root) +static int browse_rb(struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; int i = 0, j, bug = 0; struct rb_node *nd, *pn = NULL; unsigned long prev = 0, pend = 0; @@ -414,12 +415,14 @@ static int browse_rb(struct rb_root *root) vma->vm_start, vma->vm_end); bug = 1; } + spin_lock(&mm->page_table_lock); if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; } + spin_unlock(&mm->page_table_lock); i++; pn = nd; prev = vma->vm_start; @@ -475,7 +478,7 @@ static void validate_mm(struct mm_struct *mm) mm->highest_vm_end, highest_address); bug = 1; } - i = browse_rb(&mm->mm_rb); + i = browse_rb(mm); if (i != mm->map_count) { if (i != -1) pr_emerg("map_count %d rb %d\n", mm->map_count, i); From d7ce36924344ace0dbdc855b1206cacc46b36d45 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Feb 2016 15:36:16 -0800 Subject: [PATCH 05/22] dump_stack: avoid potential deadlocks Some servers experienced fatal deadlocks because of a combination of bugs, leading to multiple cpus calling dump_stack(). The checksumming bug was fixed in commit 34ae6a1aa054 ("ipv6: update skb->csum when CE mark is propagated"). The second problem is a faulty locking in dump_stack() CPU1 runs in process context and calls dump_stack(), grabs dump_lock. CPU2 receives a TCP packet under softirq, grabs socket spinlock, and call dump_stack() from netdev_rx_csum_fault(). dump_stack() spins on atomic_cmpxchg(&dump_lock, -1, 2), since dump_lock is owned by CPU1 While dumping its stack, CPU1 is interrupted by a softirq, and happens to process a packet for the TCP socket locked by CPU2. CPU1 spins forever in spin_lock() : deadlock Stack trace on CPU1 looked like : NMI backtrace for cpu 1 RIP: _raw_spin_lock+0x25/0x30 ... Call Trace: tcp_v6_rcv+0x243/0x620 ip6_input_finish+0x11f/0x330 ip6_input+0x38/0x40 ip6_rcv_finish+0x3c/0x90 ipv6_rcv+0x2a9/0x500 process_backlog+0x461/0xaa0 net_rx_action+0x147/0x430 __do_softirq+0x167/0x2d0 call_softirq+0x1c/0x30 do_softirq+0x3f/0x80 irq_exit+0x6e/0xc0 smp_call_function_single_interrupt+0x35/0x40 call_function_single_interrupt+0x6a/0x70 printk+0x4d/0x4f printk_address+0x31/0x33 print_trace_address+0x33/0x3c print_context_stack+0x7f/0x119 dump_trace+0x26b/0x28e show_trace_log_lvl+0x4f/0x5c show_stack_log_lvl+0x104/0x113 show_stack+0x42/0x44 dump_stack+0x46/0x58 netdev_rx_csum_fault+0x38/0x3c __skb_checksum_complete_head+0x6e/0x80 __skb_checksum_complete+0x11/0x20 tcp_rcv_established+0x2bd5/0x2fd0 tcp_v6_do_rcv+0x13c/0x620 sk_backlog_rcv+0x15/0x30 release_sock+0xd2/0x150 tcp_recvmsg+0x1c1/0xfc0 inet_recvmsg+0x7d/0x90 sock_recvmsg+0xaf/0xe0 ___sys_recvmsg+0x111/0x3b0 SyS_recvmsg+0x5c/0xb0 system_call_fastpath+0x16/0x1b Fixes: b58d977432c8 ("dump_stack: serialize the output from dump_stack()") Signed-off-by: Eric Dumazet Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dump_stack.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 6745c6230db3..c30d07e99dba 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -25,6 +25,7 @@ static atomic_t dump_lock = ATOMIC_INIT(-1); asmlinkage __visible void dump_stack(void) { + unsigned long flags; int was_locked; int old; int cpu; @@ -33,9 +34,8 @@ asmlinkage __visible void dump_stack(void) * Permit this cpu to perform nested stack dumps while serialising * against other CPUs */ - preempt_disable(); - retry: + local_irq_save(flags); cpu = smp_processor_id(); old = atomic_cmpxchg(&dump_lock, -1, cpu); if (old == -1) { @@ -43,6 +43,7 @@ retry: } else if (old == cpu) { was_locked = 1; } else { + local_irq_restore(flags); cpu_relax(); goto retry; } @@ -52,7 +53,7 @@ retry: if (!was_locked) atomic_set(&dump_lock, -1); - preempt_enable(); + local_irq_restore(flags); } #else asmlinkage __visible void dump_stack(void) From 1f1ffb8a151e8c59899c78019f76cb8f64be13f5 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 5 Feb 2016 15:36:19 -0800 Subject: [PATCH 06/22] memblock: don't mark memblock_phys_mem_size() as __init At the moment memblock_phys_mem_size() is marked as __init, and so is discarded after boot. This is different from most of the memblock functions which are marked __init_memblock, and are only discarded after boot if memory hotplug is not configured. To allow for upcoming code which will need memblock_phys_mem_size() in the hotplug path, change it from __init to __init_memblock. Signed-off-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index d2ed81e59a94..dd7989929f13 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) * Remaining API functions */ -phys_addr_t __init memblock_phys_mem_size(void) +phys_addr_t __init_memblock memblock_phys_mem_size(void) { return memblock.memory.total_size; } From 1ce221036b83288358d8d5d5ff1784657b3ab4e9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 5 Feb 2016 15:36:21 -0800 Subject: [PATCH 07/22] mm/Kconfig: correct description of DEFERRED_STRUCT_PAGE_INIT The description mentions kswapd threads, while the deferred struct page initialization is actually done by one-off "pgdatinitX" threads. Fix the description so that potentially users are not confused about pgdatinit threads using CPU after boot instead of kswapd. Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 97a4e06b15c0..03cbfa072f42 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT bool config DEFERRED_STRUCT_PAGE_INIT - bool "Defer initialisation of struct pages to kswapd" + bool "Defer initialisation of struct pages to kthreads" default n depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG @@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up a subset of memmap at boot and then initialise the rest in parallel - when kswapd starts. This has a potential performance impact on - processes running early in the lifetime of the systemm until kswapd - finishes the initialisation. + by starting one-off "pgdatinitX" kernel thread for each node X. This + has a potential performance impact on processes running early in the + lifetime of the system until these kthreads finish the + initialisation. config IDLE_PAGE_TRACKING bool "Enable idle page tracking" From f01f17d3705bb6081c9e5728078f64067982be36 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 5 Feb 2016 15:36:24 -0800 Subject: [PATCH 08/22] mm, vmstat: make quiet_vmstat lighter Mike has reported a considerable overhead of refresh_cpu_vm_stats from the idle entry during pipe test: 12.89% [kernel] [k] refresh_cpu_vm_stats.isra.12 4.75% [kernel] [k] __schedule 4.70% [kernel] [k] mutex_unlock 3.14% [kernel] [k] __switch_to This is caused by commit 0eb77e988032 ("vmstat: make vmstat_updater deferrable again and shut down on idle") which has placed quiet_vmstat into cpu_idle_loop. The main reason here seems to be that the idle entry has to get over all zones and perform atomic operations for each vmstat entry even though there might be no per cpu diffs. This is a pointless overhead for _each_ idle entry. Make sure that quiet_vmstat is as light as possible. First of all it doesn't make any sense to do any local sync if the current cpu is already set in oncpu_stat_off because vmstat_update puts itself there only if there is nothing to do. Then we can check need_update which should be a cheap way to check for potential per-cpu diffs and only then do refresh_cpu_vm_stats. The original patch also did cancel_delayed_work which we are not doing here. There are two reasons for that. Firstly cancel_delayed_work from idle context will blow up on RT kernels (reported by Mike): CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.5.0-rt3 #7 Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013 Call Trace: dump_stack+0x49/0x67 ___might_sleep+0xf5/0x180 rt_spin_lock+0x20/0x50 try_to_grab_pending+0x69/0x240 cancel_delayed_work+0x26/0xe0 quiet_vmstat+0x75/0xa0 cpu_idle_loop+0x38/0x3e0 cpu_startup_entry+0x13/0x20 start_secondary+0x114/0x140 And secondly, even on !RT kernels it might add some non trivial overhead which is not necessary. Even if the vmstat worker wakes up and preempts idle then it will be most likely a single shot noop because the stats were already synced and so it would end up on the oncpu_stat_off anyway. We just need to teach both vmstat_shepherd and vmstat_update to stop scheduling the worker if there is nothing to do. [mgalbraith@suse.de: cancel pending work of the cpu_stat_off CPU] Signed-off-by: Michal Hocko Reported-by: Mike Galbraith Acked-by: Christoph Lameter Signed-off-by: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 68 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 40b2c74ddf16..1543f64df3e6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w) * Counters were updated so we expect more updates * to occur in the future. Keep on running the * update worker thread. + * If we were marked on cpu_stat_off clear the flag + * so that vmstat_shepherd doesn't schedule us again. */ - queue_delayed_work_on(smp_processor_id(), vmstat_wq, - this_cpu_ptr(&vmstat_work), - round_jiffies_relative(sysctl_stat_interval)); + if (!cpumask_test_and_clear_cpu(smp_processor_id(), + cpu_stat_off)) { + queue_delayed_work_on(smp_processor_id(), vmstat_wq, + this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + } } else { /* * We did not update any counters so the app may be in @@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w) * until the diffs stay at zero. The function is used by NOHZ and can only be * invoked when tick processing is not active. */ -void quiet_vmstat(void) -{ - if (system_state != SYSTEM_RUNNING) - return; - - do { - if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) - cancel_delayed_work(this_cpu_ptr(&vmstat_work)); - - } while (refresh_cpu_vm_stats(false)); -} - /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1452,6 +1445,30 @@ static bool need_update(int cpu) return false; } +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + /* + * If we are already in hands of the shepherd then there + * is nothing for us to do here. + */ + if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + return; + + if (!need_update(smp_processor_id())) + return; + + /* + * Just refresh counters and do not care about the pending delayed + * vmstat_update. It doesn't fire that often to matter and canceling + * it would be too expensive from this path. + * vmstat_shepherd will take care about that for us. + */ + refresh_cpu_vm_stats(false); +} + /* * Shepherd worker thread that checks the @@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ - for_each_cpu(cpu, cpu_stat_off) - if (need_update(cpu) && - cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - - queue_delayed_work_on(cpu, vmstat_wq, - &per_cpu(vmstat_work, cpu), 0); + for_each_cpu(cpu, cpu_stat_off) { + struct delayed_work *dw = &per_cpu(vmstat_work, cpu); + if (need_update(cpu)) { + if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + queue_delayed_work_on(cpu, vmstat_wq, dw, 0); + } else { + /* + * Cancel the work if quiet_vmstat has put this + * cpu on cpu_stat_off because the work item might + * be still scheduled + */ + cancel_delayed_work(dw); + } + } put_online_cpus(); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); - } static void __init start_shepherd_timer(void) From ccde8bd4014eb2f01102f7a64f0fad3df193b758 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 5 Feb 2016 15:36:27 -0800 Subject: [PATCH 09/22] vmstat: make vmstat_update deferrable Commit 0eb77e988032 ("vmstat: make vmstat_updater deferrable again and shut down on idle") made vmstat_shepherd deferrable. vmstat_update itself is still useing standard timer which might interrupt idle task. This is possible because "mm, vmstat: make quiet_vmstat lighter" removed cancel_delayed_work from the quiet_vmstat. Change vmstat_work to use DEFERRABLE_WORK to prevent from pointless wakeups from the idle context. Acked-by: Christoph Lameter Signed-off-by: Michal Hocko Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 1543f64df3e6..084c6725b373 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1512,7 +1512,7 @@ static void __init start_shepherd_timer(void) int cpu; for_each_possible_cpu(cpu) - INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) From 564e81a57f9788b1475127012e0fd44e9049e342 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 5 Feb 2016 15:36:30 -0800 Subject: [PATCH 10/22] mm, vmstat: fix wrong WQ sleep when memory reclaim doesn't make any progress Jan Stancek has reported that system occasionally hanging after "oom01" testcase from LTP triggers OOM. Guessing from a result that there is a kworker thread doing memory allocation and the values between "Node 0 Normal free:" and "Node 0 Normal:" differs when hanging, vmstat is not up-to-date for some reason. According to commit 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress"), it meant to force the kworker thread to take a short sleep, but it by error used schedule_timeout(1). We missed that schedule_timeout() in state TASK_RUNNING doesn't do anything. Fix it by using schedule_timeout_uninterruptible(1) which forces the kworker thread to take a short sleep in order to make sure that vmstat is up-to-date. Fixes: 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress") Signed-off-by: Tetsuo Handa Reported-by: Jan Stancek Acked-by: Michal Hocko Cc: Tejun Heo Cc: Cristopher Lameter Cc: Joonsoo Kim Cc: Arkadiusz Miskiewicz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index cc5d29d2da9b..926c76d56388 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * here rather than calling cond_resched(). */ if (current->flags & PF_WQ_WORKER) - schedule_timeout(1); + schedule_timeout_uninterruptible(1); else cond_resched(); From 77bf45e78050790d8f7fc30b87a0ca674bf6265a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 5 Feb 2016 15:36:33 -0800 Subject: [PATCH 11/22] mempolicy: do not try to queue pages from !vma_migratable() Maybe I miss some point, but I don't see a reason why we try to queue pages from non migratable VMAs. This testcase steps on VM_BUG_ON_PAGE() in isolate_lru_page(): #include #include #include #include #include #define SIZE 0x2000 int foo; int main() { int fd; char *p; unsigned long mask = 2; fd = open("/dev/sg0", O_RDWR); p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); /* Faultin pages */ foo = p[0] + p[0x1000]; mbind(p, SIZE, MPOL_BIND, &mask, 4, MPOL_MF_MOVE | MPOL_MF_STRICT); return 0; } The only case when we can queue pages from such VMA is MPOL_MF_STRICT plus MPOL_MF_MOVE or MPOL_MF_MOVE_ALL for VMA which has pages on LRU, but gfp mask is not sutable for migaration (see mapping_gfp_mask() check in vma_migratable()). That's looks like a bug to me. Let's filter out non-migratable vma at start of queue_pages_test_walk() and go to queue_pages_pte_range() only if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL flag is set. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dmitry Vyukov Cc: Vlastimil Babka Cc: David Rientjes Cc: Naoya Horiguchi Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 27d135408a22..4c4187c0e1de 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -548,8 +548,7 @@ retry: goto retry; } - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(page, qp->pagelist, flags); + migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; - if (vma->vm_flags & VM_PFNMAP) + if (!vma_migratable(vma)) return 1; if (endvma > end) @@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma_migratable(vma) && - vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) change_prot_numa(vma, start, endvma); return 1; } - if ((flags & MPOL_MF_STRICT) || - ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) - /* queue pages from current vma */ + /* queue pages from current vma */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } From cf2a82ee432730ab25c21d054c67f296af4fc4bd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 5 Feb 2016 15:36:36 -0800 Subject: [PATCH 12/22] mm: downgrade VM_BUG in isolate_lru_page() to warning Calling isolate_lru_page() is wrong and shouldn't happen, but it not nessesary fatal: the page just will not be isolated if it's not on LRU. Let's downgrade the VM_BUG_ON_PAGE() to WARN_RATELIMIT(). Signed-off-by: Kirill A. Shutemov Cc: Dmitry Vyukov Cc: Vlastimil Babka Cc: David Rientjes Cc: Naoya Horiguchi Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index eb3dd37ccd7c..71b1c29948db 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page) int ret = -EBUSY; VM_BUG_ON_PAGE(!page_count(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); + WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (PageLRU(page)) { struct zone *zone = page_zone(page); From b4330afbed0cdceeba33c4945158c55771047e81 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Feb 2016 15:36:38 -0800 Subject: [PATCH 13/22] mm/hugetlb: fix gigantic page initialization/allocation Attempting to preallocate 1G gigantic huge pages at boot time with "hugepagesz=1G hugepages=1" on the kernel command line will prevent booting with the following: kernel BUG at mm/hugetlb.c:1218! When mapcount accounting was reworked, the setting of compound_mapcount_ptr in prep_compound_gigantic_page was overlooked. As a result, the validation of mapcount in free_huge_page fails. The "BUG_ON" checks in free_huge_page were also changed to "VM_BUG_ON_PAGE" to assist with debugging. Fixes: 53f9263baba69 ("mm: rework mapcount accounting to enable 4k mapping of THPs") Signed-off-by: Mike Kravetz Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Acked-by: David Rientjes Tested-by: Vlastimil Babka Cc: "Aneesh Kumar K.V" Cc: Jerome Marchand Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 12908dcf5831..d7a802427ea8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; - BUG_ON(page_count(page)); - BUG_ON(page_mapcount(page)); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(page_mapcount(page), page); restore_reserve = PagePrivate(page); ClearPagePrivate(page); @@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) set_page_count(p, 0); set_compound_head(p, page); } + atomic_set(compound_mapcount_ptr(page), -1); } /* From 080fe2068e1c7f19f565b30b78baf78edf16a980 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 5 Feb 2016 15:36:41 -0800 Subject: [PATCH 14/22] mm, hugetlb: don't require CMA for runtime gigantic pages Commit 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime") has added the runtime gigantic page allocation via alloc_contig_range(), making this support available only when CONFIG_CMA is enabled. Because it doesn't depend on MIGRATE_CMA pageblocks and the associated infrastructure, it is possible with few simple adjustments to require only CONFIG_MEMORY_ISOLATION instead of full CONFIG_CMA. After this patch, alloc_contig_range() and related functions are available and used for gigantic pages with just CONFIG_MEMORY_ISOLATION enabled. Note CONFIG_CMA selects CONFIG_MEMORY_ISOLATION. This allows supporting runtime gigantic pages without the CMA-specific checks in page allocator fastpaths. Signed-off-by: Vlastimil Babka Cc: Luiz Capitulino Cc: Kirill A. Shutemov Cc: Zhang Yanfei Cc: Yasuaki Ishimatsu Cc: Joonsoo Kim Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 4 ++-- include/linux/gfp.h | 6 +++--- mm/hugetlb.c | 2 +- mm/page_alloc.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 42982b26e32b..740d7ac03a55 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -173,10 +173,10 @@ static __init int setup_hugepagesz(char *opt) } __setup("hugepagesz=", setup_hugepagesz); -#ifdef CONFIG_CMA +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static __init int gigantic_pages_init(void) { - /* With CMA we can allocate gigantic pages at runtime */ + /* With compaction or CMA we can allocate gigantic pages at runtime */ if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 28ad5f6494b0..af1f2b24bbe4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -547,16 +547,16 @@ static inline bool pm_suspended_storage(void) } #endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_CMA - +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype); extern void free_contig_range(unsigned long pfn, unsigned nr_pages); +#endif +#ifdef CONFIG_CMA /* CMA stuff */ extern void init_cma_reserved_pageblock(struct page *page); - #endif #endif /* __LINUX_GFP_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d7a802427ea8..06ae13e869d0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea2c4d3e0c03..838ca8bb64f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6620,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page) return !has_unmovable_pages(zone, page, 0, true); } -#ifdef CONFIG_CMA +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) { From 012a4163be4559d406ed2d7f2aa459538eb8b686 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Fri, 5 Feb 2016 15:36:44 -0800 Subject: [PATCH 15/22] um: asm/page.h: remove the pte_high member from struct pte_t Commit 16da306849d0 ("um: kill pfn_t") introduced a compile warning for defconfig (SUBARCH=i386): arch/um/kernel/skas/mmu.c:38:206: warning: right shift count >= width of type [-Wshift-count-overflow] Aforementioned patch changes the definition of the phys_to_pfn() macro from ((pfn_t) ((p) >> PAGE_SHIFT)) to ((p) >> PAGE_SHIFT) This effectively changes the phys_to_pfn() expansion's type from unsigned long long to unsigned long. Through the callchain init_stub_pte() => mk_pte(), the expansion of phys_to_pfn() is (indirectly) fed into the 'phys' argument of the pte_set_val(pte, phys, prot) macro, eventually leading to (pte).pte_high = (phys) >> 32; This results in the warning from above. Since UML only deals with 32 bit addresses, the upper 32 bits from 'phys' used to be always zero anyway. Also, all page protection flags defined by UML don't use any bits beyond bit 9. Since the contents of a PTE are defined within architecture scope only, the ->pte_high member can be safely removed. Remove the ->pte_high member from struct pte_t. Rename ->pte_low to ->pte. Adapt the pte helper macros in arch/um/include/asm/page.h. Noteworthy is the pte_copy() macro where a smp_wmb() gets dropped. This write barrier doesn't seem to be paired with any read barrier though and thus, was useless anyway. Fixes: 16da306849d0 ("um: kill pfn_t") Signed-off-by: Nicolai Stange Cc: Dan Williams Cc: Richard Weinberger Cc: Nicolai Stange Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/asm/page.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index e13d41c392ae..f878bec23576 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -34,21 +34,18 @@ struct page; #if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT) -typedef struct { unsigned long pte_low, pte_high; } pte_t; +typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pmd; } pmd_t; typedef struct { unsigned long pgd; } pgd_t; -#define pte_val(x) ((x).pte_low | ((unsigned long long) (x).pte_high << 32)) +#define pte_val(p) ((p).pte) -#define pte_get_bits(pte, bits) ((pte).pte_low & (bits)) -#define pte_set_bits(pte, bits) ((pte).pte_low |= (bits)) -#define pte_clear_bits(pte, bits) ((pte).pte_low &= ~(bits)) -#define pte_copy(to, from) ({ (to).pte_high = (from).pte_high; \ - smp_wmb(); \ - (to).pte_low = (from).pte_low; }) -#define pte_is_zero(pte) (!((pte).pte_low & ~_PAGE_NEWPAGE) && !(pte).pte_high) -#define pte_set_val(pte, phys, prot) \ - ({ (pte).pte_high = (phys) >> 32; \ - (pte).pte_low = (phys) | pgprot_val(prot); }) +#define pte_get_bits(p, bits) ((p).pte & (bits)) +#define pte_set_bits(p, bits) ((p).pte |= (bits)) +#define pte_clear_bits(p, bits) ((p).pte &= ~(bits)) +#define pte_copy(to, from) ({ (to).pte = (from).pte; }) +#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) +#define pte_set_val(p, phys, prot) \ + ({ (p).pte = (phys) | pgprot_val(prot); }) #define pmd_val(x) ((x).pmd) #define __pmd(x) ((pmd_t) { (x) } ) From c95a51807b730e4681e2ecbdfd669ca52601959e Mon Sep 17 00:00:00 2001 From: xuejiufei Date: Fri, 5 Feb 2016 15:36:47 -0800 Subject: [PATCH 16/22] ocfs2/dlm: clear refmap bit of recovery lock while doing local recovery cleanup When recovery master down, dlm_do_local_recovery_cleanup() only remove the $RECOVERY lock owned by dead node, but do not clear the refmap bit. Which will make umount thread falling in dead loop migrating $RECOVERY to the dead node. Signed-off-by: xuejiufei Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index c5bdf02c213b..b94a425f0175 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2367,6 +2367,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) break; } } + dlm_lockres_clear_refmap_bit(dlm, res, + dead_node); spin_unlock(&res->spinlock); continue; } From 12352d3cae2cebe18805a91fab34b534d7444231 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 5 Feb 2016 15:36:50 -0800 Subject: [PATCH 17/22] mm: replace vma_lock_anon_vma with anon_vma_lock_read/write Sequence vma_lock_anon_vma() - vma_unlock_anon_vma() isn't safe if anon_vma appeared between lock and unlock. We have to check anon_vma first or call anon_vma_prepare() to be sure that it's here. There are only few users of these legacy helpers. Let's get rid of them. This patch fixes anon_vma lock imbalance in validate_mm(). Write lock isn't required here, read lock is enough. And reorders expand_downwards/expand_upwards: security_mmap_addr() and wrapping-around check don't have to be under anon vma lock. Link: https://lkml.kernel.org/r/CACT4Y+Y908EjM2z=706dv4rV6dWtxTLK9nFg9_7DhRMLppBo2g@mail.gmail.com Signed-off-by: Konstantin Khlebnikov Reported-by: Dmitry Vyukov Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 14 ----------- mm/mmap.c | 55 ++++++++++++++++++++------------------------ 2 files changed, 25 insertions(+), 44 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bdf597c4f0be..a07f42bedda3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -109,20 +109,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma) __put_anon_vma(anon_vma); } -static inline void vma_lock_anon_vma(struct vm_area_struct *vma) -{ - struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) - down_write(&anon_vma->root->rwsem); -} - -static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) -{ - struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) - up_write(&anon_vma->root->rwsem); -} - static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { down_write(&anon_vma->root->rwsem); diff --git a/mm/mmap.c b/mm/mmap.c index 918c9ec5043f..2f2415a7a688 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -459,12 +459,16 @@ static void validate_mm(struct mm_struct *mm) struct vm_area_struct *vma = mm->mmap; while (vma) { + struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; - vma_lock_anon_vma(vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - vma_unlock_anon_vma(vma); + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } + highest_address = vma->vm_end; vma = vma->vm_next; i++; @@ -2145,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - int error; + int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ + /* Guard against wrapping around to address 0. */ + if (address < PAGE_ALIGN(address+4)) + address = PAGE_ALIGN(address+4); + else + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. - * Also guard against wrapping around to address 0. */ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else { - vma_unlock_anon_vma(vma); - return -ENOMEM; - } - error = 0; + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { @@ -2188,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2211,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; @@ -2227,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - address &= PAGE_MASK; error = security_mmap_addr(address); if (error) return error; - vma_lock_anon_vma(vma); + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { @@ -2263,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma, * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2284,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma, } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; From ae026204a2b9060817503408906b35cefd824420 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 5 Feb 2016 15:36:53 -0800 Subject: [PATCH 18/22] thp: make deferred_split_scan() work again We need to iterate over split_queue, not local empty list to get anything split from the shrinker. Fixes: e3ae19535c66 ("thp: limit number of object to scan on deferred_split_scan()") Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36c070167b71..08fc0ba2207e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3482,7 +3482,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_lock_irqsave(&pgdata->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &list) { + list_for_each_safe(pos, next, &pgdata->split_queue) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); if (get_page_unless_zero(page)) { From d2b2a28e640489df64cc50123f4c3c47d9ad7d13 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 5 Feb 2016 15:36:55 -0800 Subject: [PATCH 19/22] dax: dirty inode only if required Signed-off-by: Dmitry Monakhov Reviewed-by: Jan Kara Reviewed-by: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index e0e9358baf35..fc2e3141138b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -358,7 +358,8 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index, void *entry; WARN_ON_ONCE(pmd_entry && !dirty); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + if (dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); spin_lock_irq(&mapping->tree_lock); From b14fd334ff3dc47b31c3592c166af5ea42b204d3 Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Fri, 5 Feb 2016 15:36:58 -0800 Subject: [PATCH 20/22] MAINTAINERS: trim the file triggers for ABI/API Commit ea8f8fc8631 ("MAINTAINERS: add linux-api for review of API/ABI changes") added file triggers for various paths that likely indicated API/ABI changes. However, catching all changes in Documentation/ABI/ and include/uapi/ produces a large volume of mail to linux-api, rather than only API/ABI changes. Drop those two entries, but leave include/linux/syscalls.h and kernel/sys_ni.c to catch syscall-related changes. [josh@joshtriplett.org: redid changelog] Signed-off-by: Michael Kerrisk Acked-by: Shuah khan Cc: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a9010d992da3..02a94eb64b52 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -223,9 +223,7 @@ F: drivers/scsi/aacraid/ ABI/API L: linux-api@vger.kernel.org -F: Documentation/ABI/ F: include/linux/syscalls.h -F: include/uapi/ F: kernel/sys_ni.c ABIT UGURU 1,2 HARDWARE MONITOR DRIVER From 732042821cfa106b3c20b9780e4c60fee9d68900 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 5 Feb 2016 15:37:01 -0800 Subject: [PATCH 21/22] radix-tree: fix oops after radix_tree_iter_retry Helper radix_tree_iter_retry() resets next_index to the current index. In following radix_tree_next_slot current chunk size becomes zero. This isn't checked and it tries to dereference null pointer in slot. Tagged iterator is fine because retry happens only at slot 0 where tag bitmask in iter->tags is filled with single bit. Fixes: 46437f9a554f ("radix-tree: fix race in gang lookup") Signed-off-by: Konstantin Khlebnikov Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Ohad Ben-Cohen Cc: Jeremiah Mahler Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 00b17c526c1f..f54be7082207 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -400,7 +400,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) * @iter: pointer to radix tree iterator * Returns: current chunk size */ -static __always_inline unsigned +static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { return iter->next_index - iter->index; @@ -434,9 +434,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return slot + offset + 1; } } else { - unsigned size = radix_tree_chunk_size(iter) - 1; + long size = radix_tree_chunk_size(iter); - while (size--) { + while (--size > 0) { slot++; iter->index++; if (likely(*slot)) From b6a515c8a0f6c2010a52793b43a79520bc95f994 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 5 Feb 2016 15:37:04 -0800 Subject: [PATCH 22/22] epoll: restrict EPOLLEXCLUSIVE to POLLIN and POLLOUT In the current implementation of the EPOLLEXCLUSIVE flag (added for 4.5-rc1), if epoll waiters create different POLL* sets and register them as exclusive against the same target fd, the current implementation will stop waking any further waiters once it finds the first idle waiter. This means that waiters could miss wakeups in certain cases. For example, when we wake up a pipe for reading we do: wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); So if one epoll set or epfd is added to pipe p with POLLIN and a second set epfd2 is added to pipe p with POLLRDNORM, only epfd may receive the wakeup since the current implementation will stop after it finds any intersection of events with a waiter that is blocked in epoll_wait(). We could potentially address this by requiring all epoll waiters that are added to p be required to pass the same set of POLL* events. IE the first EPOLL_CTL_ADD that passes EPOLLEXCLUSIVE establishes the set POLL* flags to be used by any other epfds that are added as EPOLLEXCLUSIVE. However, I think it might be somewhat confusing interface as we would have to reference count the number of users for that set, and so userspace would have to keep track of that count, or we would need a more involved interface. It also adds some shared state that we'd have store somewhere. I don't think anybody will want to bloat __wait_queue_head for this. I think what we could do instead, is to simply restrict EPOLLEXCLUSIVE such that it can only be specified with EPOLLIN and/or EPOLLOUT. So that way if the wakeup includes 'POLLIN' and not 'POLLOUT', we can stop once we hit the first idle waiter that specifies the EPOLLIN bit, since any remaining waiters that only have 'POLLOUT' set wouldn't need to be woken. Likewise, we can do the same thing if 'POLLOUT' is in the wakeup bit set and not 'POLLIN'. If both 'POLLOUT' and 'POLLIN' are set in the wake bit set (there is at least one example of this I saw in fs/pipe.c), then we just wake the entire exclusive list. Having both 'POLLOUT' and 'POLLIN' both set should not be on any performance critical path, so I think that's ok (in fs/pipe.c its in pipe_release()). We also continue to include EPOLLERR and EPOLLHUP by default in any exclusive set. Thus, the user can specify EPOLLERR and/or EPOLLHUP but is not required to do so. Since epoll waiters may be interested in other events as well besides EPOLLIN, EPOLLOUT, EPOLLERR and EPOLLHUP, these can still be added by doing a 'dup' call on the target fd and adding that as one normally would with EPOLL_CTL_ADD. Since I think that the POLLIN and POLLOUT events are what we are interest in balancing, I think that the 'dup' thing could perhaps be added to only one of the waiter threads. However, I think that EPOLLIN, EPOLLOUT, EPOLLERR and EPOLLHUP should be sufficient for the majority of use-cases. Since EPOLLEXCLUSIVE is intended to be used with a target fd shared among multiple epfds, where between 1 and n of the epfds may receive an event, it does not satisfy the semantics of EPOLLONESHOT where only 1 epfd would get an event. Thus, it is not allowed to be specified in conjunction with EPOLLEXCLUSIVE. EPOLL_CTL_MOD is also not allowed if the fd was previously added as EPOLLEXCLUSIVE. It seems with the limited number of flags to not be as interesting, but this could be relaxed at some further point. Signed-off-by: Jason Baron Tested-by: Madars Vitolins Cc: Michael Kerrisk Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Al Viro Cc: Eric Wong Cc: Jonathan Corbet Cc: Andy Lutomirski Cc: Hagen Paul Pfeifer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ae1dbcf47e97..cde60741cad2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -94,6 +94,11 @@ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) +#define EPOLLINOUT_BITS (POLLIN | POLLOUT) + +#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \ + EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) + /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 @@ -1068,7 +1073,22 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * wait list. */ if (waitqueue_active(&ep->wq)) { - ewake = 1; + if ((epi->event.events & EPOLLEXCLUSIVE) && + !((unsigned long)key & POLLFREE)) { + switch ((unsigned long)key & EPOLLINOUT_BITS) { + case POLLIN: + if (epi->event.events & POLLIN) + ewake = 1; + break; + case POLLOUT: + if (epi->event.events & POLLOUT) + ewake = 1; + break; + case 0: + ewake = 1; + break; + } + } wake_up_locked(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) @@ -1875,9 +1895,13 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ - if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD || - (op == EPOLL_CTL_ADD && is_file_epoll(tf.file)))) - goto error_tgt_fput; + if (epds.events & EPOLLEXCLUSIVE) { + if (op == EPOLL_CTL_MOD) + goto error_tgt_fput; + if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || + (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) + goto error_tgt_fput; + } /* * At this point it is safe to assume that the "private_data" contains @@ -1950,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP; - error = ep_modify(ep, epi, &epds); + if (!(epi->event.events & EPOLLEXCLUSIVE)) { + epds.events |= POLLERR | POLLHUP; + error = ep_modify(ep, epi, &epds); + } } else error = -ENOENT; break;