From 1e3921471354244f70fe268586ff94a97a6dd4df Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Thu, 2 Nov 2017 15:59:29 -0700
Subject: [PATCH 1/7] userfaultfd: hugetlbfs: prevent UFFDIO_COPY to fill beyond the end of i_size

This oops:

  kernel BUG at fs/hugetlbfs/inode.c:484!
  RIP: remove_inode_hugepages+0x3d0/0x410
  Call Trace:
    hugetlbfs_setattr+0xd9/0x130
    notify_change+0x292/0x410
    do_truncate+0x65/0xa0
    do_sys_ftruncate.constprop.3+0x11a/0x180
    SyS_ftruncate+0xe/0x10
    tracesys+0xd9/0xde

was caused by the lack of an i_size check in hugetlb_mcopy_atomic_pte.

mmap() can still succeed beyond the end of the i_size after vmtruncate
zapped vmas in those ranges, but the faults must not succeed, and that
includes UFFDIO_COPY.

We could differentiate the retval to userland to represent a SIGBUS like
a page fault would do (vs SIGSEGV), but it doesn't seem very useful and
we'd need to pick a random retval, as there's no meaningful syscall retval
that would differentiate SIGSEGV from SIGBUS; there's just -EFAULT.

Link: http://lkml.kernel.org/r/20171016223914.2421-2-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli
Reviewed-by: Mike Kravetz
Cc: Mike Rapoport
Cc: "Dr. David Alan Gilbert"
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 424b0ef08a60..2d2ff5e8bf2b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3984,6 +3984,9 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    unsigned long src_addr,
 			    struct page **pagep)
 {
+	struct address_space *mapping;
+	pgoff_t idx;
+	unsigned long size;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	struct hstate *h = hstate_vma(dst_vma);
 	pte_t _dst_pte;
@@ -4021,13 +4024,24 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	__SetPageUptodate(page);
 	set_page_huge_active(page);
 
+	mapping = dst_vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
 	/*
 	 * If shared, add to page cache
 	 */
 	if (vm_shared) {
-		struct address_space *mapping = dst_vma->vm_file->f_mapping;
-		pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+		size = i_size_read(mapping->host) >> huge_page_shift(h);
+		ret = -EFAULT;
+		if (idx >= size)
+			goto out_release_nounlock;
+		/*
+		 * Serialization between remove_inode_hugepages() and
+		 * huge_add_to_page_cache() below happens through the
+		 * hugetlb_fault_mutex_table that here must be held by
+		 * the caller.
+		 */
 		ret = huge_add_to_page_cache(page, mapping, idx);
 		if (ret)
 			goto out_release_nounlock;
@@ -4036,6 +4050,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
 	spin_lock(ptl);
 
+	/*
+	 * Recheck the i_size after holding PT lock to make sure not
+	 * to leave any page mapped (as page_mapped()) beyond the end
+	 * of the i_size (remove_inode_hugepages() is strict about
+	 * enforcing that). If we bail out here, we'll also leave a
+	 * page in the radix tree in the vm_shared case beyond the end
+	 * of the i_size, but remove_inode_hugepages() will take care
+	 * of it as soon as we drop the hugetlb_fault_mutex_table.
+	 */
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
+	ret = -EFAULT;
+	if (idx >= size)
+		goto out_release_unlock;
+
 	ret = -EEXIST;
 	if (!huge_pte_none(huge_ptep_get(dst_pte)))
 		goto out_release_unlock;
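For illustration, the new check boils down to a huge-page-granular bounds
test against i_size. Below is a minimal standalone sketch of that test,
not kernel code; the helper name and the numbers in main() are made up:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the bounds test the patch adds: the target huge-page index
 * must lie below i_size expressed in huge pages, otherwise the
 * UFFDIO_COPY path bails out with -EFAULT instead of instantiating a
 * page past EOF.
 */
static bool idx_beyond_eof(uint64_t i_size, unsigned int huge_page_shift, uint64_t idx)
{
	uint64_t size = i_size >> huge_page_shift;	/* file size in huge pages */

	return idx >= size;				/* true -> caller returns -EFAULT */
}

int main(void)
{
	/* 4 MiB file with 2 MiB huge pages: indexes 0 and 1 are valid, 2 is not */
	assert(!idx_beyond_eof(4u << 20, 21, 1));
	assert(idx_beyond_eof(4u << 20, 21, 2));
	return 0;
}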
From b83d7e432399d44454411dec5c25afb5c4469e96 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Thu, 2 Nov 2017 15:59:34 -0700
Subject: [PATCH 2/7] mm, /proc/pid/pagemap: fix soft dirty marking for PMD migration entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the pagetable is walked in the implementation of /proc/pid/pagemap,
pmd_soft_dirty() is used for both the PMD huge page map and the PMD
migration entries. That is wrong; pmd_swp_soft_dirty() should be used
for PMD migration entries instead, because a different page table entry
flag is used. As a result, /proc/pid/pagemap may report incorrect soft
dirty information for PMD migration entries.

Link: http://lkml.kernel.org/r/20171017081818.31795-1-ying.huang@intel.com
Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
Signed-off-by: "Huang, Ying"
Acked-by: Kirill A. Shutemov
Acked-by: Naoya Horiguchi
Cc: Michal Hocko
Cc: David Rientjes
Cc: Arnd Bergmann
Cc: Hugh Dickins
Cc: "Jérôme Glisse"
Cc: Daniel Colascione
Cc: Zi Yan
Cc: Anshuman Khandual
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/task_mmu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 280282b05bc7..6744bd706ecf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1311,13 +1311,15 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		pmd_t pmd = *pmdp;
 		struct page *page = NULL;
 
-		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+		if (vma->vm_flags & VM_SOFTDIRTY)
 			flags |= PM_SOFT_DIRTY;
 
 		if (pmd_present(pmd)) {
 			page = pmd_page(pmd);
 
 			flags |= PM_PRESENT;
+			if (pmd_soft_dirty(pmd))
+				flags |= PM_SOFT_DIRTY;
 			if (pm->show_pfn)
 				frame = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1329,6 +1331,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 			frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
 			flags |= PM_SWAP;
+			if (pmd_swp_soft_dirty(pmd))
+				flags |= PM_SOFT_DIRTY;
 			VM_BUG_ON(!is_pmd_migration_entry(pmd));
 			page = migration_entry_to_page(entry);
 		}
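The corrected logic picks the soft-dirty accessor based on what the PMD
actually is. A condensed standalone model of that decision follows; the
function name and the boolean parameters are stand-ins for the kernel
predicates, only the PM_SOFT_DIRTY bit position is taken from the
pagemap documentation:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PM_SOFT_DIRTY	(1ULL << 55)	/* pagemap bit 55 (soft-dirty) */

/*
 * Condensed model of the fixed pagemap_pmd_range() decision: a present
 * PMD is tested with pmd_soft_dirty(), a PMD migration (swap) entry
 * with pmd_swp_soft_dirty(), because the soft-dirty bit sits in a
 * different position in the two entry formats.
 */
static uint64_t softdirty_flag(bool vma_softdirty, bool present,
			       bool pmd_sd, bool pmd_swp_sd)
{
	uint64_t flags = 0;

	if (vma_softdirty)
		flags |= PM_SOFT_DIRTY;
	if (present) {
		if (pmd_sd)		/* pmd_soft_dirty(pmd) */
			flags |= PM_SOFT_DIRTY;
	} else {
		if (pmd_swp_sd)		/* pmd_swp_soft_dirty(pmd) */
			flags |= PM_SOFT_DIRTY;
	}
	return flags;
}

int main(void)
{
	/* a soft-dirty PMD migration entry is now reported correctly */
	assert(softdirty_flag(false, false, false, true) == PM_SOFT_DIRTY);
	return 0;
}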
From 105ddc93f06ebe3e553f58563d11ed63dbcd59f0 Mon Sep 17 00:00:00 2001
From: Ashish Samant
Date: Thu, 2 Nov 2017 15:59:37 -0700
Subject: [PATCH 3/7] ocfs2: fstrim: Fix start offset of first cluster group during fstrim

The first cluster group descriptor is not stored at the start of the
group but at an offset from the start. We need to take this into account
while doing fstrim on the first cluster group. Otherwise we will wrongly
start fstrim a few blocks after the desired start block, and the range
can cross over into the next cluster group and zero out the group
descriptor there. This can cause filesystem corruption that cannot be
fixed by fsck.

Link: http://lkml.kernel.org/r/1507835579-7308-1-git-send-email-ashish.samant@oracle.com
Signed-off-by: Ashish Samant
Reviewed-by: Junxiao Bi
Reviewed-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/alloc.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a177eae3aa1a..addd7c5f2d3e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7304,13 +7304,24 @@ out:
 static int ocfs2_trim_extent(struct super_block *sb,
 			     struct ocfs2_group_desc *gd,
-			     u32 start, u32 count)
+			     u64 group, u32 start, u32 count)
 {
 	u64 discard, bcount;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
 
 	bcount = ocfs2_clusters_to_blocks(sb, count);
-	discard = le64_to_cpu(gd->bg_blkno) +
-			ocfs2_clusters_to_blocks(sb, start);
+	discard = ocfs2_clusters_to_blocks(sb, start);
+
+	/*
+	 * For the first cluster group, the gd->bg_blkno is not at the start
+	 * of the group, but at an offset from the start. If we add it while
+	 * calculating discard for first group, we will wrongly start fstrim a
+	 * few blocks after the desired start block and the range can cross
+	 * over into the next cluster group. So, add it only if this is not
+	 * the first cluster group.
+	 */
+	if (group != osb->first_cluster_group_blkno)
+		discard += le64_to_cpu(gd->bg_blkno);
 
 	trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
 
@@ -7318,7 +7329,7 @@ static int ocfs2_trim_extent(struct super_block *sb,
 }
 
 static int ocfs2_trim_group(struct super_block *sb,
-			    struct ocfs2_group_desc *gd,
+			    struct ocfs2_group_desc *gd, u64 group,
 			    u32 start, u32 max, u32 minbits)
 {
 	int ret = 0, count = 0, next;
@@ -7337,7 +7348,7 @@ static int ocfs2_trim_group(struct super_block *sb,
 		next = ocfs2_find_next_bit(bitmap, max, start);
 
 		if ((next - start) >= minbits) {
-			ret = ocfs2_trim_extent(sb, gd,
+			ret = ocfs2_trim_extent(sb, gd, group,
 						start, next - start);
 			if (ret < 0) {
 				mlog_errno(ret);
@@ -7435,7 +7446,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 		}
 
 		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
-		cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+		cnt = ocfs2_trim_group(sb, gd, group,
+				       first_bit, last_bit, minlen);
 		brelse(gd_bh);
 		gd_bh = NULL;
 		if (cnt < 0) {
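In block terms the change is a small arithmetic difference. Here is a
simplified standalone sketch of the discard start calculation that the
patch implements; parameter names are abbreviated and the numbers in
main() are invented purely for the example:

#include <stdint.h>
#include <stdio.h>

/*
 * Simplified model of ocfs2_trim_extent()'s start-block math after the
 * fix: for the first cluster group the descriptor block (bg_blkno) is
 * inside the group, so adding it would push the trim range past the
 * requested start and possibly into the next group; for every other
 * group it is the group start and must be added.
 */
static uint64_t trim_start_block(uint64_t group, uint64_t first_group,
				 uint64_t bg_blkno, uint64_t start_blocks)
{
	uint64_t discard = start_blocks;	/* ocfs2_clusters_to_blocks(sb, start) */

	if (group != first_group)		/* not the first cluster group */
		discard += bg_blkno;		/* le64_to_cpu(gd->bg_blkno) */
	return discard;
}

int main(void)
{
	/* made-up block numbers, only to show the first-group special case */
	printf("first group: %llu\n",
	       (unsigned long long)trim_start_block(64, 64, 64, 128));	/* 128, not 192 */
	printf("later group: %llu\n",
	       (unsigned long long)trim_start_block(4096, 64, 4096, 128));	/* 4224 */
	return 0;
}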
From ab615a5b879292e83653be60aa82113f7c6f462d Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Thu, 2 Nov 2017 15:59:41 -0700
Subject: [PATCH 4/7] fs/hugetlbfs/inode.c: fix hwpoison reserve accounting

Calling madvise(MADV_HWPOISON) on a hugetlbfs page will result in bad
(negative) reserved huge page counts. This may not happen immediately,
but may happen later when the underlying file is removed or the
filesystem is unmounted. For example:

  AnonHugePages:         0 kB
  ShmemHugePages:        0 kB
  HugePages_Total:       1
  HugePages_Free:        0
  HugePages_Rsvd:    18446744073709551615
  HugePages_Surp:        0
  Hugepagesize:       2048 kB

In routine hugetlbfs_error_remove_page(), hugetlb_fix_reserve_counts is
called after remove_huge_page. hugetlb_fix_reserve_counts is designed to
be called only if a failure is returned from hugetlb_unreserve_pages.
Therefore, call hugetlb_unreserve_pages as required and only call
hugetlb_fix_reserve_counts in the unlikely event that
hugetlb_unreserve_pages returns an error.

Link: http://lkml.kernel.org/r/20171019230007.17043-2-mike.kravetz@oracle.com
Fixes: 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error")
Signed-off-by: Mike Kravetz
Acked-by: Naoya Horiguchi
Cc: Michal Hocko
Cc: Aneesh Kumar
Cc: Anshuman Khandual
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hugetlbfs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 59073e9f01a4..ed113ea17aff 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -842,9 +842,12 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping,
 				  struct page *page)
 {
 	struct inode *inode = mapping->host;
+	pgoff_t index = page->index;
 
 	remove_huge_page(page);
-	hugetlb_fix_reserve_counts(inode);
+	if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
+		hugetlb_fix_reserve_counts(inode);
+
 	return 0;
 }
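The huge HugePages_Rsvd value in the report is simply an unsigned reserve
counter that was decremented past zero. A one-line illustration of that
wraparound (assuming a 64-bit unsigned long, as on x86-64):

#include <stdio.h>

int main(void)
{
	/* the reserve count is unsigned; decrementing it below zero wraps */
	unsigned long rsvd = 0;

	rsvd -= 1;
	printf("HugePages_Rsvd: %lu\n", rsvd);	/* 18446744073709551615 on 64-bit */
	return 0;
}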
From e08b1877452a0055c65f6394163ce5a0fbd720a3 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Thu, 2 Nov 2017 15:59:44 -0700
Subject: [PATCH 5/7] initramfs: fix initramfs rebuilds w/ compression after disabling

This is a follow-up to commit 57ddfdaa9a72 ("initramfs: fix disabling of
initramfs (and its compression)"). That commit fixed the use case where
we build the kernel with an initramfs with no compression, and then we
build the kernel with no initramfs.

This still left us with the case described here:

  http://lkml.kernel.org/r/20170521033337.6197-1-f.fainelli@gmail.com

not working with initramfs compression. This can be seen with the
following steps/timestamps:

  https://www.spinics.net/lists/kernel/msg2598153.html

.initramfs_data.cpio.gz.cmd is correct:

  cmd_usr/initramfs_data.cpio.gz := /bin/bash ./scripts/gen_initramfs_list.sh -o usr/initramfs_data.cpio.gz -u 1000 -g 1000 /home/fainelli/work/uclinux-rootfs/romfs /home/fainelli/work/uclinux-rootfs/misc/initramfs.dev

and was generated the first time we generated the gzip initramfs. Since
neither the command nor its arguments have changed, we just don't call
it, and no initramfs cpio is re-generated as a consequence.

The fix for this problem is to properly keep track of the
.initramfs_data.cpio.d file by suffixing it with the compression
extension. This takes care of properly tracking dependencies such that
the initramfs gets (re)generated any time files are added/deleted etc.

Link: http://lkml.kernel.org/r/20170930033936.6722-1-f.fainelli@gmail.com
Fixes: db2aa7fd15e8 ("initramfs: allow again choice of the embedded initramfs compression algorithm")
Fixes: 9e3596b0c653 ("kbuild: initramfs cleanup, set target from Kconfig")
Signed-off-by: Florian Fainelli
Cc: "Francisco Blas Izquierdo Riera (klondike)"
Cc: Nicholas Piggin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 usr/Makefile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/usr/Makefile b/usr/Makefile
index 34a9fcd0f537..237a028693ce 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -8,6 +8,7 @@ PHONY += klibcdirs
 
 suffix_y = $(subst $\",,$(CONFIG_INITRAMFS_COMPRESSION))
 datafile_y = initramfs_data.cpio$(suffix_y)
+datafile_d_y = .$(datafile_y).d
 
 AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)"
 
@@ -30,12 +31,12 @@ ramfs-args := \
         $(if $(CONFIG_INITRAMFS_ROOT_UID), -u $(CONFIG_INITRAMFS_ROOT_UID)) \
         $(if $(CONFIG_INITRAMFS_ROOT_GID), -g $(CONFIG_INITRAMFS_ROOT_GID))
 
-# .initramfs_data.cpio.d is used to identify all files included
+# $(datafile_d_y) is used to identify all files included
 # in initramfs and to detect if any files are added/removed.
 # Removed files are identified by directory timestamp being updated
 # The dependency list is generated by gen_initramfs.sh -l
-ifneq ($(wildcard $(obj)/.initramfs_data.cpio.d),)
-	include $(obj)/.initramfs_data.cpio.d
+ifneq ($(wildcard $(obj)/$(datafile_d_y)),)
+	include $(obj)/$(datafile_d_y)
 endif
 
 quiet_cmd_initfs = GEN     $@
@@ -53,5 +54,5 @@ $(deps_initramfs): klibcdirs
 # 3) If gen_init_cpio are newer than initramfs_data.cpio
 # 4) arguments to gen_initramfs.sh changes
 $(obj)/$(datafile_y): $(obj)/gen_init_cpio $(deps_initramfs) klibcdirs
-	$(Q)$(initramfs) -l $(ramfs-input) > $(obj)/.initramfs_data.cpio.d
+	$(Q)$(initramfs) -l $(ramfs-input) > $(obj)/$(datafile_d_y)
 	$(call if_changed,initfs)
From dd8a67f9a37c74b61e5e050924ceec9ffb4f8c3c Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Thu, 2 Nov 2017 15:59:47 -0700
Subject: [PATCH 6/7] mm/huge_memory.c: deposit page table when copying a PMD migration entry

We need to deposit a pre-allocated PTE page table when a PMD migration
entry is copied in copy_huge_pmd(). Otherwise, we will leak the
pre-allocated page and cause a NULL pointer dereference later in
zap_huge_pmd().

The missing counter updates during the PMD migration entry copy are
added as well.

The bug report is here: https://lkml.org/lkml/2017/10/29/214

Link: http://lkml.kernel.org/r/20171030144636.4836-1-zi.yan@sent.com
Fixes: 84c3fc4e9c563 ("mm: thp: check pmd migration entry in common path")
Signed-off-by: Zi Yan
Reported-by: Fengguang Wu
Acked-by: Kirill A. Shutemov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 269b5df58543..1981ed697dab 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -941,6 +941,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			pmd = pmd_swp_mksoft_dirty(pmd);
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
+		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+		atomic_long_inc(&dst_mm->nr_ptes);
+		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 		ret = 0;
 		goto out_unlock;
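The invariant the fix restores is that every huge PMD entry in a process,
including a migration entry copied at fork, has a pre-allocated PTE page
table deposited behind it, because the zap path unconditionally withdraws
and frees one. A toy userspace model of that pairing, not kernel code;
every type and function name here is invented, only the commented kernel
calls are real:

#include <assert.h>
#include <stdlib.h>

/* toy stand-ins for the destination mm and its deposited PTE table */
struct toy_mm { void *deposited_pgtable; long nr_ptes; long anon_pages; };

static void deposit(struct toy_mm *mm, void *pgtable) { mm->deposited_pgtable = pgtable; }
static void *withdraw(struct toy_mm *mm) { void *p = mm->deposited_pgtable; mm->deposited_pgtable = NULL; return p; }

/* copy path after the fix: account the pages and deposit the table */
static void copy_migration_pmd(struct toy_mm *dst, void *pgtable, long hpage_nr)
{
	dst->anon_pages += hpage_nr;	/* add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR) */
	dst->nr_ptes++;			/* atomic_long_inc(&dst_mm->nr_ptes) */
	deposit(dst, pgtable);		/* pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable) */
}

/* zap path: unconditionally expects a deposited table to free */
static void zap_pmd(struct toy_mm *mm)
{
	void *pgtable = withdraw(mm);

	assert(pgtable != NULL);	/* without the deposit this is the NULL dereference */
	free(pgtable);
	mm->nr_ptes--;
}

int main(void)
{
	struct toy_mm child = { 0 };

	copy_migration_pmd(&child, malloc(4096), 512);	/* 512 == HPAGE_PMD_NR for 2 MiB pages on x86-64 */
	zap_pmd(&child);
	return 0;
}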
From 2628bd6fc052bd85e9864dae4de494d8a6313391 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Thu, 2 Nov 2017 15:59:50 -0700
Subject: [PATCH 7/7] mm, swap: fix race between swap count continuation operations

One page may store a set of entries of the sis->swap_map
(swap_info_struct->swap_map) in multiple swap clusters. If some of the
entries have sis->swap_map[offset] > SWAP_MAP_MAX, multiple pages will be
used to store the set of entries of the sis->swap_map, and the pages are
linked with page->lru. This is called swap count continuation.

To access the pages which store the set of entries of the sis->swap_map
simultaneously, previously, sis->lock was used. But to improve the
scalability of __swap_duplicate(), the swap cluster lock may be used in
swap_count_continued() now. This may race with
add_swap_count_continuation(), which operates on a nearby swap cluster
whose sis->swap_map entries are stored in the same page. The race can
cause a wrong swap count in practice, and thus unfreeable swap entries,
software lockups, etc.

To fix the race, a new spinlock called cont_lock is added to struct
swap_info_struct to protect the swap count continuation page list. This
is a lock at the swap device level, so its scalability isn't great. But
it is still much better than the original sis->lock, because it is only
acquired/released when swap count continuation is used, which is rare in
practice. If it turns out that the scalability becomes an issue for some
workloads, we can split the lock into more fine-grained locks.

Link: http://lkml.kernel.org/r/20171017081320.28133-1-ying.huang@intel.com
Fixes: 235b62176712 ("mm/swap: add cluster lock")
Signed-off-by: "Huang, Ying"
Cc: Johannes Weiner
Cc: Shaohua Li
Cc: Tim Chen
Cc: Michal Hocko
Cc: Aaron Lu
Cc: Dave Hansen
Cc: Andi Kleen
Cc: Minchan Kim
Cc: Hugh Dickins
Cc: [4.11+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/swap.h |  4 ++++
 mm/swapfile.c        | 23 +++++++++++++++++------
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index b489bd77bbdc..f02fb5db8914 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -266,6 +266,10 @@ struct swap_info_struct {
 					 * both locks need hold, hold swap_lock
 					 * first.
 					 */
+	spinlock_t cont_lock;		/*
+					 * protect swap count continuation page
+					 * list.
+					 */
 	struct work_struct discard_work; /* discard worker */
 	struct swap_cluster_list discard_clusters; /* discard clusters list */
 };
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bf91dc9e7a79..e47a21e64764 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2869,6 +2869,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
 	spin_lock_init(&p->lock);
+	spin_lock_init(&p->cont_lock);
 
 	return p;
 }
@@ -3545,6 +3546,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	head = vmalloc_to_page(si->swap_map + offset);
 	offset &= ~PAGE_MASK;
 
+	spin_lock(&si->cont_lock);
 	/*
 	 * Page allocation does not initialize the page's lru field,
 	 * but it does always reset its private field.
@@ -3564,7 +3566,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 		 * a continuation page, free our allocation and use this one.
 		 */
 		if (!(count & COUNT_CONTINUED))
-			goto out;
+			goto out_unlock_cont;
 
 		map = kmap_atomic(list_page) + offset;
 		count = *map;
@@ -3575,11 +3577,13 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 		 * free our allocation and use this one.
 		 */
 		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
-			goto out;
+			goto out_unlock_cont;
 	}
 	list_add_tail(&page->lru, &head->lru);
 	page = NULL;			/* now it's attached, don't free it */
+out_unlock_cont:
+	spin_unlock(&si->cont_lock);
 out:
 	unlock_cluster(ci);
 	spin_unlock(&si->lock);
@@ -3604,6 +3608,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 	struct page *head;
 	struct page *page;
 	unsigned char *map;
+	bool ret;
 
 	head = vmalloc_to_page(si->swap_map + offset);
 	if (page_private(head) != SWP_CONTINUED) {
@@ -3611,6 +3616,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		return false;	/* need to add count continuation */
 	}
 
+	spin_lock(&si->cont_lock);
 	offset &= ~PAGE_MASK;
 	page = list_entry(head->lru.next, struct page, lru);
 	map = kmap_atomic(page) + offset;
@@ -3631,8 +3637,10 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		if (*map == SWAP_CONT_MAX) {
 			kunmap_atomic(map);
 			page = list_entry(page->lru.next, struct page, lru);
-			if (page == head)
-				return false;	/* add count continuation */
+			if (page == head) {
+				ret = false;	/* add count continuation */
+				goto out;
+			}
 			map = kmap_atomic(page) + offset;
 init_map:	*map = 0;		/* we didn't zero the page */
 		}
@@ -3645,7 +3653,7 @@ init_map:	*map = 0;		/* we didn't zero the page */
 			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
-		return true;			/* incremented */
+		ret = true;			/* incremented */
 	} else {				/* decrementing */
 		/*
@@ -3671,8 +3679,11 @@ init_map:	*map = 0;		/* we didn't zero the page */
 			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
-		return count == COUNT_CONTINUED;
+		ret = count == COUNT_CONTINUED;
 	}
+out:
+	spin_unlock(&si->cont_lock);
+	return ret;
 }
 
 /*
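The reason two different cluster locks can end up guarding the same
continuation list is that the swap_map is one byte per entry, so a single
page of it spans many clusters. A small arithmetic sketch of that ratio,
assuming a 4 KiB page size and the SWAPFILE_CLUSTER value used in
mm/swapfile.c:

#include <stdio.h>

#define PAGE_SIZE		4096	/* bytes, typical */
#define SWAPFILE_CLUSTER	256	/* swap entries per cluster */

int main(void)
{
	/*
	 * swap_map is one unsigned char per swap entry, so one page of it
	 * (and the continuation pages hanging off it) covers PAGE_SIZE
	 * entries, i.e. several clusters with distinct cluster locks.
	 * Hence the per-device cont_lock.
	 */
	printf("clusters sharing one continuation page: %d\n",
	       PAGE_SIZE / SWAPFILE_CLUSTER);	/* 16 */
	return 0;
}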