From 1e3921471354244f70fe268586ff94a97a6dd4df Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Thu, 2 Nov 2017 15:59:29 -0700
Subject: [PATCH 1/7] userfaultfd: hugetlbfs: prevent UFFDIO_COPY to fill beyond the end of i_size

This oops:

  kernel BUG at fs/hugetlbfs/inode.c:484!
  RIP: remove_inode_hugepages+0x3d0/0x410
  Call Trace:
    hugetlbfs_setattr+0xd9/0x130
    notify_change+0x292/0x410
    do_truncate+0x65/0xa0
    do_sys_ftruncate.constprop.3+0x11a/0x180
    SyS_ftruncate+0xe/0x10
    tracesys+0xd9/0xde

was caused by the lack of an i_size check in hugetlb_mcopy_atomic_pte.

mmap() can still succeed beyond the end of the i_size after vmtruncate
zapped vmas in those ranges, but the faults must not succeed, and that
includes UFFDIO_COPY.

We could differentiate the retval to userland to represent a SIGBUS like
a page fault would do (vs SIGSEGV), but it doesn't seem very useful and
we'd need to pick a random retval, as there's no meaningful syscall retval
that would differentiate SIGSEGV from SIGBUS; there's just -EFAULT.

Link: http://lkml.kernel.org/r/20171016223914.2421-2-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli
Reviewed-by: Mike Kravetz
Cc: Mike Rapoport
Cc: "Dr. David Alan Gilbert"
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 424b0ef08a60..2d2ff5e8bf2b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3984,6 +3984,9 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    unsigned long src_addr,
 			    struct page **pagep)
 {
+	struct address_space *mapping;
+	pgoff_t idx;
+	unsigned long size;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	struct hstate *h = hstate_vma(dst_vma);
 	pte_t _dst_pte;
@@ -4021,13 +4024,24 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	__SetPageUptodate(page);
 	set_page_huge_active(page);
 
+	mapping = dst_vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
 	/*
 	 * If shared, add to page cache
 	 */
 	if (vm_shared) {
-		struct address_space *mapping = dst_vma->vm_file->f_mapping;
-		pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+		size = i_size_read(mapping->host) >> huge_page_shift(h);
+		ret = -EFAULT;
+		if (idx >= size)
+			goto out_release_nounlock;
+		/*
+		 * Serialization between remove_inode_hugepages() and
+		 * huge_add_to_page_cache() below happens through the
+		 * hugetlb_fault_mutex_table that here must be held by
+		 * the caller.
+		 */
 		ret = huge_add_to_page_cache(page, mapping, idx);
 		if (ret)
 			goto out_release_nounlock;
@@ -4036,6 +4050,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
 	spin_lock(ptl);
 
+	/*
+	 * Recheck the i_size after holding PT lock to make sure not
+	 * to leave any page mapped (as page_mapped()) beyond the end
+	 * of the i_size (remove_inode_hugepages() is strict about
+	 * enforcing that). If we bail out here, we'll also leave a
+	 * page in the radix tree in the vm_shared case beyond the end
+	 * of the i_size, but remove_inode_hugepages() will take care
+	 * of it as soon as we drop the hugetlb_fault_mutex_table.
+	 */
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
+	ret = -EFAULT;
+	if (idx >= size)
+		goto out_release_unlock;
+
 	ret = -EEXIST;
 	if (!huge_pte_none(huge_ptep_get(dst_pte)))
 		goto out_release_unlock;
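For illustration, the new check boils down to a huge-page-granular bounds
test against i_size. Below is a minimal standalone sketch of that test,
not kernel code; the helper name and the numbers in main() are made up:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the bounds test the patch adds: the target huge-page index
 * must lie below i_size expressed in huge pages, otherwise the
 * UFFDIO_COPY path bails out with -EFAULT instead of instantiating a
 * page past EOF.
 */
static bool idx_beyond_eof(uint64_t i_size, unsigned int huge_page_shift, uint64_t idx)
{
	uint64_t size = i_size >> huge_page_shift;	/* file size in huge pages */

	return idx >= size;				/* true -> caller returns -EFAULT */
}

int main(void)
{
	/* 4 MiB file with 2 MiB huge pages: indexes 0 and 1 are valid, 2 is not */
	assert(!idx_beyond_eof(4u << 20, 21, 1));
	assert(idx_beyond_eof(4u << 20, 21, 2));
	return 0;
}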
From b83d7e432399d44454411dec5c25afb5c4469e96 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Thu, 2 Nov 2017 15:59:34 -0700
Subject: [PATCH 2/7] mm, /proc/pid/pagemap: fix soft dirty marking for PMD migration entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the pagetable is walked in the implementation of /proc/pid/pagemap,
pmd_soft_dirty() is used for both the PMD huge page map and the PMD
migration entries. That is wrong; pmd_swp_soft_dirty() should be used
for PMD migration entries instead, because a different page table entry
flag is used. As a result, /proc/pid/pagemap may report incorrect soft
dirty information for PMD migration entries.

Link: http://lkml.kernel.org/r/20171017081818.31795-1-ying.huang@intel.com
Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
Signed-off-by: "Huang, Ying"
Acked-by: Kirill A. Shutemov
Acked-by: Naoya Horiguchi
Cc: Michal Hocko
Cc: David Rientjes
Cc: Arnd Bergmann
Cc: Hugh Dickins
Cc: "Jérôme Glisse"
Cc: Daniel Colascione
Cc: Zi Yan
Cc: Anshuman Khandual
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/task_mmu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 280282b05bc7..6744bd706ecf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1311,13 +1311,15 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		pmd_t pmd = *pmdp;
 		struct page *page = NULL;
 
-		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+		if (vma->vm_flags & VM_SOFTDIRTY)
 			flags |= PM_SOFT_DIRTY;
 
 		if (pmd_present(pmd)) {
 			page = pmd_page(pmd);
 
 			flags |= PM_PRESENT;
+			if (pmd_soft_dirty(pmd))
+				flags |= PM_SOFT_DIRTY;
 			if (pm->show_pfn)
 				frame = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1329,6 +1331,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 			frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
 			flags |= PM_SWAP;
+			if (pmd_swp_soft_dirty(pmd))
+				flags |= PM_SOFT_DIRTY;
 			VM_BUG_ON(!is_pmd_migration_entry(pmd));
 			page = migration_entry_to_page(entry);
 		}
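The corrected logic picks the soft-dirty accessor based on what the PMD
actually is. A condensed standalone model of that decision follows; the
function name and the boolean parameters are stand-ins for the kernel
predicates, only the PM_SOFT_DIRTY bit position is taken from the
pagemap documentation:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PM_SOFT_DIRTY	(1ULL << 55)	/* pagemap bit 55 (soft-dirty) */

/*
 * Condensed model of the fixed pagemap_pmd_range() decision: a present
 * PMD is tested with pmd_soft_dirty(), a PMD migration (swap) entry
 * with pmd_swp_soft_dirty(), because the soft-dirty bit sits in a
 * different position in the two entry formats.
 */
static uint64_t softdirty_flag(bool vma_softdirty, bool present,
			       bool pmd_sd, bool pmd_swp_sd)
{
	uint64_t flags = 0;

	if (vma_softdirty)
		flags |= PM_SOFT_DIRTY;
	if (present) {
		if (pmd_sd)		/* pmd_soft_dirty(pmd) */
			flags |= PM_SOFT_DIRTY;
	} else {
		if (pmd_swp_sd)		/* pmd_swp_soft_dirty(pmd) */
			flags |= PM_SOFT_DIRTY;
	}
	return flags;
}

int main(void)
{
	/* a soft-dirty PMD migration entry is now reported correctly */
	assert(softdirty_flag(false, false, false, true) == PM_SOFT_DIRTY);
	return 0;
}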
From 105ddc93f06ebe3e553f58563d11ed63dbcd59f0 Mon Sep 17 00:00:00 2001
From: Ashish Samant
Date: Thu, 2 Nov 2017 15:59:37 -0700
Subject: [PATCH 3/7] ocfs2: fstrim: Fix start offset of first cluster group during fstrim

The first cluster group descriptor is not stored at the start of the
group but at an offset from the start. We need to take this into account
while doing fstrim on the first cluster group. Otherwise we will wrongly
start fstrim a few blocks after the desired start block, and the range
can cross over into the next cluster group and zero out the group
descriptor there. This can cause filesystem corruption that cannot be
fixed by fsck.

Link: http://lkml.kernel.org/r/1507835579-7308-1-git-send-email-ashish.samant@oracle.com
Signed-off-by: Ashish Samant
Reviewed-by: Junxiao Bi
Reviewed-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/alloc.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a177eae3aa1a..addd7c5f2d3e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7304,13 +7304,24 @@ out:
 static int ocfs2_trim_extent(struct super_block *sb,
 			     struct ocfs2_group_desc *gd,
-			     u32 start, u32 count)
+			     u64 group, u32 start, u32 count)
 {
 	u64 discard, bcount;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
 
 	bcount = ocfs2_clusters_to_blocks(sb, count);
-	discard = le64_to_cpu(gd->bg_blkno) +
-			ocfs2_clusters_to_blocks(sb, start);
+	discard = ocfs2_clusters_to_blocks(sb, start);
+
+	/*
+	 * For the first cluster group, the gd->bg_blkno is not at the start
+	 * of the group, but at an offset from the start. If we add it while
+	 * calculating discard for first group, we will wrongly start fstrim a
+	 * few blocks after the desired start block and the range can cross
+	 * over into the next cluster group. So, add it only if this is not
+	 * the first cluster group.
+	 */
+	if (group != osb->first_cluster_group_blkno)
+		discard += le64_to_cpu(gd->bg_blkno);
 
 	trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
 
@@ -7318,7 +7329,7 @@ static int ocfs2_trim_extent(struct super_block *sb,
 }
 
 static int ocfs2_trim_group(struct super_block *sb,
-			    struct ocfs2_group_desc *gd,
+			    struct ocfs2_group_desc *gd, u64 group,
 			    u32 start, u32 max, u32 minbits)
 {
 	int ret = 0, count = 0, next;
@@ -7337,7 +7348,7 @@ static int ocfs2_trim_group(struct super_block *sb,
 		next = ocfs2_find_next_bit(bitmap, max, start);
 
 		if ((next - start) >= minbits) {
-			ret = ocfs2_trim_extent(sb, gd,
+			ret = ocfs2_trim_extent(sb, gd, group,
 						start, next - start);
 			if (ret < 0) {
 				mlog_errno(ret);
@@ -7435,7 +7446,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 		}
 
 		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
-		cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+		cnt = ocfs2_trim_group(sb, gd, group,
+				       first_bit, last_bit, minlen);
 		brelse(gd_bh);
 		gd_bh = NULL;
 		if (cnt < 0) {
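In block terms the change is a small arithmetic difference. Here is a
simplified standalone sketch of the discard start calculation that the
patch implements; parameter names are abbreviated and the numbers in
main() are invented purely for the example:

#include <stdint.h>
#include <stdio.h>

/*
 * Simplified model of ocfs2_trim_extent()'s start-block math after the
 * fix: for the first cluster group the descriptor block (bg_blkno) is
 * inside the group, so adding it would push the trim range past the
 * requested start and possibly into the next group; for every other
 * group it is the group start and must be added.
 */
static uint64_t trim_start_block(uint64_t group, uint64_t first_group,
				 uint64_t bg_blkno, uint64_t start_blocks)
{
	uint64_t discard = start_blocks;	/* ocfs2_clusters_to_blocks(sb, start) */

	if (group != first_group)		/* not the first cluster group */
		discard += bg_blkno;		/* le64_to_cpu(gd->bg_blkno) */
	return discard;
}

int main(void)
{
	/* made-up block numbers, only to show the first-group special case */
	printf("first group: %llu\n",
	       (unsigned long long)trim_start_block(64, 64, 64, 128));	/* 128, not 192 */
	printf("later group: %llu\n",
	       (unsigned long long)trim_start_block(4096, 64, 4096, 128));	/* 4224 */
	return 0;
}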
From ab615a5b879292e83653be60aa82113f7c6f462d Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Thu, 2 Nov 2017 15:59:41 -0700
Subject: [PATCH 4/7] fs/hugetlbfs/inode.c: fix hwpoison reserve accounting

Calling madvise(MADV_HWPOISON) on a hugetlbfs page will result in bad
(negative) reserved huge page counts. This may not happen immediately,
but may happen later when the underlying file is removed or the
filesystem is unmounted. For example:

  AnonHugePages:         0 kB
  ShmemHugePages:        0 kB
  HugePages_Total:       1
  HugePages_Free:        0
  HugePages_Rsvd:    18446744073709551615
  HugePages_Surp:        0
  Hugepagesize:       2048 kB

In routine hugetlbfs_error_remove_page(), hugetlb_fix_reserve_counts is
called after remove_huge_page. hugetlb_fix_reserve_counts is designed to
be called only if a failure is returned from hugetlb_unreserve_pages.
Therefore, call hugetlb_unreserve_pages as required and only call
hugetlb_fix_reserve_counts in the unlikely event that
hugetlb_unreserve_pages returns an error.

Link: http://lkml.kernel.org/r/20171019230007.17043-2-mike.kravetz@oracle.com
Fixes: 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error")
Signed-off-by: Mike Kravetz
Acked-by: Naoya Horiguchi
Cc: Michal Hocko
Cc: Aneesh Kumar
Cc: Anshuman Khandual
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hugetlbfs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 59073e9f01a4..ed113ea17aff 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -842,9 +842,12 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping,
 				  struct page *page)
 {
 	struct inode *inode = mapping->host;
+	pgoff_t index = page->index;
 
 	remove_huge_page(page);
-	hugetlb_fix_reserve_counts(inode);
+	if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
+		hugetlb_fix_reserve_counts(inode);
+
 	return 0;
 }
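The huge HugePages_Rsvd value in the report is simply an unsigned reserve
counter that was decremented past zero. A one-line illustration of that
wraparound (assuming a 64-bit unsigned long, as on x86-64):

#include <stdio.h>

int main(void)
{
	/* the reserve count is unsigned; decrementing it below zero wraps */
	unsigned long rsvd = 0;

	rsvd -= 1;
	printf("HugePages_Rsvd: %lu\n", rsvd);	/* 18446744073709551615 on 64-bit */
	return 0;
}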
From e08b1877452a0055c65f6394163ce5a0fbd720a3 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Thu, 2 Nov 2017 15:59:44 -0700
Subject: [PATCH 5/7] initramfs: fix initramfs rebuilds w/ compression after disabling

This is a follow-up to commit 57ddfdaa9a72 ("initramfs: fix disabling of
initramfs (and its compression)"). That commit fixed the use case where
we build the kernel with an initramfs with no compression, and then we
build the kernel with no initramfs.

This still left us with the case described here:

  http://lkml.kernel.org/r/20170521033337.6197-1-f.fainelli@gmail.com

not working with initramfs compression. This can be seen with the
following steps/timestamps:

  https://www.spinics.net/lists/kernel/msg2598153.html

.initramfs_data.cpio.gz.cmd is correct:

  cmd_usr/initramfs_data.cpio.gz := /bin/bash ./scripts/gen_initramfs_list.sh -o usr/initramfs_data.cpio.gz -u 1000 -g 1000 /home/fainelli/work/uclinux-rootfs/romfs /home/fainelli/work/uclinux-rootfs/misc/initramfs.dev

and was generated the first time we generated the gzip initramfs. Since
neither the command nor its arguments have changed, we just don't call
it, and no initramfs cpio is re-generated as a consequence.

The fix for this problem is to properly keep track of the
.initramfs_data.cpio.d file by suffixing it with the compression
extension. This takes care of properly tracking dependencies such that
the initramfs gets (re)generated any time files are added/deleted etc.

Link: http://lkml.kernel.org/r/20170930033936.6722-1-f.fainelli@gmail.com
Fixes: db2aa7fd15e8 ("initramfs: allow again choice of the embedded initramfs compression algorithm")
Fixes: 9e3596b0c653 ("kbuild: initramfs cleanup, set target from Kconfig")
Signed-off-by: Florian Fainelli
Cc: "Francisco Blas Izquierdo Riera (klondike)"
Cc: Nicholas Piggin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 usr/Makefile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/usr/Makefile b/usr/Makefile
index 34a9fcd0f537..237a028693ce 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -8,6 +8,7 @@ PHONY += klibcdirs
 
 suffix_y = $(subst $\",,$(CONFIG_INITRAMFS_COMPRESSION))
 datafile_y = initramfs_data.cpio$(suffix_y)
+datafile_d_y = .$(datafile_y).d
 
 AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)"
 
@@ -30,12 +31,12 @@ ramfs-args := \
         $(if $(CONFIG_INITRAMFS_ROOT_UID), -u $(CONFIG_INITRAMFS_ROOT_UID)) \
         $(if $(CONFIG_INITRAMFS_ROOT_GID), -g $(CONFIG_INITRAMFS_ROOT_GID))
 
-# .initramfs_data.cpio.d is used to identify all files included
+# $(datafile_d_y) is used to identify all files included
 # in initramfs and to detect if any files are added/removed.
 # Removed files are identified by directory timestamp being updated
 # The dependency list is generated by gen_initramfs.sh -l
-ifneq ($(wildcard $(obj)/.initramfs_data.cpio.d),)
-	include $(obj)/.initramfs_data.cpio.d
+ifneq ($(wildcard $(obj)/$(datafile_d_y)),)
+	include $(obj)/$(datafile_d_y)
 endif
 
 quiet_cmd_initfs = GEN     $@
@@ -53,5 +54,5 @@ $(deps_initramfs): klibcdirs
 # 3) If gen_init_cpio are newer than initramfs_data.cpio
 # 4) arguments to gen_initramfs.sh changes
 $(obj)/$(datafile_y): $(obj)/gen_init_cpio $(deps_initramfs) klibcdirs
-	$(Q)$(initramfs) -l $(ramfs-input) > $(obj)/.initramfs_data.cpio.d
+	$(Q)$(initramfs) -l $(ramfs-input) > $(obj)/$(datafile_d_y)
 	$(call if_changed,initfs)
From dd8a67f9a37c74b61e5e050924ceec9ffb4f8c3c Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Thu, 2 Nov 2017 15:59:47 -0700
Subject: [PATCH 6/7] mm/huge_memory.c: deposit page table when copying a PMD migration entry

We need to deposit a pre-allocated PTE page table when a PMD migration
entry is copied in copy_huge_pmd(). Otherwise, we will leak the
pre-allocated page and cause a NULL pointer dereference later in
zap_huge_pmd().

The missing counter updates during the PMD migration entry copy are
added as well.

The bug report is here: https://lkml.org/lkml/2017/10/29/214

Link: http://lkml.kernel.org/r/20171030144636.4836-1-zi.yan@sent.com
Fixes: 84c3fc4e9c563 ("mm: thp: check pmd migration entry in common path")
Signed-off-by: Zi Yan
Reported-by: Fengguang Wu
Acked-by: Kirill A. Shutemov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 269b5df58543..1981ed697dab 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -941,6 +941,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			pmd = pmd_swp_mksoft_dirty(pmd);
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
+		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+		atomic_long_inc(&dst_mm->nr_ptes);
+		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 		ret = 0;
 		goto out_unlock;
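The invariant the fix restores is that every huge PMD entry in a process,
including a migration entry copied at fork, has a pre-allocated PTE page
table deposited behind it, because the zap path unconditionally withdraws
and frees one. A toy userspace model of that pairing, not kernel code;
every type and function name here is invented, only the commented kernel
calls are real:

#include <assert.h>
#include <stdlib.h>

/* toy stand-ins for the destination mm and its deposited PTE table */
struct toy_mm { void *deposited_pgtable; long nr_ptes; long anon_pages; };

static void deposit(struct toy_mm *mm, void *pgtable) { mm->deposited_pgtable = pgtable; }
static void *withdraw(struct toy_mm *mm) { void *p = mm->deposited_pgtable; mm->deposited_pgtable = NULL; return p; }

/* copy path after the fix: account the pages and deposit the table */
static void copy_migration_pmd(struct toy_mm *dst, void *pgtable, long hpage_nr)
{
	dst->anon_pages += hpage_nr;	/* add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR) */
	dst->nr_ptes++;			/* atomic_long_inc(&dst_mm->nr_ptes) */
	deposit(dst, pgtable);		/* pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable) */
}

/* zap path: unconditionally expects a deposited table to free */
static void zap_pmd(struct toy_mm *mm)
{
	void *pgtable = withdraw(mm);

	assert(pgtable != NULL);	/* without the deposit this is the NULL dereference */
	free(pgtable);
	mm->nr_ptes--;
}

int main(void)
{
	struct toy_mm child = { 0 };

	copy_migration_pmd(&child, malloc(4096), 512);	/* 512 == HPAGE_PMD_NR for 2 MiB pages on x86-64 */
	zap_pmd(&child);
	return 0;
}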
From 2628bd6fc052bd85e9864dae4de494d8a6313391 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Thu, 2 Nov 2017 15:59:50 -0700
Subject: [PATCH 7/7] mm, swap: fix race between swap count continuation operations

One page may store a set of entries of the sis->swap_map
(swap_info_struct->swap_map) in multiple swap clusters. If some of the
entries have sis->swap_map[offset] > SWAP_MAP_MAX, multiple pages will be
used to store the set of entries of the sis->swap_map, and the pages are
linked with page->lru. This is called swap count continuation.

To access the pages which store the set of entries of the sis->swap_map
simultaneously, previously, sis->lock was used. But to improve the
scalability of __swap_duplicate(), the swap cluster lock may be used in
swap_count_continued() now. This may race with
add_swap_count_continuation(), which operates on a nearby swap cluster
whose sis->swap_map entries are stored in the same page. The race can
cause a wrong swap count in practice, and thus unfreeable swap entries,
software lockups, etc.

To fix the race, a new spinlock called cont_lock is added to struct
swap_info_struct to protect the swap count continuation page list. This
is a lock at the swap device level, so its scalability isn't great. But
it is still much better than the original sis->lock, because it is only
acquired/released when swap count continuation is used, which is rare in
practice. If it turns out that the scalability becomes an issue for some
workloads, we can split the lock into more fine-grained locks.

Link: http://lkml.kernel.org/r/20171017081320.28133-1-ying.huang@intel.com
Fixes: 235b62176712 ("mm/swap: add cluster lock")
Signed-off-by: "Huang, Ying"
Cc: Johannes Weiner
Cc: Shaohua Li
Cc: Tim Chen
Cc: Michal Hocko
Cc: Aaron Lu
Cc: Dave Hansen
Cc: Andi Kleen
Cc: Minchan Kim
Cc: Hugh Dickins
Cc: [4.11+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/swap.h |  4 ++++
 mm/swapfile.c        | 23 +++++++++++++++++------
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index b489bd77bbdc..f02fb5db8914 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -266,6 +266,10 @@ struct swap_info_struct {
 					 * both locks need hold, hold swap_lock
 					 * first.
 					 */
+	spinlock_t cont_lock;		/*
+					 * protect swap count continuation page
+					 * list.
+					 */
 	struct work_struct discard_work; /* discard worker */
 	struct swap_cluster_list discard_clusters; /* discard clusters list */
 };
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bf91dc9e7a79..e47a21e64764 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2869,6 +2869,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
 	spin_lock_init(&p->lock);
+	spin_lock_init(&p->cont_lock);
 
 	return p;
 }
@@ -3545,6 +3546,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	head = vmalloc_to_page(si->swap_map + offset);
 	offset &= ~PAGE_MASK;
 
+	spin_lock(&si->cont_lock);
 	/*
 	 * Page allocation does not initialize the page's lru field,
 	 * but it does always reset its private field.
@@ -3564,7 +3566,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 		 * a continuation page, free our allocation and use this one.
 		 */
 		if (!(count & COUNT_CONTINUED))
-			goto out;
+			goto out_unlock_cont;
 
 		map = kmap_atomic(list_page) + offset;
 		count = *map;
@@ -3575,11 +3577,13 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 		 * free our allocation and use this one.
 		 */
 		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
-			goto out;
+			goto out_unlock_cont;
 	}
 	list_add_tail(&page->lru, &head->lru);
 	page = NULL;			/* now it's attached, don't free it */
+out_unlock_cont:
+	spin_unlock(&si->cont_lock);
 out:
 	unlock_cluster(ci);
 	spin_unlock(&si->lock);
@@ -3604,6 +3608,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 	struct page *head;
 	struct page *page;
 	unsigned char *map;
+	bool ret;
 
 	head = vmalloc_to_page(si->swap_map + offset);
 	if (page_private(head) != SWP_CONTINUED) {
@@ -3611,6 +3616,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		return false;	/* need to add count continuation */
 	}
 
+	spin_lock(&si->cont_lock);
 	offset &= ~PAGE_MASK;
 	page = list_entry(head->lru.next, struct page, lru);
 	map = kmap_atomic(page) + offset;
@@ -3631,8 +3637,10 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		if (*map == SWAP_CONT_MAX) {
 			kunmap_atomic(map);
 			page = list_entry(page->lru.next, struct page, lru);
-			if (page == head)
-				return false;	/* add count continuation */
+			if (page == head) {
+				ret = false;	/* add count continuation */
+				goto out;
+			}
 			map = kmap_atomic(page) + offset;
 init_map:	*map = 0;		/* we didn't zero the page */
 		}
@@ -3645,7 +3653,7 @@ init_map:	*map = 0;		/* we didn't zero the page */
 			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
-		return true;			/* incremented */
+		ret = true;			/* incremented */
 	} else {				/* decrementing */
 		/*
@@ -3671,8 +3679,11 @@ init_map:	*map = 0;		/* we didn't zero the page */
 			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
-		return count == COUNT_CONTINUED;
+		ret = count == COUNT_CONTINUED;
 	}
+out:
+	spin_unlock(&si->cont_lock);
+	return ret;
 }
 
 /*
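The reason two different cluster locks can end up guarding the same
continuation list is that the swap_map is one byte per entry, so a single
page of it spans many clusters. A small arithmetic sketch of that ratio,
assuming a 4 KiB page size and the SWAPFILE_CLUSTER value used in
mm/swapfile.c:

#include <stdio.h>

#define PAGE_SIZE		4096	/* bytes, typical */
#define SWAPFILE_CLUSTER	256	/* swap entries per cluster */

int main(void)
{
	/*
	 * swap_map is one unsigned char per swap entry, so one page of it
	 * (and the continuation pages hanging off it) covers PAGE_SIZE
	 * entries, i.e. several clusters with distinct cluster locks.
	 * Hence the per-device cont_lock.
	 */
	printf("clusters sharing one continuation page: %d\n",
	       PAGE_SIZE / SWAPFILE_CLUSTER);	/* 16 */
	return 0;
}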